In [86]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup, NavigableString
import requests
import json
import time
import random
from ast import literal_eval
import os

In [8]:
def extraer_tabla(soup):
    tablas = soup.find_all('table')
    return tablas

In [100]:
def extraer_valores(texto):
    # Extraer número de patente
    patent = re.findall(r'<span class="highlight">EP<span class="highlight">(\d+)</span></span>', texto)
    if len(patent) == 0:
        patent = ''
    else:
        patent = 'EP' + patent[0]

    # Extraer nombre de la patente
    patent_name = re.findall(r'</span></span> - (.*?)</a>', texto)
    if len(patent_name) == 0:
        patent_name = ''
    else:
        patent_name = patent_name[0].strip()  

    # Extraer Status
    status = re.findall("""Status</td><td class="t2" colspan="3">.*?<br/>""", texto)
    if len(status) == 0:
        status = ''
    else:
        status = status[0]
        status = status.replace("""Status</td><td class="t2" colspan="3">""","").replace('<br/>','').strip()

    #Most Recent Event date
    most_recent_event_date = re.findall(r'Most recent event.*?<td class="t2">(.*?)</td>', texto, re.DOTALL)
    if len(most_recent_event_date) == 0:
        most_recent_event_date = ''
    else:
        most_recent_event_date = most_recent_event_date[0].strip()

    #Most recent event
    most_recent_event = re.findall(r'Most recent event.*?<td class="t3">(.*?)</td>', texto, re.DOTALL)
    if len(most_recent_event) == 0:
        most_recent_event = ''
    else:
        most_recent_event = most_recent_event[0].replace('<br/>', '').replace('\r\n', '').replace('\t', '').replace('\xa0', ' ').strip()

    # Extraer solicitantes
    pattern = re.compile(
        r'<td class="t2" colspan="3">(For all designated states)<br/>\r\n\t(.*?)(<br/>\r\n\t(.*?))?(<br/>\r\n\t(.*?))?(<br/>\r\n\t(.*?))?(<br/>\r\n\t(.*?))?\r\n\t  / (.*?)<br/>'
    )
    matches = pattern.findall(texto)

    applicants_dict = []
    for match in matches:
        address_parts = [match[1]]
        
        if match[3]:
            address_parts.append(match[3])
        if match[5]:
            address_parts.append(match[5])
        if match[7]:
            address_parts.append(match[7])
        if match[9]:
            address_parts.append(match[9])
            
        address = ', '.join(address_parts)
        country = match[10] if len(match) > 10 else ''
        
        applicant_info = {
            'for': match[0],
            'name': match[1],
            'address': address,
            'country': country
        }
        applicants_dict.append(applicant_info)

    #Inventors
    # Extraer inventores
    inventors_section_pattern = r'<td class="th" rowspan="\d+">Inventor\(s\)</td>(.*?)<td class="th"'
    inventors_section = re.search(inventors_section_pattern, texto, re.DOTALL)

    inventors_dict = []
    if inventors_section:
        inventor_rows = re.findall(r'<td class="t2" colspan="3">(.*?)</td>', inventors_section.group(1), re.DOTALL)
        for row in inventor_rows:            
            inventor_info = re.search(r'(\d+)\s*/\s*(.*?)<br/>\s*(.*?)\s*/\s*(.*)', row, re.DOTALL)
            if inventor_info:
                inventor_dict = {
                    'number': inventor_info.group(1).strip(),
                    'name': inventor_info.group(2).strip(),
                    'address': inventor_info.group(3).strip(),
                    'country': inventor_info.group(4).strip()
                }
                inventors_dict.append(inventor_dict)
                

    # Extraer números de prioridad
    priority_section_pattern = r'<td class="th" rowspan="\d+">Priority number, date</td>(.*?)<td class="th"'
    priority_section = re.search(priority_section_pattern, texto, re.DOTALL)

    priority_numbers = []
    if priority_section:
        priority_rows = re.findall(r'<td class="t2">(.*?)</td>.*?<td class="t3" colspan="2">(.*?)</td>', priority_section.group(1), re.DOTALL)
        for row in priority_rows:
            priority_number = row[0].strip()
            date_and_format = row[1].strip()
            
            date_match = re.search(r'(\d{2}\.\d{2}\.\d{4})', date_and_format)
            format_match = re.search(r'Original\s+published\s+format:\s+(.*)', date_and_format)
            
            date = date_match.group(1) if date_match else ''
            original_format = format_match.group(1).strip() if format_match else ''
            
            priority_numbers.append({
                priority_number: date
            })


    # Extraer clasificaciones IPC
    ipc_pattern = r'IPC:\s*</td>\s*<td class="t2">\s*(.*?)\s*</td>'
    ipc_matches = re.findall(ipc_pattern, texto, re.DOTALL)
    ipc_list = [re.sub(r'\s+', ' ', ipc).strip() for ipc in ipc_matches]

    # Extraer clasificaciones CPC
    cpc_pattern = r'CPC:</td>\s*<td class="t2" colspan="2">(.*?)</td>'
    cpc_matches = re.findall(cpc_pattern, texto, re.DOTALL)
    
    cpc_list = []
    for match in cpc_matches:
        soup = BeautifulSoup(match, 'html.parser')
        b_elements = soup.find_all('b') # Encontrar todos los elementos <b> dentro del bloque
        
        # Extraer el texto y limpiarlo
        items = [elem.get_text().strip() for elem in b_elements]
        cpc_list.extend(items)


    # Parent applications
    parent_pattern = r'<td class="th"[^>]*>Parent application\(s\).*?<\/td>(.*?)(?:<td class="th"[^>]*>Divisional application\(s\)|<\/table>)'
    parent_match = re.search(parent_pattern, texto, re.DOTALL)
    parent_list = []
    if parent_match:
        parent_text = parent_match.group(1)
        parent_apps = re.findall(r'<td class="t2" colspan="3">(EP\d+\.\d+)\s*\/\s*<a.*?>(EP\d+)</a>', parent_text)
        for app in parent_apps:
            parent_list.append({
                app[0]: app[1]
            })

    # Divisional applications
    divisional_pattern = r'<td class="th"[^>]*>Divisional application\(s\).*?<\/td>(.*?)<\/table>'
    divisional_match = re.search(divisional_pattern, texto, re.DOTALL)
    divisional_list = []
    if divisional_match:
        divisional_text = divisional_match.group(1)
        divisional_apps = re.findall(r'<td class="t2" colspan="3">(EP\d+\.\d+)\s*\/\s*<a.*?>(EP\d+)</a>', divisional_text)
        for app in divisional_apps:
            divisional_list.append({
                app[0]: app[1]
            })

    # Opponent
    opposition_pattern = r'<td class="th">\s*Opponent\(s\)</td>\s*<td class="t2" colspan="2">(.*?)</td>'
    opposition_raw = re.findall(opposition_pattern, texto, re.DOTALL)

    oppositions_list = []
    if opposition_raw:
        opposition_entries = re.split(r'</td>\s*</tr><td class="th">&nbsp;</td><td class="t2" colspan="2">', opposition_raw[0])
        for entry in opposition_entries:
            if entry.strip():
                # Limpiar y dividir la entrada
                parts = entry.strip().split('<br/>')
                if len(parts) >= 7:
                    opposition_info = {
                        'number': parts[0].strip(),
                        'date_received': parts[1].strip(),
                        'date_decision': parts[2].strip(),
                        'status': parts[3].strip(),
                        'name': parts[4].strip().replace('&nbsp;', ' '),
                        'address': ' '.join(parts[5:7]).strip().replace('&nbsp;', ' '),
                        'representative': ' '.join(parts[7:9]).strip().replace('&nbsp;', ' ')
                    }
                    oppositions_list.append(opposition_info)

    # Extraer representantes
    representatives_section_pattern = r'<td class="th" rowspan="\d+">Representative\(s\)</td>(.*?)<td class="th"'
    representatives_section = re.search(representatives_section_pattern, texto, re.DOTALL)

    representatives_dict = []
    if representatives_section:
        representative_rows = re.findall(r'<td class="t2" colspan="3">(.*?)</td>', representatives_section.group(1), re.DOTALL)
        for row in representative_rows:
            representative_info = re.search(r'(.*?)<br/>(.*?)<br/>(.*?)\s*/\s*(.*)', row, re.DOTALL)
            if representative_info:
                representative_dict = {
                    'name': representative_info.group(1).strip(),
                    'address_line1': representative_info.group(2).strip(),
                    'address_line2': representative_info.group(3).strip(),
                    'country': representative_info.group(4).strip()
                }
                representatives_dict.append(representative_dict)

    #Publications
    soup = BeautifulSoup(texto, 'html.parser')
    publication_section = soup.find('td', class_='th', string=re.compile('Publication', re.IGNORECASE))
    
    publications_list = []
    if publication_section:        
        current_publication = {}
        next_element = publication_section.next_sibling
        
        while next_element:
            if isinstance(next_element, NavigableString):
                next_element = next_element.next_sibling
                continue
            
            if next_element.name == 'td' and 'th' in next_element.get('class', []):
                field = next_element.text.strip().rstrip(':')
                value_td = next_element.find_next_sibling('td', class_='t2')
                
                if value_td:
                    value = value_td.text.strip()
                    
                    if field == 'Type':
                        if current_publication:
                            publications_list.append(current_publication)
                            current_publication = {}
                        current_publication[field] = value
                    elif field in ['No.', 'Date', 'Language']:
                        current_publication[field] = value
            elif next_element.name == 'td' and 't2' in next_element.get('class', []):
                if '[' in next_element.text and ']' in next_element.text:
                    current_publication['Publication week'] = next_element.text.strip()
                    publications_list.append(current_publication)
                    current_publication = {}
            
            if next_element.name == 'td' and 'th' in next_element.get('class', []) and next_element.has_attr('rowspan'):
                if next_element != publication_section:
                    break
            
            next_element = next_element.next_sibling
        
        if current_publication:
            publications_list.append(current_publication)

    # Opposition
    # Extraer la fecha de la oposición y la razón
    opposition_pattern = r'<td class="th">Opposition\(s\)</td><td class="t2">(.*?)</td><td class="t3" colspan="2">(.*?)</td>'
    opposition_matches = re.findall(opposition_pattern, texto, re.DOTALL)
    opposition_date = opposition_matches[0][0].strip() if opposition_matches else ''
    opposition_reason = opposition_matches[0][1].strip() if opposition_matches else ''

    # Opponent
    opponents_list = []
    opposition_section = soup.find('td', class_='th', string='Opposition(s)')
    if opposition_section and opposition_section.has_attr('rowspan'):
        opponents_section = opposition_section.find_next_sibling('td', class_='th', string='Opponent(s)')
        if opponents_section:
            # Extraer los detalles de los oponentes
            opponent_details = opponents_section.find_next_sibling('td', class_='t2', colspan='2')
            while opponent_details:
                # Obtener el contenido HTML y limpiarlo
                content = opponent_details.decode_contents()
                # Reemplazar <br> con nueva línea y limpiar espacios adicionales
                content = re.sub(r'<br\s*/?>', '\n', content)  # Reemplazar <br> con nueva línea
                content = re.sub(r'\s+', ' ', content).strip()  # Eliminar espacios adicionales
                
                # Crear un objeto para el oponente y añadirlo a la lista
                opponent_obj = {'opponent_info': content}
                opponents_list.append(opponent_obj)
                
                # Buscar el siguiente detalle de oponente si existe
                opponent_details = opponent_details.find_next('td', class_='t2', colspan='2')
                # Detener la extracción al encontrar una celda con diferente estructura
                if not opponent_details or opponent_details.find_previous_sibling('td', class_='th'):
                    break

    #Applicant
    applicants_section = soup.find('td', class_='th', string=re.compile('Applicant\(s\)', re.IGNORECASE))
    applicants_list = []

    if applicants_section:
        next_element = applicants_section.find_next_sibling('td', class_='t2')
        while next_element:
            if 't2' in next_element.get('class', []):
                applicant_text = next_element.get_text(strip=True, separator='|') if next_element else ''
                applicant_info = applicant_text.split('|')
                if len(applicant_info) >= 2:
                    applicant = {
                        'type': 'current',
                        'for': applicant_info[0],
                        'name': applicant_info[1],
                        'address': '|'.join(applicant_info[2:-1]) if len(applicant_info) > 3 else '',
                        'country': applicant_info[-1].strip('/ ') if len(applicant_info) > 2 else ''
                    }
                    applicants_list.append(applicant)

            next_element = next_element.find_next_sibling(['td', 'tr'])
            if next_element and next_element.name == 'tr':
                next_element = next_element.find('td', class_='t2')
            if next_element and 'th' in next_element.get('class', []):
                break

    former_applicants = soup.find_all('tr', class_='former')
    for former in former_applicants:
        former_date_elem = former.find('td', class_='t1')
        former_info_elem = former.find('td', class_='t2')
        
        if former_date_elem and former_info_elem:
            former_date = former_date_elem.text.strip()
            former_text = former_info_elem.get_text(strip=True, separator='|')
            former_info = former_text.split('|')
            
            if len(former_info) >= 2:
                applicant = {
                    'type': 'former',
                    'for': former_info[0],
                    'name': former_info[1],
                    'address': '|'.join(former_info[2:-1]) if len(former_info) > 3 else '',
                    'country': former_info[-1].strip('/ ') if len(former_info) > 2 else '',
                    'former_date': former_date
                }
                applicants_list.append(applicant)

    # Crear DataFrame
    df = pd.DataFrame({
        'Patent': [patent],
        'Patent_Name': [patent_name],
        'Status': [status],
        'Most_Recent_Event_Date': [most_recent_event_date],
        'Most_Recent_Event': [most_recent_event],
        'Divisional_applications': [json.dumps(divisional_list)],
        'Parent_applications': [json.dumps(parent_list)],
        'Inventors': [json.dumps(inventors_dict)],
        'Priority_numbers': [priority_numbers],
        'Applicants': [json.dumps(applicants_dict)],
        'Representatives': [json.dumps(representatives_dict)],
        'IPC': [json.dumps(ipc_list)],
        'CPC': [json.dumps(cpc_list)],
        'Opponent': [json.dumps(oppositions_list)],
        'Publications': [json.dumps(publications_list)],
        'Opposition Date': [json.dumps(opposition_date)],
        'Opposition Reason': [json.dumps(opposition_reason)],
        'Opponent': [json.dumps(opponents_list)],
        'Applicants': [json.dumps(applicants_list)]
    })
    
    return df



ruta_completa = 'C:/Users/iponc/Downloads/HTMLS/EP4036079.txt'  # Ruta de la carpeta HTMLS
try:
    if os.path.isfile(ruta_completa):
            
        with open(ruta_completa, 'r', encoding='utf-8') as file:
            contenido = file.read()
            #print(contenido)
            soup = BeautifulSoup(contenido, 'html.parser')
            mytable = extraer_tabla(soup)[0]
            df_actual = extraer_valores(str(mytable))
            
except Exception as e:
    print(f"Error inesperado al procesar el archivo: {str(e)}")

df_actual

Unnamed: 0,Patent,Patent_Name,Status,Most_Recent_Event_Date,Most_Recent_Event,Divisional_applications,Parent_applications,Inventors,Priority_numbers,Applicants,Representatives,IPC,CPC,Opponent,Publications
0,EP4036079,COMPOUNDS AND COMPOSITIONS FOR INTRACELLULAR D...,Examination is in progress,09.07.2024,New entry: Reply to examination report,[],"[{""EP16831870.7"": ""EP3394030""}]","[{""number"": ""01"", ""name"": ""BENENATO, Kerry E.""...","[{'US201562271160P': '22.12.2015'}, {'US201562...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Cooley (UK) LLP"", ""address_line1"": ...","[""C07D211/14, C07D211/16, C07D295/13, C07D295/...","[""C07D211/14 (EP,US);"", ""A61K9/5123 (US);"", ""A...",[],"[{""Type"": ""A2\u00a0Application without search ..."


In [101]:
df_actual['Applicants'][0]

'[{"type": "current", "for": "For all designated states", "name": "ModernaTX, Inc.", "address": "325 Binney Street", "country": "Cambridge, MA 02142\\n\\t  / US"}, {"type": "current", "for": "For all designated states", "name": "ModernaTX, Inc.", "address": "200 Technology Square", "country": "Cambridge, MA 02139\\n\\t  / US"}, {"type": "former", "for": "For all designated states", "name": "ModernaTX, Inc.", "address": "200 Technology Square", "country": "Cambridge, MA 02139\\n\\t  / US", "former_date": "Former [2022/31]"}]'

In [102]:
carpeta = 'C:/Users/iponc/Downloads/HTMLS'  # Ruta de la carpeta HTMLS
df_general = pd.DataFrame()  

for nombre_archivo in os.listdir(carpeta):
    try:
        ruta_completa = os.path.join(carpeta, nombre_archivo)
        if os.path.isfile(ruta_completa):
            print(f"Procesando archivo: {nombre_archivo}")
            
            with open(ruta_completa, 'r', encoding='utf-8') as file:
                contenido = file.read()
                soup = BeautifulSoup(contenido, 'html.parser')
                mytable = extraer_tabla(soup)[0]
                df_actual = extraer_valores(str(mytable))
                
                df_general = pd.concat([df_general, df_actual], ignore_index=True)
                
    except Exception as e:
        print(f"Error inesperado al procesar el archivo {nombre_archivo}: {str(e)}")
        continue

print(f"Total de filas en el DataFrame general: {len(df_general)}")


Procesando archivo: EP0721594.txt
Procesando archivo: EP0737549.txt
Procesando archivo: EP0788617.txt
Procesando archivo: EP0885233.txt
Procesando archivo: EP1105513.txt
Procesando archivo: EP1127791.txt
Procesando archivo: EP1164138.txt
Procesando archivo: EP1167108.txt
Procesando archivo: EP1168032.txt
Procesando archivo: EP1250728.txt
Procesando archivo: EP1250983.txt
Procesando archivo: EP1250989.txt
Procesando archivo: EP1251150.txt
Procesando archivo: EP1443598.txt
Procesando archivo: EP1475410.txt
Procesando archivo: EP1489185.txt
Procesando archivo: EP1502461.txt
Procesando archivo: EP1613746.txt
Procesando archivo: EP1613796.txt
Procesando archivo: EP1614628.txt
Procesando archivo: EP1616678.txt
Procesando archivo: EP1663881.txt
Procesando archivo: EP1717123.txt
Procesando archivo: EP1730131.txt
Procesando archivo: EP1744959.txt
Procesando archivo: EP1761553.txt
Procesando archivo: EP1769893.txt
Procesando archivo: EP1775298.txt
Procesando archivo: EP1873124.txt
Procesando arc

In [103]:
df_general

Unnamed: 0,Patent,Patent_Name,Status,Most_Recent_Event_Date,Most_Recent_Event,Divisional_applications,Parent_applications,Inventors,Priority_numbers,Applicants,Representatives,IPC,CPC,Opponent,Publications
0,EP0721594,AUTONOMOUS CRUISE CONTROL,No opposition filed within time limit,28.12.2007,Lapse of the patent in a contracting state\nNe...,"[{""EP01109826.6"": ""EP1167108""}]",[],"[{""number"": ""01"", ""name"": ""WOLL, Jerry"", ""addr...",[{'US19930130585': '01.10.1993'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""R\u00fcger Abel Patentanw\u00e4lte ...","[""G01S13/60, B60K31/00, G01S13/93<br/>""]","[""B60K31/0008 (EP,KR,US);"", ""B60W30/146 (EP);""...",[],"[{""Type"": ""A1\u00a0Application with search rep..."
1,EP0737549,Abrasive tape and process for producing it,No opposition filed within time limit,24.10.2003,No opposition filed within time limit,"[{""EP02012459.0"": ""EP1250983""}]",[],"[{""number"": ""01"", ""name"": ""Fujii, Kazuhito, c/...","[{'JP19950109088': '10.04.1995'}, {'JP19960005...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""M\u00fcller-Bor\u00e9 &amp; Partner...","[""B24D3/00, B24D11/00, B24D3/28<br/>""]","[""B24B19/226 (EP,US);"", ""B24D3/00 (KR);"", ""B24...",[],"[{""Type"": ""A2\u00a0Application without search ..."
2,EP0788617,MINIATURE OPTICAL SCANNER FOR A TWO AXIS SCANN...,No opposition filed within time limit,04.01.2008,Lapse of the patent in a contracting state\nNe...,"[{""EP01202983.1"": ""EP1168032""}]",[],"[{""number"": ""01"", ""name"": ""MELVILLE, Charles, ...",[{'US19940329508': '26.10.1994'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Hitchcock, Esmond Antony"", ""address...","[""G02B26/08, G02B26/10, G02B7/182<br/>""]","[""G02B26/10 (EP,US);"", ""G02B26/101 (EP,US);"", ...",[],"[{""Type"": ""A1\u00a0Application with search rep..."
3,EP0885233,TERPYRIDINE-PLATINUM(II) COMPLEXES,No opposition filed within time limit,25.04.2003,No opposition filed within time limit,"[{""EP01121776.7"": ""EP1164138""}]",[],"[{""number"": ""01"", ""name"": ""LOWE, Gordon"", ""add...",[{'GB19960001603': '26.01.1996'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Ellis-Jones, Patrick George Armine,...","[""C07F15/00, C07D213/06, C07D213/22, C07H19/06...","[""C07H21/00 (EP,US);"", ""A61P33/00 (EP);"", ""A61...",[],"[{""Type"": ""A1\u00a0Application with search rep..."
4,EP1105513,METHODS AND COMPOSITIONS FOR USE IN SPLICEOSOM...,The application is deemed to be withdrawn,14.01.2005,Application deemed to be withdrawn,"[{""EP04077408.5"": ""EP1489185""}]",[],"[{""number"": ""01"", ""name"": ""MITCHELL, Lloyd, G....","[{'US19980133717': '13.08.1998'}, {'US19980158...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Lucas, Brian Ronald"", ""address_line...","[""C12N15/90, A61K48/00, C12N15/11, // A61P11:0...","[""C12N15/1027 (EP,US);"", ""A61K48/00 (EP,US);"",...",[],"[{""Type"": ""A2\u00a0Application without search ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,EP3937083,SMART METAL CARD WITH RADIO FREQUENCY (RF) TRA...,Examination is in progress,15.11.2023,New entry: Renewal fee paid,[],"[{""EP15874266.8"": ""EP3238139""}]","[{""number"": ""01"", ""name"": ""HERSLOW, John"", ""ad...",[{'US201462095901P': '23.12.2014'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Scheele Jaeger Wetzel Patentanw\u00...","[""G06K19/077, G06K19/06<br/>""]","[""G06K19/07773 (EP,US);"", ""G06K19/02 (EP,US);""...",[],"[{""Type"": ""A1\u00a0Application with search rep..."
144,EP3992203,PLANT CYTOCHROME P450,Request for examination was made,29.07.2024,New entry: Renewal fee paid,[],"[{""EP11748702.5"": ""EP2596012""}, {""EP16186318.8...","[{""number"": ""01"", ""name"": ""WINZER, Thilo"", ""ad...","[{'GB20100021707': '22.07.2010'}, {'GB20100012...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Symbiosis IP Limited"", ""address_lin...","[""C07K14/415, C12N15/82<br/>""]","[""C07K14/415 (EP,US);"", ""C12N15/8251 (US);"", ""...",[],"[{""Type"": ""A1\u00a0Application with search rep..."
145,EP4036079,COMPOUNDS AND COMPOSITIONS FOR INTRACELLULAR D...,Examination is in progress,09.07.2024,New entry: Reply to examination report,[],"[{""EP16831870.7"": ""EP3394030""}]","[{""number"": ""01"", ""name"": ""BENENATO, Kerry E.""...","[{'US201562271160P': '22.12.2015'}, {'US201562...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Cooley (UK) LLP"", ""address_line1"": ...","[""C07D211/14, C07D211/16, C07D295/13, C07D295/...","[""C07D211/14 (EP,US);"", ""A61K9/5123 (US);"", ""A...",[],"[{""Type"": ""A2\u00a0Application without search ..."
146,EP4215191,COMPOSITION COMPRISING A FUMARATE FOR USE IN A...,Request for examination was made,02.02.2024,The date on which the examining division becom...,[],"[{""EP15804260.6"": ""EP3220907""}, {""EP20195649.7...","[{""number"": ""01"", ""name"": ""Novas, Mark"", ""addr...","[{'US201462080783P': '17.11.2014'}, {'US201562...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Pohlman, Sandra M."", ""address_line1...","[""A61K31/225, A61P25/02<br/>""]","[""A61K31/225 (EP,CN,IL,KR,US);"", ""A61K47/542 (...",[],"[{""Type"": ""A1\u00a0Application with search rep..."


In [None]:
df_general.to_csv('C:/Users/iponc/OneDrive/Escritorio/Patents_Filled.csv', index=False)

In [60]:
ruta_completa = 'C:/Users/iponc/Downloads/HTMLS/EP4036079.txt'  # Ruta de la carpeta HTMLS
try:
    if os.path.isfile(ruta_completa):
            
        with open(ruta_completa, 'r', encoding='utf-8') as file:
            contenido = file.read()
            #print(contenido)
            soup = BeautifulSoup(contenido, 'html.parser')
            mytable = extraer_tabla(soup)[0]
            df_actual = extraer_valores(str(mytable))
            
except Exception as e:
    print(f"Error inesperado al procesar el archivo {nombre_archivo}: {str(e)}")

df_actual


Unnamed: 0,Patent,Patent_Name,Status,Most_Recent_Event_Date,Most_Recent_Event,Divisional_applications,Parent_applications,Inventors,Publications,Priority numbers,Applicants,Representatives,IPC,CPC,Opponent
0,EP4036079,COMPOUNDS AND COMPOSITIONS FOR INTRACELLULAR D...,Examination is in progress,09.07.2024,New entry: Reply to examination report,[],"[{""EP16831870.7"": ""EP3394030""}]",[],[],[],[],[],"[""C07D211/14, C07D211/16, C07D295/13, C07D295/...","[""C07D211/14 (EP,US);"", ""A61K9/5123 (US);"", ""A...",[]


In [49]:

df_actual['Parent_applications'][0]

'[{"EP11748702.5": "EP2596012"}, {"EP16186318.8": "EP3121193"}]'

In [50]:
ruta_completa = 'C:/Users/iponc/Downloads/EP About this file - European Patent Register.html'  # Ruta de la carpeta HTMLS
try:
    if os.path.isfile(ruta_completa):
            
        with open(ruta_completa, 'r', encoding='utf-8') as file:
            contenido = file.read()
            #print(contenido)
            soup = BeautifulSoup(contenido, 'html.parser')
            mytable = extraer_tabla(soup)[0]
            df_actual = extraer_valores(str(mytable))
            
except Exception as e:
    print(f"Error inesperado al procesar el archivo {nombre_archivo}: {str(e)}")

df_actual

Unnamed: 0,Patent,Patent_Name,Status,Most_Recent_Event_Date,Most_Recent_Event,Divisional_applications,Parent_applications,Inventors,Publications,Priority numbers,Applicants,IPC,CPC,Opponent
0,,,Patent revoked,26.05.2017,Lapse of the patent in a contracting state,"[{""EP10176720.0"": ""EP2319496""}, {""EP11177513.6...",[],[],[],"[DE20021015131, DE20021015067]",[],"[""A61K9/16, A61K9/20, A61K9/22, A61K31/485<br/>""]","[""A61K31/485 (EP,CN,US);"", ""A61K9/0053 (US);"",...","[{""number"": ""01\u00a0\n\t\t\t11.12.2015\u00a0\..."


In [52]:

df_actual['Divisional_applications'][0]

'[{"EP10176720.0": "EP2319496"}, {"EP11177513.6": "EP2425823"}, {"EP11177516.9": "EP2425824"}, {"EP11177518.5": "EP2425821"}, {"EP11177520.1": "EP2425825"}, {"EP17169621.4": "EP3326618"}]'