In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup, NavigableString
import requests
import json
import time
import random
from ast import literal_eval
import os
import ast
from collections import Counter
import datetime


ruta_HTMLS = 'C:/Users/iponc/Downloads/HTMLS/' #Change the route to your HTML folder

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
def extraer_tabla(soup):#Extract the table of the HTML
    tablas = soup.find_all('table')
    return tablas

In [None]:
def extraer_seccion(html_str):
  # This function extracts sections where the data is informed for publication numbers
    # It searches for specific HTML tags to identify the start and end of the relevant section
    # Returns the content between these tags, or None if the tags are not found
    inicio = re.search(r'<td class="th">Publication No\.</td>', html_str)
    fin = re.search(r'<td class="th">Priority number</td>', html_str)

    if inicio and fin:
        contenido_intermedio = html_str[inicio.end():fin.start()]
        return contenido_intermedio.strip()
    else:
        return None


def extraer_informacion_seccion(html_text):
  # This function extracts data from publications
    # It uses BeautifulSoup to parse the HTML and find specific elements
    # It extracts publication numbers, dates, and types
    # It removes any 'Global Dossier' entries from publication numbers
    # Returns a list of dictionaries, each containing info for a publication
    soup = BeautifulSoup(html_text, 'html.parser')

    publication_numbers = [tag.text.strip() for tag in soup.find_all("td", class_="t2") if tag.text.strip()]
    dates = [tag.text.strip() for tag in soup.find_all("td", class_="t3")]
    types = [tag.text.strip() for tag in soup.find_all("td", class_="t4")]

    for item in publication_numbers[:]:
        if item == 'Global Dossier':
            publication_numbers.remove(item)
    informacion = []
    for pub_no, date, type_ in zip(publication_numbers, dates, types):
        informacion.append({
            'Publication No.': pub_no,
            'Date': date,
            'Type': type_
        })

    return informacion


def procesar_patent_family(archivo):
  # This function processes a patent family file
    # It reads the file, extracts tables, and splits them into sections
    # For each section, it extracts publication information
    # It creates a DataFrame with the patent family information
    # It also calculates and adds some statistics:
    #   - Count of each type of patent in the family
    #   - First and last dates in the family
    # Returns the resulting DataFrame
    try:
        df = pd.DataFrame()
        if os.path.isfile(archivo):
            with open(archivo, 'r', encoding='utf-8') as file:
                contenido = file.read()
                soup = BeautifulSoup(contenido, 'html.parser')

                tabla = extraer_tabla(soup)[0]
                if tabla is None:
                    print("No se pudo extraer ninguna tabla.")
                else:
                    tabla_str = str(tabla)
                    patron = re.compile(r'<td class="th" rowspan="\d+">Type</td>')
                    secciones = patron.split(tabla_str)
                    secciones = secciones[1:]

                    for i in range(len(secciones)):
                        if i < len(secciones) - 1:
                            match = patron.search(tabla_str)
                            if match:
                                secciones[i] = match.group() + secciones[i]
                                tabla_str = tabla_str[match.end():]

                    patent_family = []
                    for i, seccion in enumerate(secciones):
                        info_publicacion = extraer_seccion(seccion)
                        if info_publicacion:
                            info = extraer_informacion_seccion(info_publicacion)
                            patent_family.extend(info)

                    df = pd.DataFrame([{'patent_family': patent_family}])
                    types = [item['Type'] for item in df['patent_family'][0]]
                    type_counts = dict(Counter(types))
                    df['counterType_patent_family'] = [type_counts]

                    dates = [datetime.datetime.strptime(item['Date'], '%d.%m.%Y') for item in df['patent_family'][0]]
                    first_date = min(dates)
                    last_date = max(dates)
                    df['firstDate_patent_family'] = [first_date]
                    df['lastDate_patent_family'] = [last_date]

        return df

    except Exception as e:
        print(f"Error inesperado al procesar el archivo {archivo}: {str(e)}")


In [None]:
def extraer_valores(texto):#This funtion will extract all relevant data for our models, specified at the end.

    patent = re.findall(r'<span class="highlight">EP<span class="highlight">(\d+)</span></span>', texto)
    if len(patent) == 0:
        patent = ''
    else:
        patent = 'EP' + patent[0]


    patent_name = re.findall(r'</span></span> - (.*?)</a>', texto)
    if len(patent_name) == 0:
        patent_name = ''
    else:
        patent_name = patent_name[0].strip()


    status = re.findall("""Status</td><td class="t2" colspan="3">.*?<br/>""", texto)
    if len(status) == 0:
        status = ''
    else:
        status = status[0]
        status = status.replace("""Status</td><td class="t2" colspan="3">""","").replace('<br/>','').strip()


    most_recent_event_date = re.findall(r'Most recent event.*?<td class="t2">(.*?)</td>', texto, re.DOTALL)
    if len(most_recent_event_date) == 0:
        most_recent_event_date = ''
    else:
        most_recent_event_date = most_recent_event_date[0].strip()


    most_recent_event = re.findall(r'Most recent event.*?<td class="t3">(.*?)</td>', texto, re.DOTALL)
    if len(most_recent_event) == 0:
        most_recent_event = ''
    else:
        most_recent_event = most_recent_event[0].replace('<br/>', '').replace('\r\n', '').replace('\t', '').replace('\xa0', ' ').strip()


    pattern = re.compile(
        r'<td class="t2" colspan="3">(For all designated states)<br/>\r\n\t(.*?)(<br/>\r\n\t(.*?))?(<br/>\r\n\t(.*?))?(<br/>\r\n\t(.*?))?(<br/>\r\n\t(.*?))?\r\n\t  / (.*?)<br/>'
    )
    matches = pattern.findall(texto)

    applicants_dict = []
    for match in matches:
        address_parts = [match[1]]

        if match[3]:
            address_parts.append(match[3])
        if match[5]:
            address_parts.append(match[5])
        if match[7]:
            address_parts.append(match[7])
        if match[9]:
            address_parts.append(match[9])

        address = ', '.join(address_parts)
        country = match[10] if len(match) > 10 else ''

        applicant_info = {
            'for': match[0],
            'name': match[1],
            'address': address,
            'country': country
        }
        applicants_dict.append(applicant_info)

    inventors_section_pattern = r'<td class="th" rowspan="\d+">Inventor\(s\)</td>(.*?)<td class="th"'
    inventors_section = re.search(inventors_section_pattern, texto, re.DOTALL)

    inventors_dict = []
    if inventors_section:
        inventor_rows = re.findall(r'<td class="t2" colspan="3">(.*?)</td>', inventors_section.group(1), re.DOTALL)
        for row in inventor_rows:
            inventor_info = re.search(r'(\d+)\s*/\s*(.*?)<br/>\s*(.*?)\s*/\s*(.*)', row, re.DOTALL)
            if inventor_info:
                inventor_dict = {
                    'number': inventor_info.group(1).strip(),
                    'name': inventor_info.group(2).strip(),
                    'address': inventor_info.group(3).strip(),
                    'country': inventor_info.group(4).strip()
                }
                inventors_dict.append(inventor_dict)


    priority_section_pattern = r'<td class="th" rowspan="\d+">Priority number, date</td>(.*?)<td class="th"'
    priority_section = re.search(priority_section_pattern, texto, re.DOTALL)

    priority_numbers = []
    if priority_section:
        priority_rows = re.findall(r'<td class="t2">(.*?)</td>.*?<td class="t3" colspan="2">(.*?)</td>', priority_section.group(1), re.DOTALL)
        for row in priority_rows:
            priority_number = row[0].strip()
            date_and_format = row[1].strip()

            date_match = re.search(r'(\d{2}\.\d{2}\.\d{4})', date_and_format)
            format_match = re.search(r'Original\s+published\s+format:\s+(.*)', date_and_format)

            date = date_match.group(1) if date_match else ''
            original_format = format_match.group(1).strip() if format_match else ''

            priority_numbers.append({
                priority_number: date
            })

    ipc_pattern = r'IPC:\s*</td>\s*<td class="t2">\s*(.*?)\s*</td>'
    ipc_matches = re.findall(ipc_pattern, texto, re.DOTALL)
    ipc_list = [re.sub(r'\s+', ' ', ipc).strip() for ipc in ipc_matches]

    cpc_pattern = r'CPC:</td>\s*<td class="t2" colspan="2">(.*?)</td>'
    cpc_matches = re.findall(cpc_pattern, texto, re.DOTALL)

    cpc_list = []
    for match in cpc_matches:
        soup = BeautifulSoup(match, 'html.parser')
        b_elements = soup.find_all('b')

        items = [elem.get_text().strip() for elem in b_elements]
        cpc_list.extend(items)


    parent_pattern = r'<td class="th"[^>]*>Parent application\(s\).*?<\/td>(.*?)(?:<td class="th"[^>]*>Divisional application\(s\)|<\/table>)'
    parent_match = re.search(parent_pattern, texto, re.DOTALL)
    parent_list = []
    if parent_match:
        parent_text = parent_match.group(1)
        parent_apps = re.findall(r'<td class="t2" colspan="3">(EP\d+\.\d+)\s*\/\s*<a.*?>(EP\d+)</a>', parent_text)
        for app in parent_apps:
            parent_list.append({
                app[0]: app[1]
            })

    divisional_pattern = r'<td class="th"[^>]*>Divisional application\(s\).*?<\/td>(.*?)<\/table>'
    divisional_match = re.search(divisional_pattern, texto, re.DOTALL)
    divisional_list = []
    if divisional_match:
        divisional_text = divisional_match.group(1)
        divisional_apps = re.findall(r'<td class="t2" colspan="3">(EP\d+\.\d+)\s*\/\s*<a.*?>(EP\d+)</a>', divisional_text)
        for app in divisional_apps:
            divisional_list.append({
                app[0]: app[1]
            })

    opposition_pattern = r'<td class="th">\s*Opponent\(s\)</td>\s*<td class="t2" colspan="2">(.*?)</td>'
    opposition_raw = re.findall(opposition_pattern, texto, re.DOTALL)

    oppositions_list = []
    if opposition_raw:
        opposition_entries = re.split(r'</td>\s*</tr><td class="th">&nbsp;</td><td class="t2" colspan="2">', opposition_raw[0])
        for entry in opposition_entries:
            if entry.strip():

                parts = entry.strip().split('<br/>')
                if len(parts) >= 7:
                    opposition_info = {
                        'number': parts[0].strip(),
                        'date_received': parts[1].strip(),
                        'date_decision': parts[2].strip(),
                        'status': parts[3].strip(),
                        'name': parts[4].strip().replace('&nbsp;', ' '),
                        'address': ' '.join(parts[5:7]).strip().replace('&nbsp;', ' '),
                        'representative': ' '.join(parts[7:9]).strip().replace('&nbsp;', ' ')
                    }
                    oppositions_list.append(opposition_info)


    representatives_section_pattern = r'<td class="th" rowspan="\d+">Representative\(s\)</td>(.*?)<td class="th"'
    representatives_section = re.search(representatives_section_pattern, texto, re.DOTALL)

    representatives_dict = []
    if representatives_section:
        representative_rows = re.findall(r'<td class="t2" colspan="3">(.*?)</td>', representatives_section.group(1), re.DOTALL)
        for row in representative_rows:
            representative_info = re.search(r'(.*?)<br/>(.*?)<br/>(.*?)\s*/\s*(.*)', row, re.DOTALL)
            if representative_info:
                representative_dict = {
                    'name': representative_info.group(1).strip(),
                    'address_line1': representative_info.group(2).strip(),
                    'address_line2': representative_info.group(3).strip(),
                    'country': representative_info.group(4).strip()
                }
                representatives_dict.append(representative_dict)


    soup = BeautifulSoup(texto, 'html.parser')
    publication_section = soup.find('td', class_='th', string=re.compile('Publication', re.IGNORECASE))

    publications_list = []
    if publication_section:
        current_publication = {}
        next_element = publication_section.next_sibling

        while next_element:
            if isinstance(next_element, NavigableString):
                next_element = next_element.next_sibling
                continue

            if next_element.name == 'td' and 'th' in next_element.get('class', []):
                field = next_element.text.strip().rstrip(':')
                value_td = next_element.find_next_sibling('td', class_='t2')

                if value_td:
                    value = value_td.text.strip()

                    if field == 'Type':
                        if current_publication:
                            publications_list.append(current_publication)
                            current_publication = {}
                        current_publication[field] = value
                    elif field in ['No.', 'Date', 'Language']:
                        current_publication[field] = value
            elif next_element.name == 'td' and 't2' in next_element.get('class', []):
                if '[' in next_element.text and ']' in next_element.text:
                    current_publication['Publication week'] = next_element.text.strip()
                    publications_list.append(current_publication)
                    current_publication = {}

            if next_element.name == 'td' and 'th' in next_element.get('class', []) and next_element.has_attr('rowspan'):
                if next_element != publication_section:
                    break

            next_element = next_element.next_sibling

        if current_publication:
            publications_list.append(current_publication)



    applicants_section = soup.find('td', class_='th', string=re.compile('Applicant\(s\)', re.IGNORECASE))
    applicants_list = []

    if applicants_section:
        next_element = applicants_section.find_next_sibling('td', class_='t2')
        while next_element:
            if 't2' in next_element.get('class', []):
                applicant_text = next_element.get_text(strip=True, separator='|') if next_element else ''
                applicant_info = applicant_text.split('|')
                if len(applicant_info) >= 2:
                    applicant = {
                        'type': 'current',
                        'for': applicant_info[0],
                        'name': applicant_info[1],
                        'address': '|'.join(applicant_info[2:-1]) if len(applicant_info) > 3 else '',
                        'country': applicant_info[-1].strip('/ ') if len(applicant_info) > 2 else ''
                    }


                    if not any(a['name'] == applicant['name'] and a['country'] == applicant['country'] for a in applicants_list):
                        applicants_list.append(applicant)

            next_element = next_element.find_next_sibling(['td', 'tr'])
            if next_element and next_element.name == 'tr':
                next_element = next_element.find('td', class_='t2')
            if next_element and 'th' in next_element.get('class', []):
                break


    df = pd.DataFrame({
        'Patent': [patent],
        'Patent_Name': [patent_name],
        'Status': [status],
        'Most_Recent_Event_Date': [most_recent_event_date],
        'Most_Recent_Event': [most_recent_event],
        'Divisional_applications': [json.dumps(divisional_list)],
        'Parent_applications': [json.dumps(parent_list)],
        'Inventors': [json.dumps(inventors_dict)],
        'Priority_numbers': [priority_numbers],
        'Applicants': [json.dumps(applicants_dict)],
        'Representatives': [json.dumps(representatives_dict)],
        'IPC': [json.dumps(ipc_list)],
        'CPC': [json.dumps(cpc_list)],
        'Opponent': [json.dumps(oppositions_list)],
        'Publications': [json.dumps(publications_list)],
        'Applicants': [json.dumps(applicants_list)]
    })
    df_patent_family = pd.DataFrame()
    df_patent_family = procesar_patent_family('C:/Users/iponc/Downloads/HTMLS/Patents_Family/'+patent+'_Patent_Family.txt')
    df = pd.concat([df, df_patent_family], axis=1)

    return df


# Full path to the file containing patent information
ruta_completa = 'C:/Users/iponc/Downloads/HTMLS/EP0721594.txt'

try:
    # Check if the file exists
    if os.path.isfile(ruta_completa):

        # Open and read the file content
        with open(ruta_completa, 'r', encoding='utf-8') as file:
            contenido = file.read()
            # print(contenido)  # Commented out print statement

            # Parse the HTML content
            soup = BeautifulSoup(contenido, 'html.parser')

            # Extract the first table from the parsed HTML
            mytable = extraer_tabla(soup)[0]

            # Process the table and extract values into a DataFrame
            df_actual = extraer_valores(str(mytable))

except Exception as e:
    # Handle any unexpected errors during file processing
    print(f"Unexpected error while processing the file: {str(e)}")

df_actual

Unnamed: 0,Patent,Patent_Name,Status,Most_Recent_Event_Date,Most_Recent_Event,Divisional_applications,Parent_applications,Inventors,Priority_numbers,Applicants,Representatives,IPC,CPC,Opponent,Publications,patent_family,counterType_patent_family,firstDate_patent_family,lastDate_patent_family
0,EP0721594,AUTONOMOUS CRUISE CONTROL,No opposition filed within time limit,28.12.2007,Lapse of the patent in a contracting state\nNe...,"[{""EP01109826.6"": ""EP1167108""}]",[],"[{""number"": ""01"", ""name"": ""WOLL, Jerry"", ""addr...",[{'US19930130585': '01.10.1993'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""R\u00fcger Abel Patentanw\u00e4lte ...","[""G01S13/60, B60K31/00, G01S13/93<br/>""]","[""B60K31/0008 (EP,KR,US);"", ""B60W30/146 (EP);""...",[],"[{""Type"": ""A1\u00a0Application with search rep...","[{'Publication No.': 'JP3104998B', 'Date': '30...","{'B2': 1, 'A': 4, 'A1': 4, 'B1': 1, 'A4': 1, '...",1995-04-13,2003-05-08


In [None]:
df_actual['Applicants'][0]

'[{"type": "current", "for": "For all designated states", "name": "VORAD SAFETY SYSTEMS, INC.", "address": "10802 Willow Court", "country": "San Diego, CA 92127\\n\\t  / US"}]'

In [None]:
# Directory containing HTML files to process
carpeta = 'C:/Users/iponc/Downloads/HTMLS'

# Initialize an empty DataFrame to store all extracted data
df_general = pd.DataFrame()

# Iterate through each file in the specified directory
for nombre_archivo in os.listdir(carpeta):
    try:
        # Construct the full path for each file
        ruta_completa = os.path.join(carpeta, nombre_archivo)

        # Check if it's a file (not a subdirectory)
        if os.path.isfile(ruta_completa):
            print(f"Processing file: {nombre_archivo}")

            # Open and read the file content
            with open(ruta_completa, 'r', encoding='utf-8') as file:
                contenido = file.read()

                # Parse the HTML content
                soup = BeautifulSoup(contenido, 'html.parser')

                # Extract the first table from the parsed HTML
                mytable = extraer_tabla(soup)[0]

                # Process the table and extract values into a DataFrame
                df_actual = extraer_valores(str(mytable))

                # Append the extracted data to the general DataFrame
                df_general = pd.concat([df_general, df_actual], ignore_index=True)

    except Exception as e:
        # Handle any unexpected errors during file processing
        print(f"Error inesperado al procesar el archivo {nombre_archivo}: {str(e)}")
        # Continue with the next file even if there's an error
        continue

# Print the total number of rows in the general DataFrame
print(f"Total de filas en el DataFrame general: {len(df_general)}")

Procesando archivo: EP0721594.txt
Procesando archivo: EP0737549.txt
Error inesperado al procesar el archivo C:/Users/iponc/Downloads/HTMLS/Patents_Family/EP0737549_Patent_Family.txt: list index out of range
Procesando archivo: EP0788617.txt
Procesando archivo: EP0885233.txt
Error inesperado al procesar el archivo C:/Users/iponc/Downloads/HTMLS/Patents_Family/EP0885233_Patent_Family.txt: list index out of range
Procesando archivo: EP1105513.txt
Procesando archivo: EP1127791.txt
Procesando archivo: EP1164138.txt
Procesando archivo: EP1167108.txt
Procesando archivo: EP1168032.txt
Procesando archivo: EP1250728.txt
Procesando archivo: EP1250983.txt
Procesando archivo: EP1250989.txt
Procesando archivo: EP1251150.txt
Procesando archivo: EP1443598.txt
Procesando archivo: EP1475410.txt
Error inesperado al procesar el archivo C:/Users/iponc/Downloads/HTMLS/Patents_Family/EP1475410_Patent_Family.txt: list index out of range
Procesando archivo: EP1489185.txt
Procesando archivo: EP1502461.txt
Proce

In [None]:
df_general

Unnamed: 0,Patent,Patent_Name,Status,Most_Recent_Event_Date,Most_Recent_Event,Divisional_applications,Parent_applications,Inventors,Priority_numbers,Applicants,Representatives,IPC,CPC,Opponent,Publications,patent_family,counterType_patent_family,firstDate_patent_family,lastDate_patent_family
0,EP0721594,AUTONOMOUS CRUISE CONTROL,No opposition filed within time limit,28.12.2007,Lapse of the patent in a contracting state\nNe...,"[{""EP01109826.6"": ""EP1167108""}]",[],"[{""number"": ""01"", ""name"": ""WOLL, Jerry"", ""addr...",[{'US19930130585': '01.10.1993'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""R\u00fcger Abel Patentanw\u00e4lte ...","[""G01S13/60, B60K31/00, G01S13/93<br/>""]","[""B60K31/0008 (EP,KR,US);"", ""B60W30/146 (EP);""...",[],"[{""Type"": ""A1\u00a0Application with search rep...","[{'Publication No.': 'JP3104998B', 'Date': '30...","{'B2': 1, 'A': 4, 'A1': 4, 'B1': 1, 'A4': 1, '...",1995-04-13,2003-05-08
1,EP0737549,Abrasive tape and process for producing it,No opposition filed within time limit,24.10.2003,No opposition filed within time limit,"[{""EP02012459.0"": ""EP1250983""}]",[],"[{""number"": ""01"", ""name"": ""Fujii, Kazuhito, c/...","[{'JP19950109088': '10.04.1995'}, {'JP19960005...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""M\u00fcller-Bor\u00e9 &amp; Partner...","[""B24D3/00, B24D11/00, B24D3/28<br/>""]","[""B24B19/226 (EP,US);"", ""B24D3/00 (KR);"", ""B24...",[],"[{""Type"": ""A2\u00a0Application without search ...",,,NaT,NaT
2,EP0788617,MINIATURE OPTICAL SCANNER FOR A TWO AXIS SCANN...,No opposition filed within time limit,04.01.2008,Lapse of the patent in a contracting state\nNe...,"[{""EP01202983.1"": ""EP1168032""}]",[],"[{""number"": ""01"", ""name"": ""MELVILLE, Charles, ...",[{'US19940329508': '26.10.1994'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Hitchcock, Esmond Antony"", ""address...","[""G02B26/08, G02B26/10, G02B7/182<br/>""]","[""G02B26/10 (EP,US);"", ""G02B26/101 (EP,US);"", ...",[],"[{""Type"": ""A1\u00a0Application with search rep...","[{'Publication No.': 'JP2001523350', 'Date': '...","{'A': 4, 'A3': 1, 'A2': 1, 'B1': 2, 'A4': 1, '...",1996-05-09,2006-08-22
3,EP0885233,TERPYRIDINE-PLATINUM(II) COMPLEXES,No opposition filed within time limit,25.04.2003,No opposition filed within time limit,"[{""EP01121776.7"": ""EP1164138""}]",[],"[{""number"": ""01"", ""name"": ""LOWE, Gordon"", ""add...",[{'GB19960001603': '26.01.1996'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Ellis-Jones, Patrick George Armine,...","[""C07F15/00, C07D213/06, C07D213/22, C07H19/06...","[""C07H21/00 (EP,US);"", ""A61P33/00 (EP);"", ""A61...",[],"[{""Type"": ""A1\u00a0Application with search rep...",,,NaT,NaT
4,EP1105513,METHODS AND COMPOSITIONS FOR USE IN SPLICEOSOM...,The application is deemed to be withdrawn,14.01.2005,Application deemed to be withdrawn,"[{""EP04077408.5"": ""EP1489185""}]",[],"[{""number"": ""01"", ""name"": ""MITCHELL, Lloyd, G....","[{'US19980133717': '13.08.1998'}, {'US19980158...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Lucas, Brian Ronald"", ""address_line...","[""C12N15/90, A61K48/00, C12N15/11, // A61P11:0...","[""C12N15/1027 (EP,US);"", ""A61K48/00 (EP,US);"",...",[],"[{""Type"": ""A2\u00a0Application without search ...","[{'Publication No.': 'WO9722250', 'Date': '26....","{'A1': 21, 'A3': 3, 'A2': 6, 'A': 13, 'B2': 8,...",1997-06-26,2015-11-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,EP3937083,SMART METAL CARD WITH RADIO FREQUENCY (RF) TRA...,Examination is in progress,15.11.2023,New entry: Renewal fee paid,[],"[{""EP15874266.8"": ""EP3238139""}]","[{""number"": ""01"", ""name"": ""HERSLOW, John"", ""ad...",[{'US201462095901P': '23.12.2014'}],"[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Scheele Jaeger Wetzel Patentanw\u00...","[""G06K19/077, G06K19/06<br/>""]","[""G06K19/07773 (EP,US);"", ""G06K19/02 (EP,US);""...",[],"[{""Type"": ""A1\u00a0Application with search rep...","[{'Publication No.': 'US9898699', 'Date': '20....","{'B2': 2, 'A1': 3, 'B1': 1, 'A4': 1, 'A2': 2, ...",2016-06-23,2022-04-11
144,EP3992203,PLANT CYTOCHROME P450,Request for examination was made,29.07.2024,New entry: Renewal fee paid,[],"[{""EP11748702.5"": ""EP2596012""}, {""EP16186318.8...","[{""number"": ""01"", ""name"": ""WINZER, Thilo"", ""ad...","[{'GB20100021707': '22.07.2010'}, {'GB20100012...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Symbiosis IP Limited"", ""address_lin...","[""C07K14/415, C12N15/82<br/>""]","[""C07K14/415 (EP,US);"", ""C12N15/8251 (US);"", ""...",[],"[{""Type"": ""A1\u00a0Application with search rep...","[{'Publication No.': 'US9725732', 'Date': '08....","{'B2': 6, 'A1': 10, 'A2': 4, 'B1': 2, 'A3': 2,...",2010-09-08,2023-12-19
145,EP4036079,COMPOUNDS AND COMPOSITIONS FOR INTRACELLULAR D...,Examination is in progress,09.07.2024,New entry: Reply to examination report,[],"[{""EP16831870.7"": ""EP3394030""}]","[{""number"": ""01"", ""name"": ""BENENATO, Kerry E.""...","[{'US201562271160P': '22.12.2015'}, {'US201562...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Cooley (UK) LLP"", ""address_line1"": ...","[""C07D211/14, C07D211/16, C07D295/13, C07D295/...","[""C07D211/14 (EP,US);"", ""A61K9/5123 (US);"", ""A...",[],"[{""Type"": ""A2\u00a0Application without search ...","[{'Publication No.': 'US2021161829', 'Date': '...","{'A1': 8, 'B2': 6, 'A': 2, 'B1': 2, 'A3': 1, '...",2017-06-29,2023-10-23
146,EP4215191,COMPOSITION COMPRISING A FUMARATE FOR USE IN A...,Request for examination was made,02.02.2024,The date on which the examining division becom...,[],"[{""EP15804260.6"": ""EP3220907""}, {""EP20195649.7...","[{""number"": ""01"", ""name"": ""Novas, Mark"", ""addr...","[{'US201462080783P': '17.11.2014'}, {'US201562...","[{""type"": ""current"", ""for"": ""For all designate...","[{""name"": ""Pohlman, Sandra M."", ""address_line1...","[""A61K31/225, A61P25/02<br/>""]","[""A61K31/225 (EP,CN,IL,KR,US);"", ""A61K47/542 (...",[],"[{""Type"": ""A1\u00a0Application with search rep...","[{'Publication No.': 'US2019008817', 'Date': '...","{'A1': 19, 'B2': 7, 'A': 17, 'B1': 2, 'B': 3, ...",2016-05-26,2024-06-13


In [None]:
df_general.to_csv('C:/Users/iponc/OneDrive/Escritorio/df_Preprocesado.csv', index=False)

In [None]:
""" Ejemplo patente
ruta_completa = 'C:/Users/iponc/Downloads/HTMLS/EP4036079.txt'  # Ruta de la carpeta HTMLS
try:
    if os.path.isfile(ruta_completa):

        with open(ruta_completa, 'r', encoding='utf-8') as file:
            contenido = file.read()
            #print(contenido)
            soup = BeautifulSoup(contenido, 'html.parser')
            mytable = extraer_tabla(soup)[0]
            df_actual = extraer_valores(str(mytable))

except Exception as e:
    print(f"Error inesperado al procesar el archivo {nombre_archivo}: {str(e)}")

df_actual
"""

' Ejemplo patente\nruta_completa = \'C:/Users/iponc/Downloads/HTMLS/EP4036079.txt\'  # Ruta de la carpeta HTMLS\ntry:\n    if os.path.isfile(ruta_completa):\n            \n        with open(ruta_completa, \'r\', encoding=\'utf-8\') as file:\n            contenido = file.read()\n            #print(contenido)\n            soup = BeautifulSoup(contenido, \'html.parser\')\n            mytable = extraer_tabla(soup)[0]\n            df_actual = extraer_valores(str(mytable))\n            \nexcept Exception as e:\n    print(f"Error inesperado al procesar el archivo {nombre_archivo}: {str(e)}")\n\ndf_actual\n'

In [None]:
def extract_publication_numbers(df):
    # Initialize an empty set to store unique publication numbers
    publication_numbers = set()

    # Iterate over each row of the dataframe
    for _, row in df.iterrows():
        # Check if the 'Divisional_applications' column has a value
        if row['Divisional_applications']:
            # Convert the string representation of a list of dictionaries to an actual list of dictionaries
            div_apps = ast.literal_eval(row['Divisional_applications'])
            # Iterate through each dictionary in the list
            for app in div_apps:
                # Update the set with all values from the current dictionary
                publication_numbers.update(app.values())

        # Check if the 'Parent_applications' column has a value
        if row['Parent_applications']:
            # Convert the string representation of a list of dictionaries to an actual list of dictionaries
            parent_apps = ast.literal_eval(row['Parent_applications'])
            # Iterate through each dictionary in the list
            for app in parent_apps:
                # Update the set with all values from the current dictionary
                publication_numbers.update(app.values())

    # Remove the original patent numbers present in the dataframe from the set
    publication_numbers = publication_numbers - set(df['Patent'])

    # Create a new dataframe from the unique publication numbers
    df2 = pd.DataFrame({'publication_numbers': list(publication_numbers)})

    # Return the new dataframe
    return df2

# Call the function to extract publication numbers and store the result in df2
df2 = extract_publication_numbers(df_general)
df2


Unnamed: 0,publication_numbers
