In [162]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
import json
import time
import random
from ast import literal_eval

In [163]:
# Get lapto's headers: https://httpbin.org/anything
headers =  {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 
    #"Accept-Encoding": "gzip, deflate, br, zstd", 
    "Accept-Language": "en-US,en;q=0.9,es;q=0.8", 
    "Dnt": "1", 
    #"Host": "httpbin.org", 
    "Referer": "https://www.scraperapi.com/", 
    "Sec-Ch-Ua": "\"Google Chrome\";v=\"125\", \"Chromium\";v=\"125\", \"Not.A/Brand\";v=\"24\"", 
    "Sec-Ch-Ua-Mobile": "?0", 
    "Sec-Ch-Ua-Platform": "\"Windows\"", 
    "Sec-Fetch-Dest": "document", 
    "Sec-Fetch-Mode": "navigate", 
    "Sec-Fetch-Site": "cross-site", 
    "Sec-Fetch-User": "?1", 
    "Upgrade-Insecure-Requests": "1", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-665ec732-28d2172a4721ed726fc59ceb"
  }

def get_page(an):
    URL = f"https://register.epo.org/smartSearch?query={an}" #https://register.epo.org/smartSearch?query= #https://register.epo.org/application?number=EP
    page = requests.get(URL,headers=headers)
    return page.text

In [164]:
def extraer_tabla(soup):
    tablas = soup.find_all('table')
    return tablas

In [165]:
def extraer_valores(texto):
    # Extraer número de patente
    patent = re.findall(r'<span class="highlight">EP<span class="highlight">(\d+)</span></span>', texto)
    if len(patent) == 0:
        patent = ''
    else:
        patent = 'EP' + patent[0]

    # Extraer nombre de la patente
    patent_name = re.findall(r'</span></span> - (.*?)</a>', texto)
    if len(patent_name) == 0:
        patent_name = ''
    else:
        patent_name = patent_name[0].strip()  

    # Extraer Status
    status = re.findall("""Status</td><td class="t2" colspan="3">.*?<br/>""", texto)
    if len(status) == 0:
        status = ''
    else:
        status = status[0]
        status = status.replace("""Status</td><td class="t2" colspan="3">""","").replace('<br/>','').strip()

    #Most Recent Event date
    most_recent_event_date = re.findall(r'Most recent event.*?<td class="t2">(.*?)</td>', texto, re.DOTALL)
    if len(most_recent_event_date) == 0:
        most_recent_event_date = ''
    else:
        most_recent_event_date = most_recent_event_date[0].strip()

    #Most recent event
    most_recent_event = re.findall(r'Most recent event.*?<td class="t3">(.*?)</td>', texto, re.DOTALL)
    if len(most_recent_event) == 0:
        most_recent_event = ''
    else:
        most_recent_event = most_recent_event[0].replace('<br/>', '').replace('\r\n', '').replace('\t', '').replace('\xa0', ' ').strip()

    #Divisional Applications
    divisional_apps = re.findall(r'<td class="t2" colspan="3">(EP\d+\.\d+)\s*\/\s*<a.*?>(EP\d+)</a>', texto)
    divisional_dict = dict(divisional_apps)

    # Parent Applications
    parent_apps = re.findall(r'<td class="t2" colspan="3">(EP\d+\.\d+)\s*\/\s*<a.*?>(EP\d+)</a>', texto)
    parent_dict = dict(parent_apps)

    # Extraer inventores
    inventors = re.findall(r'<td class="t2" colspan="3">(\d+)\xa0/\r\n\t(.*?)<br/>\r\n\t(.*?)<br/>\r\n\t(.*?)\r\n\t  / (.*?)<br/>', texto)
    inventors_dict = []
    for inv in inventors:
        inventor_info = {
            'number': inv[0],
            'name': inv[1],
            'address': inv[2] + ', ' + inv[3],
            'country': inv[4]
        }
        inventors_dict.append(inventor_info)

    # Extraer publicaciones 
    publications = re.findall(r'<td class="th">Type:\r\n\s*</td>\s*<td class="t2" colspan="2">(.*?)</td>\s*<td class="th">No.:</td>\s*<td class="t2" colspan="2">(.*?)</td>\s*<td class="th">Date:</td>\s*<td class="t2" colspan="2">(.*?)</td>\s*<td class="th">Language:</td>\s*<td class="t2" colspan="2">(.*?)</td>', texto)
    publications_list = []
    for pub in publications:
        # Extraer el texto del enlace si existe, o usar el texto plano si no hay enlace
        type_text = re.search(r'>([^<]+)</a>$', pub[0])
        if type_text:
            pub_type = type_text.group(1).strip()
        else:
            pub_type = re.sub(r'\xa0', ' ', pub[0]).strip()
        publication_info = {
            'type': pub_type,
            'number': re.sub(r'<.*?>', '', pub[1]).strip(),
            'date': pub[2].strip(),
            'language': pub[3].strip()
        }
        publications_list.append(publication_info)

    # Extraer clasificaciones IPC
    ipc_pattern = r'IPC:\s*</td>\s*<td class="t2">\s*(.*?)\s*</td>'
    ipc_matches = re.findall(ipc_pattern, texto, re.DOTALL)
    ipc_list = [re.sub(r'\s+', ' ', ipc).strip() for ipc in ipc_matches]

    # Extraer clasificaciones CPC
    cpc_pattern = r'CPC:</td>\s*<td class="t2" colspan="2">(.*?)</td>'
    cpc_matches = re.findall(cpc_pattern, texto, re.DOTALL)
    
    cpc_list = []
    for match in cpc_matches:
        soup = BeautifulSoup(match, 'html.parser')
        b_elements = soup.find_all('b') # Encontrar todos los elementos <b> dentro del bloque
        
        # Extraer el texto y limpiarlo
        items = [elem.get_text().strip() for elem in b_elements]
        print("Elementos encontrados:", items)  # Depuración
        cpc_list.extend(items)
    
    
    # Crear DataFrame
    df = pd.DataFrame({
        'Patent': [patent],
        'Patent_Name': [patent_name],
        'Status': [status],
        'Most_Recent_Event_Date': [most_recent_event_date],
        'Most_Recent_Event': [most_recent_event],
        'Divisional_applications': [json.dumps(divisional_dict)],
        'Parent_applications': [json.dumps(parent_dict)],
        'Inventors': [json.dumps(inventors_dict)],
        'Publications': [json.dumps(publications_list)],
        'IPC': [json.dumps(ipc_list)],
        'CPC': [json.dumps(cpc_list)]
    })
    
    return df

In [166]:
# def extraer_valores(texto):    
#     # Extraer clasificaciones IPC
#     ipc_pattern = r'IPC:\s*</td>\s*<td class="t2">\s*(.*?)\s*</td>'
#     ipc_matches = re.findall(ipc_pattern, texto, re.DOTALL)
#     ipc_list = [re.sub(r'\s+', ' ', ipc).strip() for ipc in ipc_matches]

#     # Extraer clasificaciones CPC
#     cpc_pattern = r'CPC:</td>\s*<td class="t2" colspan="2">(.*?)</td>'
#     cpc_matches = re.findall(cpc_pattern, texto, re.DOTALL)
    
#     cpc_list = []
#     for match in cpc_matches:
#         soup = BeautifulSoup(match, 'html.parser')
#         b_elements = soup.find_all('b') # Encontrar todos los elementos <b> dentro del bloque
        
#         # Extraer el texto y limpiarlo
#         items = [elem.get_text().strip() for elem in b_elements]
#         print("Elementos encontrados:", items)  # Depuración
#         cpc_list.extend(items)


#     # Crear DataFrame
#     df = pd.DataFrame({
#         'IPC_Classifications': [json.dumps(ipc_list)],
#         'CPC_Classifications': [json.dumps(cpc_list)]
#     })
#     return df

In [167]:
htmlContent = get_page(1492505)
if(htmlContent and "Just a moment..." not in htmlContent):
    soup = BeautifulSoup(htmlContent, 'html.parser')
    mytable = extraer_tabla(soup)[0]    
    texto_limpio = extraer_valores(str(mytable))
    print(texto_limpio)
else:
    print("WebScrapping no disponible")


Elementos encontrados: ['A61K31/485 (EP,CN,US);', 'A61K9/0053 (US);', 'A61K9/16 (KR);', 'A61K9/1617 (EP,US);', 'A61K9/1652 (EP,US);', 'A61K9/20 (KR);', 'A61K9/2009 (US);', 'A61K9/2013 (EP,US);', 'A61K9/2018 (EP,US);', 'A61K9/2054 (EP,US);', 'A61K9/2077 (EP,US);', 'A61K9/2095 (EP,US);', 'A61K9/2866 (EP,US);', 'A61K9/48 (KR);', 'A61K9/70 (US);', 'A61P1/00 (EP);', 'A61P1/04 (EP);', 'A61P1/10 (EP);', 'A61P13/12 (EP);', 'A61P17/04 (EP);', 'A61P25/00 (EP);', 'A61P25/04 (EP);', 'A61P25/36 (EP);', 'A61P29/00 (EP);', 'A61P43/00 (EP)']
      Patent                                        Patent_Name  \
0  EP1492505  PHARMACEUTICAL PREPARATION CONTAINING OXYCODON...   

           Status Most_Recent_Event_Date  \
0  Patent revoked             26.05.2017   

                            Most_Recent_Event  \
0  Lapse of the patent in a contracting state   

                             Divisional_applications  \
0  {"EP10176720.0": "EP2319496", "EP11177513.6": ...   

                                

In [169]:
texto_limpio

Unnamed: 0,Patent,Patent_Name,Status,Most_Recent_Event_Date,Most_Recent_Event,Divisional_applications,Parent_applications,Inventors,Publications,IPC,CPC
0,EP1492505,PHARMACEUTICAL PREPARATION CONTAINING OXYCODON...,Patent revoked,26.05.2017,Lapse of the patent in a contracting state,"{""EP10176720.0"": ""EP2319496"", ""EP11177513.6"": ...","{""EP10176720.0"": ""EP2319496"", ""EP11177513.6"": ...","[{""number"": ""01"", ""name"": ""BR\u00d6GMANN, Bian...","[{""type"": ""A2 Application without search repor...","[""A61K9/16, A61K9/20, A61K9/22, A61K31/485<br/>""]","[""A61K31/485 (EP,CN,US);"", ""A61K9/0053 (US);"",..."


In [None]:
texto_limpio['CPC_Classifications'][0]

In [None]:
texto_limpio['IPC_Classifications']

In [170]:
# #Acá se obtiene el id atómico para una divissional application

# json_str = texto_limpio['Divisional_applications'][0]

# # Convertir la cadena JSON en un diccionario de Python
# json_dict = json.loads(json_str)

# # Obtener el primer key del diccionario
# primer_key = list(json_dict.keys())[0]

# primer_key

'EP10176720.0'