In [None]:
## Load necessary libraries

import requests # module to send HTTP requests
import os # module that provides a portable way of using operating-system-dependent functionality
import pandas as pd # data analysis and data manipulation tools
import numpy as np # fundamental package for scientific computing
import sys # system-specific parameters and functions
import zipfile # work with zip archives
import io # core tools to work with data streams
from bs4 import BeautifulSoup # library to scrape information from HTML or XML files
import re # regular expression operations

# Tell the notebook user that every import above succeeded
print("All necessary libraries are loaded!")

In [None]:
## Hide every warning from the users (this silences the ssl warnings), unless
## sys.warnoptions already holds warning options

if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter(action="ignore")

In [None]:
## Functions and classes to parse, download and generate reports from the
## Intellectual Property Journal (RPI - Revista de Propriedade Intelectual)

# Class to access and extract information from patent at INPI web application
class api_inpi:
    """Access the INPI web application and extract patent records from one RPI edition.

    Attributes:
        url: Address of the ZIP package that holds the patent section of the edition.
    """

    # Constructor to initialize the properties of the class
    def __init__(self, url):
        self.url = url

    def url_status(self):
        """Return the HTTP status code answered to a HEAD request for ``self.url``.

        Use: api_inpi(url).url_status() or
             self.url_status() inside class functions
        """
        r = requests.head(self.url) # make a HEAD request to the web page and return the HTTP header
        return r.status_code

    @staticmethod
    def _extract_xml(content):
        """Extract the first ``.xml`` member of a ZIP archive into the working directory.

        Args:
            content: Raw bytes of the ZIP archive.

        Returns:
            The member name of the extracted xml file, or ``None`` when the archive
            has no member whose name ends with ``.xml``.
        """
        with zipfile.ZipFile(io.BytesIO(content)) as z: # open the ZIP package in memory
            for member in z.namelist():
                if member.endswith('.xml'):
                    z.extract(member)
                    return member # the xml member itself, not whatever comes last in the archive
        return None

    def xml_extract(self):
        """Download the ZIP package at ``self.url`` and extract its xml file.

        Use: api_inpi(url).xml_extract() or
             self.xml_extract() inside class functions

        Returns:
            The name of the xml file extracted into the current working directory, or
            ``None`` when the server does not answer 200 or the package has no xml file.

        Raises:
            requests.HTTPError: If the download itself answers with an error status.
        """
        status = self.url_status() # a single HEAD request decides every branch below
        if status == 302: # the page was temporarily moved or the file does not yet exist
            print("The requested file does not exist.")
            return None
        if status != 200:
            print("The requested file could not be retrieved (HTTP status " + str(status) + ").")
            return None

        r = requests.get(self.url) # download from the url of this instance
        r.raise_for_status()
        f_xml = self._extract_xml(r.content)
        if f_xml is None:
            print("The downloaded package does not contain a xml file.")
        return f_xml

    @staticmethod
    def _text_or_none(tag):
        """Return the text of *tag*, or ``None`` when the tag was not found."""
        return None if tag is None else tag.text

    @staticmethod
    def _joined_names(tags):
        """Join the ``nome-completo`` text of every tag in *tags* with " | "."""
        return " | ".join(tag.find('nome-completo').text for tag in tags)

    @staticmethod
    def _ci_label(tag):
        """Format one international classification as "code (year)", or just the code without a year."""
        ano = tag.attrs.get('ano')
        return tag.text if ano is None else tag.text + " (" + ano + ")"

    def busca_titular(self,revista,org,df):
        """Append to *df* one row per dispatch of every patent held by *org* in this edition.

        Use: api_inpi(url).busca_titular(rpi number,"holder name",dataframe name)

        Args:
            revista: RPI number, stored in the first value of each appended row.
            org: Holder name to look for; matched case-insensitively as a substring of the
                ``nome-completo`` tags.
            df: DataFrame modified in place. Each appended row holds twelve values in this
                order: rpi, publication date, protection number, kind code, dispatch code,
                title, gestão ("Sim"/"Não"/""), holders, inventors, deposit date,
                international classification, search code. It must contain the columns
                'rpi', 'cod_busca' and 'cod_despacho', which are converted to int/str/str.

        Returns:
            None. When no xml file could be obtained, *df* is left untouched.
        """
        f_xml = self.xml_extract() # retrieve the xml file for the RPI
        if f_xml is None: # nothing was downloaded, so there is nothing to parse
            return

        try:
            with open(f_xml,'r') as infile: # the content is kept in memory from here on
                xml_data = infile.read()
        finally:
            os.remove(f_xml) # delete the xml file even if reading it failed

        xml_data = re.sub(' +',' ',xml_data) # replaces multiple spaces with single spaces
        soup = BeautifulSoup(xml_data, 'xml')
        org_key = org.casefold() # fold the holder name once so both sides of every comparison agree
        org_soup = soup.find_all('nome-completo', text=lambda x: x is not None and org_key in x.casefold())
        data_revista = soup.find('revista').attrs.get('dataPublicacao') # publication date of the RPI

        # NOTE(review): when the holder is matched in several dispatches of the same protection
        # number, every match appends the same rows again - confirm whether that is intended.
        for holder_tag in org_soup:
            despacho_soup = holder_tag.find_parent('despacho')
            numero = despacho_soup.find('numero').text # looked up once, not once per candidate tag
            # a protection number can appear more than once in a given RPI, so gather every
            # record that carries the same number
            registro_soup = soup.find_all('numero', text=lambda x: x is not None and numero in x)
            gestao = "" # tells whether the organization is the first holder of the patent

            for j, numero_tag in enumerate(registro_soup):
                registros = numero_tag.find_parent('despacho')

                cod_despacho = self._text_or_none(registros.find('codigo')) # dispatch code
                titulo = self._text_or_none(registros.find('titulo', inid="54")) # title
                data_deposito = self._text_or_none(registros.find('data-deposito')) # deposit date

                numero_registro = registros.find('numero')
                registro = numero_registro.text # protection number
                kindcode = numero_registro.attrs.get('kindcode') # protection kind code

                # search code used on the web server: the protection number without its
                # last two characters, without spaces and without 'BR'
                cod_busca = registro[:-2].replace(' ', '').replace('BR', '')

                ci = " | ".join(self._ci_label(c) for c in registros.find_all('classificacao-internacional'))

                # the ownership is decided only on the first record of a protection number
                if j == 0:
                    titular_1 = registros.find('titular', sequencia="1")
                    if titular_1 is not None:
                        nome_completo = titular_1.find('nome-completo').text
                        gestao = "Sim" if org_key in nome_completo.casefold() else "Não"

                titulares = self._joined_names(registros.find_all('titular'))
                inventores = self._joined_names(registros.find_all('inventor'))

                # store the extracted information in a new line at the end of the dataframe
                df.loc[len(df)] = [revista, data_revista,registro, kindcode, cod_despacho,titulo,gestao, titulares, inventores, data_deposito, ci,cod_busca]

        df['rpi'] = df['rpi'].astype('int') # set the rpi column as an integer type to avoid conflicts
        df['cod_busca'] = df['cod_busca'].astype('str') # set the cod_busca column as a string type to avoid conflicts
        df['cod_despacho'] = df['cod_despacho'].astype('str') # set the cod_despacho column as a string type to avoid conflicts

# Generate a report in xlsx format with the information parsed from the RPI
#     Use: excel_report(source dataframe,file name,inside file sheet name)
def excel_report(xls_df,xls_name,sheet_name):
    """Write *xls_df* to ``<xls_name>.xlsx`` with every column sized to its content.

    Args:
        xls_df: DataFrame to export; the index is not written.
        xls_name: Name of the output file, without the ``.xlsx`` extension.
        sheet_name: Name of the sheet created inside the file.
    """
    # The context manager saves and closes the file on exit; it replaces the explicit
    # writer.save() call, which newer pandas versions no longer provide
    with pd.ExcelWriter(xls_name + ".xlsx", engine='xlsxwriter', date_format='dd/mm/yyyy') as writer:
        xls_df.to_excel(writer, sheet_name=sheet_name, index=False)
        worksheet = writer.sheets[sheet_name] # get the xlsxwriter worksheet object

        for i, col in enumerate(xls_df.columns): # size each column to its longest value or its header
            longest = xls_df[col].astype(str).str.len().max()
            header_len = len(str(col))
            # an empty column has no longest value (NaN), so only the header counts
            column_len = header_len if pd.isna(longest) else max(int(longest), header_len)
            worksheet.set_column(i, i, column_len + 2) # two extra characters of padding

    print("RPI Report concluded!")

# Tell the notebook user that this cell finished running (the message is Portuguese for "Libraries loaded.")
print("Bibliotecas carregadas.")

In [None]:
## RPI selection

# Ask for the first edition and, optionally, for the last one; an empty answer to the
# second prompt selects a single edition
rpi_i = int(input("Digite o número da primeira RPI:\n"))
rpi_f = int(input("Digite o número da última RPI:\n(Caso seja somente uma edição, aperte ENTER)\n") or rpi_i)

# A last edition smaller than the first would leave the range loops of the following cells
# empty while the message below still announces rpi_i, so fall back to that single edition
if rpi_f < rpi_i:
    rpi_f = rpi_i

if rpi_f > rpi_i: # a range of editions
    print("The information will be parsed from the RPI", rpi_i,"to the", rpi_f)
else: # a single edition
    print("The information will be parsed from the RPI", rpi_i)

In [None]:
## Search the patent section of every selected RPI edition

# Columns of the dataframe that accumulates the extracted records
column_names = [
    'rpi', 'data de publicação', 'registro', 'kind code', 'cod_despacho', 'título',
    'gestão', 'titulares', 'inventores', 'data de deposito', 'classificação internacional', 'cod_busca',
]
patentes_df = pd.DataFrame(columns=column_names)

rpi = rpi_i # edition currently being processed
while rpi <= rpi_f:
    url = "http://revistas.inpi.gov.br/txt/P" + str(rpi) + ".zip" # ZIP package of this edition
    api_inpi(url).busca_titular(rpi,'universidade estadual de campinas',patentes_df) # append its records
    rpi += 1

# Name the report and its sheet after the range of editions or after the single edition
if rpi_f > rpi_i:
    f_name = "rpi_patentes_" + str(rpi_i) + "-" + str(rpi_f)
    s_name = str(rpi_i) + " - " + str(rpi_f)
else:
    f_name = "rpi_patentes_" + str(rpi_i)
    s_name = str(rpi_i)

# The last column (cod_busca) is left out of the xlsx report
report_df = patentes_df.iloc[:, :-1]
excel_report(report_df, f_name, s_name)

In [None]:
## Download the files from the web server

# Build, for every record, the name of the file to save and of the folder that receives it.
# In this example the folders follow a three-level pattern
#     PareceresRPI/BR 10 2014 021620-0/RPI2589_180820
# and the files follow the pattern
#     PI 1104516-7_Despacho 7.1_180820_A.pdf
publicacao = patentes_df['data de publicação']
date_tag = publicacao.str[0:2] + publicacao.str[3:5] + publicacao.str[8:11] # characters 0-1, 3-4 and 8-10 of the publication date
patentes_df['f_name'] = patentes_df['registro'] + "_Despacho " + patentes_df['cod_despacho'] + "_" + date_tag
patentes_df['folder_name'] = "PareceresRPI/" + patentes_df['registro'] + "/RPI" + patentes_df['rpi'].astype(str) + "_" + date_tag

# Keep only the columns that the download needs
wanted = ['rpi','registro','cod_despacho','inpi_name','f_name','folder_name','cod_busca']
patentes_df.drop(columns=patentes_df.columns.difference(wanted), inplace=True)

# Set the columns of the dataframe that stores the file names listed on the INPI web server
# for all selected editions
column_names = ['inpi_name','rpi','cod_busca']
frames = [] # one dataframe per edition, concatenated once after the loop

for rpi in range(rpi_i, rpi_f + 1): # for every selected edition
    url = "https://parecer.inpi.gov.br/arquivos/RPI/" + str(rpi) # set the url to make the search

    response = requests.get(url, verify=False) # request the listing once and reuse the response
    if response.status_code != 200: # check if the files are available at the web server
        print("There are no files on the server for this edition.")
        sys.exit()

    # read the first table of the page, skipping its first two rows and its last row
    listing = pd.read_html(response.content, encoding='utf-8', header=0)[0].iloc[2:-1]

    # build a new dataframe instead of writing into the slice returned by iloc
    df_temp = pd.DataFrame({'inpi_name': listing['Name']}) # keep only the Name column, renamed
    df_temp['rpi'] = rpi # RPI number of this edition for all records
    df_temp['cod_busca'] = df_temp['inpi_name'].str[3:-11] # search code: the name without its first 3 and last 11 characters
    frames.append(df_temp)

# DataFrame.append no longer exists in pandas 2, so the editions are joined with pd.concat
if frames:
    df_inpi = pd.concat(frames, ignore_index=True)
else:
    df_inpi = pd.DataFrame(columns=column_names)

# Attach to every RPI record the name of the matching file listed on the INPI web server
patentes_df = patentes_df.merge(df_inpi, how='left', on=['rpi','cod_busca'])
patentes_df.drop(columns=['cod_busca'], inplace=True) # the search code is not needed after the merge

# Remove the records that lack a register number or a file name on the server
patentes_df.dropna(subset=['registro', 'inpi_name'], inplace=True)

# Dispatch codes whose documents must not be downloaded:
#     15.11 - the documents are duplicated between 15.11 and 6.22
#     8.7   - the documents are duplicated between 7.5 and 8.7
#     16.1  - the patent certificate is not on the same server as the patent reports
#             and must be downloaded directly from the INPI website
skipped_codes = ['15.11', '8.7', '16.1']

# Filter the rows directly instead of marking them with NaN in a boolean helper column:
# writing a value of another dtype into an existing column is deprecated in recent pandas
patentes_df = patentes_df[~patentes_df['cod_despacho'].isin(skipped_codes)].copy()

# Add count flags to the file names when a register number has more than one document.
# The numeric counters become alphabetic flags: 0 -> _A, 1 -> _B, ... 25 -> _Z; a counter
# past the alphabet falls back to "_<counter>" so that no file name ends up as NaN
dict_letters = {n: '_' + chr(ord('A') + n) for n in range(26)}
position = patentes_df.groupby('registro').cumcount() # position of each document within its register number
repeated = patentes_df['registro'].duplicated(keep=False) # True when the register number appears more than once

# The suffixes are built as strings from the start, so nothing of another dtype is ever
# written into the integer counter column
suffixes = [
    dict_letters.get(n, '_' + str(n)) if is_repeated else ''
    for n, is_repeated in zip(position, repeated)
]
suffix_column = pd.Series(suffixes, index=patentes_df.index, dtype=object)
patentes_df['f_name'] = patentes_df['f_name'] + suffix_column + '.pdf' # add the count flag and the pdf extension

# Download every remaining document into its folder
for folder_name,inpi_name,f_name,rpi in zip(patentes_df['folder_name'],patentes_df['inpi_name'],patentes_df['f_name'],patentes_df['rpi']):
    os.makedirs(folder_name, exist_ok=True) # create the download folder when it does not exist yet
    url_cam = "http://parecer.inpi.gov.br/download.php?cam=arquivos/RPI/" + str(rpi) + "/" + inpi_name # set the url to download
    f_request = requests.get(url_cam, verify = False) # request data from the resource
    if not f_request.ok: # do not save an error page under a .pdf name
        print("Could not download " + inpi_name + " (HTTP status " + str(f_request.status_code) + ").")
        continue
    path = folder_name + "/" + f_name # set the path to save the file
    with open(path, 'wb') as f: # the file is closed as soon as the content is written
        f.write(f_request.content)

print("Download completed!")