Importing the necessary libraries

In [8]:
from epo.tipdata.epab import EPABClient
import os
import numpy as np
import pandas as pd
import re

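Define two helper functions that make the abstracts and claims readable: `extract_texts` collects the text of each claims entry, and `remove_brackets` strips markup tags (`<...>`) from the text.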

In [1]:
def extract_texts(claims):
    """Collect the 'text' value of every claims entry that has one.

    Parameters
    ----------
    claims : iterable of mappings, or None / NaN
        The claims entries held in one cell of the 'claims' column. A missing
        cell (None, or a float NaN) is treated as "no claims".

    Returns
    -------
    list
        The 'text' values in their original order; empty if there are none.
    """
    # A missing cell is not iterable. Report it as "no texts" so that callers
    # filtering on len(...) > 0 drop the row instead of crashing.
    # `claims != claims` is True only for NaN.
    if claims is None or (isinstance(claims, float) and claims != claims):
        return []
    return [claim['text'] for claim in claims if 'text' in claim]
def remove_brackets(text):
    """Strip markup tags (``<...>``) from a string, or from each item of a list.

    Parameters
    ----------
    text : str, list, or any other object
        A string, a list whose items are cleaned one by one, or any other
        value (for example None), which is returned unchanged.

    Returns
    -------
    Same kind as the input
        The string with every ``<...>`` span removed, a new list of cleaned
        items, or the untouched input when it is neither a string nor a list.
    """
    if isinstance(text, list):
        # Recurse so list items follow the same rules as a scalar: strings are
        # cleaned, anything else (e.g. None) passes through instead of making
        # re.sub raise TypeError.
        return [remove_brackets(item) for item in text]
    if isinstance(text, str):
        # `[^>]*` also crosses line breaks, so a tag whose attributes wrap onto
        # the next line is removed as well (`.` would stop at the newline).
        return re.sub(r'<[^>]*>', '', text)
    return text

In [4]:
# The client is created against the 'TEST' environment;
# pass env='PROD' instead to work on the full data.
epab = EPABClient(env='TEST')


The following code creates one folder per year covered by the `for` loop, named after that year, and stores in it the abstract and the claims of each publication as separate `.txt` files.

In [38]:
# Restrict every yearly query to publications whose abstract is in English.
q = epab.query_abstract_language("en")
for year in range(2006, 2024):
    # One output folder per year, named after the year.
    output_folder = str(year)
    os.makedirs(output_folder, exist_ok=True)

    # s is the intersection of the English-abstract query and this year's
    # publication-date query.
    p = epab.query_publication_date(str(year) + "%")
    s = q & p

    df = s.get_results("title.en,publication,ipc,abstract.text,claims", output_type="dataframe")
    df['abstract.text'] = df['abstract.text'].apply(remove_brackets)

    for _, row in df.iterrows():
        pub_number = row['publication.number']
        abstract = row['abstract.text']
        raw_claims = row['claims']

        # Skip rows without a usable abstract. The isinstance check matters
        # because a missing cell can be NaN, which is truthy and cannot be
        # written to a text file.
        if not isinstance(abstract, str) or not abstract:
            continue
        # Skip rows whose claims cell is missing (None / NaN) or empty.
        if raw_claims is None or isinstance(raw_claims, float) or len(raw_claims) == 0:
            continue

        # Use the same helper as the DataFrame cell: keep the first claims
        # entry that actually carries a 'text' value (no KeyError when the
        # key is absent).
        claim_texts = extract_texts(raw_claims)
        if not claim_texts or not isinstance(claim_texts[0], str):
            continue
        claims_text = remove_brackets(claim_texts[0])

        # File names: <publication number>_abstract.txt / _claims.txt
        abstract_filename = os.path.join(output_folder, f'{pub_number}_abstract.txt')
        claims_filename = os.path.join(output_folder, f'{pub_number}_claims.txt')

        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII characters and would make the output
        # machine-dependent.
        with open(claims_filename, 'w', encoding='utf-8') as claims_file:
            claims_file.write(claims_text)
        with open(abstract_filename, 'w', encoding='utf-8') as abstract_file:
            abstract_file.write(abstract)

If needed, we can also put everything into a single DataFrame (title, publication number, publication date, IPC codes, abstract, claims) and save it as a zipped CSV file.

In [54]:
# Selection: abstracts in English (q), publication period 2006-2023 (p),
# and their intersection (s).
q = epab.query_abstract_language("en")
p = epab.query_publication_date("20060101-20231231")
s = q & p

# Fetch the selected fields as a DataFrame.
requested_fields = "title.en,publication,ipc,abstract.text,claims"
df = s.get_results(requested_fields, output_type="dataframe")

# Abstracts: strip the markup tags.
df['abstract.text'] = df['abstract.text'].apply(remove_brackets)

# Claims: collect the texts of each entry, then strip their markup tags.
df['claims'] = df['claims'].apply(extract_texts)
df['claims'] = df['claims'].apply(remove_brackets)

# Keep only the rows that have at least one claims text, and retain the first one.
has_claims = df['claims'].apply(lambda texts: len(texts) > 0)
df = df[has_claims].reset_index(drop=True)
df['claims'] = df['claims'].apply(lambda texts: texts[0])

# Drop the publication columns that are not needed.
unneeded_columns = ['publication.country', 'publication.kind', 'publication.language']
df = df.drop(columns=unneeded_columns)

# Turn each IPC cell into a plain list of IPC symbols.
df['ipc'] = df['ipc'].apply(lambda entries: [entry['symbol'] for entry in entries])

# Write the DataFrame as a CSV file inside a zip archive.
csv_file_in_zip = '2006_2023_all_data.csv'
zip_options = {'method': 'zip', 'archive_name': csv_file_in_zip}
df.to_csv('2006_2023_all_data.zip', index=False, compression=zip_options)


In [55]:
# Preview only the first rows instead of rendering the whole DataFrame.
df.head(10)

Unnamed: 0,title.en,publication.number,publication.date,ipc,abstract.text,claims
0,CONTROL DEVICE FOR A TORQUE CONVERTER IN AN AU...,6943,19800123,[F16H45/02],In an automatic transmission for a vehicle in ...,1. A control device for a torque converter in ...
1,Physiologically adaptive cardiac pacemaker,7189,19800123,[A61N1/36],A cardiac pacemaker is disclosed having means ...,1. A physiologically adaptive cardiac pacemake...
2,Improved electrode array and method of making ...,7157,19800123,[A61F11/04],An electrode array 10 specifically for implant...,1. An electrode array comprising a flexible bi...
3,Decorative panel,6848,19800123,"[B44C5/04, B44F7/00, E04F13/00, B05D5/06]",A base coated substrate carries a pattern defi...,1. In a decorative panel comprising a pigmente...
4,Process for the preparation of cyclopropane de...,7154,19800123,"[C07C120/00, C07C121/48]",Compounds of formula:\nwherein both groups X a...,1. A process for the preparation of a compound...
5,Fuel injection pump assembly,7799,19800206,[F02M59/04],A fuel injection pump assembly includes a sing...,1. A plunger type fuel injection pump assembly...
6,AUTOMATIC FOCUSING SYSTEM,7902,19800206,"[G11B7/08, G11B7/12, G02B7/11]",An automatic focusing system to focus automati...,1) An automatic focusing apparatus which compr...
7,Fluid bearing,7409,19800206,[F16C32/06],A fluid bearing for rotatably supporting a rot...,1. A fluid bearing for rotatably supporting a ...
8,Flexible container for liquids,7685,19800206,"[B65D53/08, B65B7/02, B65D85/72, B65D30/24]",A flexible container (1) for liquids in which ...,"1. A flexible container for liquids, the conta..."
9,Sheet material cutting device,7523,19800206,[B23D27/02],A generally portable sheet material cutting to...,1. Apparatus for cutting sheet material compri...
