*Licensed under the MIT License. See LICENSE-CODE in the repository root for details.*

*Copyright (c) 2025 Eleni Kamateri*

### Parsing and CSV File Generation for WPI Analysis

This script generates a CSV file containing essential data for analyzing patent documents in a core vertical of the WPI dataset. The file includes key information such as:
 
1. Document ucid
2. Document date
3. Classification Labels
4. The presence of an English-language abstract, description, and claims

#### Requirements

1. This script should be applied to the extracted patent documents, which are organized into separate folders. It requires that the script "*7z Files Extraction and Organization by Vertical.ipynb*" has been run first.

#### Label Formatting

Labels for each document are concatenated into a single string, separated by commas. Because these commas would clash with the default CSV delimiter, the CSV file is written with a semicolon (;) as the field separator.

#### Configurable Parameters

Researchers can customize the extraction process using the following parameters:

**vertical_origin_path** – Path to the core vertical of the WPI dataset, containing the extracted files to be parsed for CSV creation. 

        Example: "/YOUR_PATH/WPI-Dataset/EP/".  

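**csv_file_name** – The base name of the CSV file that will be generated, without the ".csv" extension. The code of the selected vertical (e.g. "EP_") is prepended automatically.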
        
        Example: "csv_file_for_wpi_analysis".

**main_further** – Include main and further classification labels?
        
        0: No
        1: Yes


**ipcr** – Include IPCR classification labels?
        
        0: No
        1: Yes
    
**cpc** – Include CPC classification labels?
        
        0: No
        1: Yes
        

**vertical** – Select dataset vertical:
       
        0: EP
        1: WO
        2: US
        3: CN
        4: JP
        5: KR
        
#### Processing Time Estimates

    EP vertical: ~58,777.60 sec
    WO vertical: ~42,645.45 sec

### Import all required libraries for the script

In [1]:
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import time

### Set the required parameters for the script

In [2]:
# Root folder of the extracted vertical that will be parsed.
vertical_origin_path = "/YOUR_PATH/WPI-Dataset/EP/"

# Base name of the generated CSV file (no extension).
csv_file_name = "csv_file_for_wpi_analysis"

# Label switches, in order: main/further, IPCR, CPC (1 = include, 0 = leave out).
main_further, ipcr, cpc = 0, 1, 0

# Dataset vertical: 0=EP, 1=WO, 2=US, 3=CN, 4=JP, 5=KR.
vertical = 0

In [3]:
# Prefix the CSV file name with the code of the selected vertical.
# The original chain compared every branch after the first against 1, so the
# US, CN, JP and KR verticals (2-5) were rejected as invalid; a lookup table
# keeps the number-to-code mapping in one place.
vertical_prefixes = {0: "EP", 1: "WO", 2: "US", 3: "CN", 4: "JP", 5: "KR"}

if vertical in vertical_prefixes:
    csv_file_name = vertical_prefixes[vertical] + "_" + csv_file_name
else:
    print("Provide a valid vertical number")

### Parse the data and generate the CSV file

In [None]:
# Count the time
start_time = time.time()

# Number of nested folder levels (CC/nnnnnn/nn/nn/nn/nn) between the vertical
# root and the XML files.
FOLDER_DEPTH = 6


def _iter_document_files(directory, depth):
    """Yield ``(folder, file_name)`` for each entry ``depth`` folders below ``directory``.

    Entries that are not directories are skipped at the intermediate levels, so
    a stray file (for example a CSV previously saved into the vertical root)
    does not abort the traversal.
    """
    for entry in os.listdir(directory):
        if depth == 0:
            yield directory, entry
            continue
        sub_directory = os.path.join(directory, entry)
        if os.path.isdir(sub_directory):
            yield from _iter_document_files(sub_directory, depth - 1)


def _joined_unique_texts(soup, tag_name):
    """Return the texts of all ``tag_name`` elements as one ", "-joined string.

    Duplicates are dropped and the order of first appearance is kept.
    """
    return ", ".join(dict.fromkeys(tag.getText() for tag in soup.find_all(tag_name)))


def _has_english_section(soup, tag_name):
    """Return 1.0 if ``soup`` contains a ``tag_name`` element with lang="EN", else 0.0.

    The flag is a float so the CSV keeps the 1.0/0.0 format that the
    cell-by-cell DataFrame filling used previously produced.
    """
    return float(soup.find(tag_name, attrs={'lang': 'EN'}) is not None)


# One dict per document. The DataFrame is built once after the loop, because
# enlarging it cell by cell with .loc reallocates it over and over and becomes
# very slow for a full vertical.
rows = []
counter_class = 0

for folder, files in _iter_document_files(vertical_origin_path, FOLDER_DEPTH):
    counter_class += 1
    if counter_class % 100000 == 0:
        print(counter_class)  # progress indicator

    # The context manager closes the handle right after reading.
    with open(os.path.join(folder, files), 'r', encoding='utf-8') as xml_file:
        soup = BeautifulSoup(xml_file.read(), 'xml')
    document_info = soup.find_all("patent-document")

    # IndexError: no <patent-document> element; KeyError: attribute missing.
    try:
        ucid = document_info[0]['ucid']
    except (IndexError, KeyError):
        ucid = ''
        print("Exception 1, ucid does not exist", files)

    try:
        date = document_info[0]['date']
    except (IndexError, KeyError):
        date = ''
        print("Exception 2, date does not exist", files)

    # Key insertion order defines the column order of the resulting DataFrame.
    row = {'xml_file_name': files, 'ucid': ucid, 'date': date}

    if main_further == 1:
        # When several main classifications exist, the last one is kept.
        main_tags = soup.find_all('main-classification')
        row['main_classification'] = main_tags[-1].getText() if main_tags else ''
        row['further_classification'] = _joined_unique_texts(soup, 'further-classification')

    if ipcr == 1:
        row['classification_ipcr'] = _joined_unique_texts(soup, 'classification-ipcr')

    if cpc == 1:
        row['classification_cpc'] = _joined_unique_texts(soup, 'classification-cpc')

    row['abstract_lang_en_exist'] = _has_english_section(soup, 'abstract')
    row['description_lang_en_exist'] = _has_english_section(soup, 'description')
    row['claims_lang_en_exist'] = _has_english_section(soup, 'claims')

    rows.append(row)

# Default RangeIndex 0..n-1 matches the row labels used before.
df_class = pd.DataFrame(rows)

In [5]:
# Preview the first ten parsed documents.
df_class.head(n=10)

Unnamed: 0,xml_file_name,ucid,date,classification_ipcr,abstract_lang_en_exist,description_lang_en_exist,claims_lang_en_exist
0,EP-2677851-A1.xml,EP-2677851-A1,20140101,A01B 79/02 20060101AFI20120911BHEP ...,1.0,1.0,1.0
1,EP-2677852-A1.xml,EP-2677852-A1,20140101,A01D 43/063 20060101AFI20170131BHEP ...,1.0,1.0,1.0
2,EP-2677853-A1.xml,EP-2677853-A1,20140101,A01D 46/28 20060101AFI20120913BHEP ...,1.0,1.0,1.0
3,EP-2677854-A1.xml,EP-2677854-A1,20140101,E04D 11/00 20060101ALI20120913BHEP ...,1.0,1.0,1.0
4,EP-2677856-A1.xml,EP-2677856-A1,20140101,A01G 17/00 20060101AFI20120914BHEP ...,1.0,0.0,0.0
5,EP-2677857-A1.xml,EP-2677857-A1,20140101,A01G 25/16 20060101AFI20140925BHEP ...,1.0,0.0,0.0
6,EP-2677860-A1.xml,EP-2677860-A1,20140101,E03C 1/02 20060101ALI20120910BHEP ...,1.0,1.0,1.0
7,EP-2677862-A1.xml,EP-2677862-A1,20140101,A01M 25/00 20060101AFI20120911BHEP ...,1.0,0.0,0.0
8,EP-2677863-A1.xml,EP-2677863-A1,20140101,C12N 5/071 20100101ALI20120914BHEP ...,1.0,1.0,1.0
9,EP-2677864-A1.xml,EP-2677864-A1,20140101,A01N 25/34 20060101ALI20160118BHEP ...,1.0,1.0,1.0


In [6]:
# Report the seconds elapsed since start_time was recorded in the parsing cell.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 7.6437201499938965 seconds ---


In [7]:
# Save the table inside the vertical folder. os.path.join inserts the path
# separator when vertical_origin_path has no trailing slash; plain string
# concatenation would then write a mis-named file next to the folder instead.
# A semicolon separates the fields because the label strings contain commas.
df_class.to_csv(os.path.join(vertical_origin_path, csv_file_name + ".csv"), sep=';')