In [4]:
import os
import pandas as pd
from os import listdir
from os.path import isfile, join
import json
import collections
import re
from tqdm import tqdm

In [13]:
# Function returning T if years are valid and present in dataset
def checkYears(year, yearsNeeded, pathData):
    ret = False
    folder_names = [folder for folder in os.listdir(pathData) if os.path.isdir(os.path.join(pathData, folder))]
    folder_years = [int(name) for name in folder_names] 
    required_years = set(range(year - yearsNeeded, year))
    missing_years = required_years - set(folder_years)

    # Check for missing 2004 and 2005
    if {2004}.issubset(required_years | {year}) or {2005}.issubset(required_years | {year}):
        print("Missing IPC")
    if {2017}.issubset(required_years | {year}) or {2018}.issubset(required_years | {year}):
        print("Incomplete data")

    # General check for missing years
    if not missing_years:
        print("All years present")
    else:
        print(f"Missing years: {missing_years}")
    if not missing_years and not({2004}.issubset(required_years | {year}) or {2005}.issubset(required_years | {year})) and not({2017}.issubset(required_years | {year}) or {2018}.issubset(required_years | {year})):
        ret = True                                                                                                                      
    return ret

In [16]:
# Function taking year, IPC class, path to data, and path to output and writing CSV file toEval for current year and IPC, and also writing secondary IPC in /text/_.txt
# Needs a directory /test in output path
def json2toEval(year, ipc, pathData, pathOutput):
    pathYear = pathData+ f"/{year}/"                                # Updates with variable year
    jsonNamesYear = [f for f in listdir(pathYear) if isfile(join(pathYear, f))] 

    # Initialization of list for IPC class
    decision_ipc  = collections.defaultdict(int) # Initialization of dict for IPC class - initialized at 0, used to count occurences   
    patent_ipc_list = [] # initialization of list for IPC class 

    # Creates list of patents from this IPC class
    for i in tqdm(range(len(jsonNamesYear))):
        patent_path = pathYear + jsonNamesYear[i]
        with open(patent_path) as f:
            d = json.load(f) # load json in d
            f.close()   # close f - not needed with "with ___ as" syntax
        class_mainIPC = d['main_ipcr_label']
        if re.match(f'^{ipc}', class_mainIPC):
            patent_ipc_list.append(jsonNamesYear[i])
            decision_ipc[d['decision']] += 1

    # Create list exluding all other than accepted and rejected
    final_patents = []
    for i in tqdm(range(len(patent_ipc_list))):
        patent_path = pathYear + patent_ipc_list[i]
        with open(patent_path) as f:
            d = json.load(f)
            f.close()
        if d['decision'] == 'ACCEPTED' or d['decision'] == 'REJECTED': # exclude all other
            final_patents.append(patent_ipc_list[i])

    # Load needed data for patents
    non_main_ipc = []
    labels, patent_number, titles, backgrounds, claims, summary, abstract, main_ipc, sec_ipc = [], [], [], [], [], [], [], [], []

    for i in tqdm(range(len(final_patents))):
        patent_path = pathYear + final_patents[i]
        with open(patent_path) as f:
            d = json.load(f)
            f.close() # ligne inutile
        
        #Creating the lists for the other information
        patent_number.append(d['application_number'])
        titles.append(d['title'])
        backgrounds.append(d['background'])
        claims.append(d['claims'])
        summary.append(d['summary'])
        abstract.append(d['abstract'])
        main_ipc.append(d['main_ipcr_label'])
        sec_ipc.append(d['ipcr_labels'])

        #Collecting non main ipc class -useful to create good expectation class
        non_main =  d['ipcr_labels']
        for ipcr in non_main:
            non_main_ipc.append(ipcr) # only 4 first characters to be sure of being at same level
        #Getting labels based on decision
        label = 0
        if d['decision'] == 'ACCEPTED':
            label = 1
        labels.append(label)

    # Keep only secondary class that are not the main ipc class
    expectations_classes = list(set(non_main_ipc)) # unique secondary ipc classes
    good_expectations_classes = []
    for ipcr in expectations_classes:
        if ipcr[0:4] != f"{ipc}":
            good_expectations_classes.append(ipcr)

    # df to csv
    df = pd.DataFrame({'application_number': patent_number, 'title': titles, 'abstract':abstract,
                        'claims':claims, 'background': backgrounds, 'summary':summary, 'ipc':ipc, 'sec_ipc': sec_ipc, 'label': labels})

    df.to_csv(pathOutput + f'/toEval/{year}_{ipc}_patents_toEval.csv', index=False)
    print("CSV done")
    # Save IPC in text format
    with open(pathOutput + f'/ES/text/{year}_{ipc}_expectation_IPC_class.txt', 'w') as fp:
        for item in good_expectations_classes:
            # write each item on a new line
            fp.write("%s\n" % item)
        print('.txt Done')





In [19]:
pathData = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/data"
pathOutput = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/exemple données"
json2toEval(2011, "H01L", pathData, pathOutput)

100%|██████████| 1000/1000 [00:00<00:00, 2631.70it/s]
100%|██████████| 121/121 [00:00<00:00, 2559.18it/s]
100%|██████████| 117/117 [00:00<00:00, 3087.25it/s]


CSV done
.txt Done
