In [1]:
import os
import pandas as pd
from os import listdir
from os.path import isfile, join
import json
import collections
import re
from tqdm import tqdm

In [162]:
def checkYears(year, yearsNeeded, pathData):
    """Tell whether evaluation year `year` can be processed with `yearsNeeded` reference years.

    The reference years are the `yearsNeeded` years immediately preceding `year`
    (``year - yearsNeeded`` up to ``year - 1``). Each one must exist as a
    sub-folder of `pathData` whose name is the year. Diagnostics are printed for
    every problem found.

    Parameters
    ----------
    year : int
        Evaluation year.
    yearsNeeded : int
        Number of reference years required before `year`.
    pathData : str
        Folder containing one sub-folder per year.

    Returns
    -------
    bool
        True only when no reference-year folder is missing and neither the
        reference years nor `year` itself include 2004, 2005, 2017 or 2018.
    """
    # Collect the years available on disk. Sub-folders whose name is not an
    # integer (e.g. ".ipynb_checkpoints") are ignored instead of crashing int().
    folder_years = set()
    for name in os.listdir(pathData):
        if not os.path.isdir(os.path.join(pathData, name)):
            continue
        try:
            folder_years.add(int(name))
        except ValueError:
            continue

    required_years = set(range(year - yearsNeeded, year))
    missing_years = required_years - folder_years

    # The evaluation year itself is also screened against the problematic years.
    years_used = required_years | {year}
    lacks_ipc = bool(years_used & {2004, 2005})
    incomplete = bool(years_used & {2017, 2018})

    if lacks_ipc:
        print(f"For eval year {year}: Missing IPC - 2004 and/or 2005")
    if incomplete:
        print(f"For eval year {year}: Incomplete data - 2017 and/or 2018")

    # General check for missing years
    if not missing_years:
        print(f"All {yearsNeeded} reference years for eval year {year} present")
    else:
        print(f"Missing years: {missing_years}")

    return not missing_years and not lacks_ipc and not incomplete

In [190]:
def json2toEval(year, ipc, pathData, pathOutput):
    """Build the "toEval" set for one evaluation year and one IPC class.

    Reads every JSON file in ``pathData/<year>/`` and keeps the patents whose
    ``main_ipcr_label`` starts with `ipc` and whose ``decision`` is ``ACCEPTED``
    or ``REJECTED``. Two files are written (their folders are created if needed):

    * ``pathOutput/toEval/<year>_<ipc>_patents_toEval.csv`` - one row per kept
      patent; ``label`` is 1 for ACCEPTED and 0 otherwise, ``ipc`` is the `ipc`
      argument, ``sec_ipc`` is the patent's ``ipcr_labels`` value.
    * ``pathOutput/ES/text/<year>_<ipc>_expectation_IPC_class.txt`` - the
      distinct entries of ``ipcr_labels`` across kept patents that do not start
      with `ipc`, one per line, sorted so the file is identical across runs.

    Parameters
    ----------
    year : int
        Evaluation year (name of the sub-folder of `pathData`).
    ipc : str
        IPC class prefix, compared literally (e.g. ``"H01L"``).
    pathData : str
        Folder containing one sub-folder of JSON files per year.
    pathOutput : str
        Root folder for the generated files.

    Returns
    -------
    None
    """
    print(f"Create toEval, iterate through all patents of current year {year}")

    pathYear = pathData + f"/{year}/"
    jsonNamesYear = [f for f in listdir(pathYear) if isfile(join(pathYear, f))]

    columns = ['application_number', 'title', 'abstract', 'claims', 'background',
               'summary', 'ipc', 'sec_ipc', 'label']
    rows = []
    secondary_classes = set()

    # Single pass: each file is parsed once, filtered, and converted to a row.
    for file_name in tqdm(jsonNamesYear):
        with open(pathYear + file_name, encoding="utf-8") as f:
            d = json.load(f)

        # Keep only patents of the requested main IPC class ...
        if not d['main_ipcr_label'].startswith(ipc):
            continue
        # ... that already have a final decision.
        if d['decision'] not in ('ACCEPTED', 'REJECTED'):
            continue

        rows.append({
            'application_number': d['application_number'],
            'title': d['title'],
            'abstract': d['abstract'],
            'claims': d['claims'],
            'background': d['background'],
            'summary': d['summary'],
            'ipc': ipc,
            'sec_ipc': d['ipcr_labels'],
            'label': 1 if d['decision'] == 'ACCEPTED' else 0,
        })
        # Full labels are collected (not truncated) for the expectation classes.
        secondary_classes.update(d['ipcr_labels'])

    # Keep only secondary classes that do not belong to the main IPC class.
    good_expectations_classes = sorted(
        ipcr for ipcr in secondary_classes if not ipcr.startswith(ipc)
    )

    # Passing `columns` keeps the header even when no patent was kept.
    df = pd.DataFrame(rows, columns=columns)

    toEval_dir = pathOutput + '/toEval'
    text_dir = pathOutput + '/ES/text'
    os.makedirs(toEval_dir, exist_ok=True)
    os.makedirs(text_dir, exist_ok=True)

    df.to_csv(toEval_dir + f'/{year}_{ipc}_patents_toEval.csv', index=False)
    print(f"toEval/toEval/{year}_{ipc} done")
    print("toEval shape: ", df.shape)

    # Save IPC in text format, one class per line
    with open(text_dir + f'/{year}_{ipc}_expectation_IPC_class.txt', 'w') as fp:
        for item in good_expectations_classes:
            fp.write("%s\n" % item)
        print(f'text/{year}_{ipc} Done')
    print("Nb secondary IPC (text size): ", len(good_expectations_classes))





In [None]:
# Example run: build the toEval CSV and the secondary-IPC text file for the
# H01L patents of 2011.
# NOTE(review): both paths are absolute and machine-specific - adjust them
# before running this cell on another computer.
pathData = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/data"
pathOutput = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/exemple données"
json2toEval(2011, "H01L", pathData, pathOutput)

In [199]:
def json2_KS_ES(year, yearRef, ipc, pathData, pathOutput):
    """Build the KS and ES data frames of one reference year for one evaluation year.

    Reads every JSON file in ``pathData/<yearRef>/`` once and sorts the patents
    into two groups based on ``main_ipcr_label``:

    * KS - the label starts with `ipc`;
    * ES - the full label is listed in
      ``pathOutput/ES/text/<year>_<ipc>_expectation_IPC_class.txt`` (the file
      written by ``json2toEval`` for the evaluation year).

    In both groups a patent is kept only if ``date_published`` is earlier than
    January 1st of `year`, or if its ``decision`` is ``ACCEPTED`` or
    ``REJECTED``.

    Parameters
    ----------
    year : int
        Evaluation year.
    yearRef : int
        Reference year whose folder is scanned.
    ipc : str
        Main IPC class prefix (e.g. ``"G06F"``).
    pathData : str
        Folder containing one sub-folder of JSON files per year.
    pathOutput : str
        Root folder holding ``ES/text/`` with the expectation-class file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_KS, df_ES)``. Both have the columns application_number, title,
        abstract, claims, background, summary, ipc, sec_ipc, label, yearRef.
        Here ``label`` holds the raw ``decision`` string and ``ipc`` the
        patent's own ``main_ipcr_label``.
    """
    pathYear = pathData + f"/{yearRef}/"
    jsonNamesYear = [f for f in listdir(pathYear) if isfile(join(pathYear, f))]

    # Secondary IPC classes of the evaluation year. rstrip('\n') only removes the
    # line terminator, so a last line without trailing newline is read intact.
    # A set gives constant-time membership tests in the loop below.
    with open(pathOutput + f'/ES/text/{year}_{ipc}_expectation_IPC_class.txt', 'r') as fp:
        expect_classes_ipc_yearRef = {line.rstrip('\n') for line in fp}

    current_date = int(f"{year}" + "0101")
    columns = ['application_number', 'title', 'abstract', 'claims', 'background',
               'summary', 'ipc', 'sec_ipc', 'label', 'yearRef']
    KS_rows, ES_rows = [], []

    print(f"Create KS, iterate through patents of IPC {ipc}, of reference year {yearRef} for evalYear {year}")
    print(f"Create ES, iterate through patents of secondary IPC {ipc}, of reference year {yearRef} for evalYear {year}")
    for file_name in tqdm(jsonNamesYear):
        with open(pathYear + file_name, encoding="utf-8") as f:
            d = json.load(f)

        class_mainIPC = d['main_ipcr_label']
        in_KS = class_mainIPC.startswith(ipc)
        in_ES = class_mainIPC in expect_classes_ipc_yearRef
        if not (in_KS or in_ES):
            continue
        if not _is_usable_reference(d, current_date):
            continue

        row = _reference_row(d, yearRef)
        if in_KS:
            KS_rows.append(row)
        if in_ES:
            ES_rows.append(row)

    # Passing `columns` keeps all ten columns even when a group is empty.
    df_KS = pd.DataFrame(KS_rows, columns=columns)
    df_ES = pd.DataFrame(ES_rows, columns=columns)

    return (df_KS, df_ES)


def _is_usable_reference(d, current_date):
    """Return True if patent `d` was published before `current_date` (YYYYMMDD int) or has a final decision."""
    if int(d['date_published']) < current_date:
        return True
    return d['decision'] in ('ACCEPTED', 'REJECTED')


def _reference_row(d, yearRef):
    """Convert one parsed patent JSON `d` into a KS/ES row tagged with its reference year."""
    return {
        'application_number': d['application_number'],
        'title': d['title'],
        'abstract': d['abstract'],
        'claims': d['claims'],
        'background': d['background'],
        'summary': d['summary'],
        'ipc': d['main_ipcr_label'],
        'sec_ipc': d['ipcr_labels'],
        'label': d['decision'],
        'yearRef': yearRef,
    }

In [None]:
# Scratch cell: inspect the set of reference years that loop_KS_ES derives from
# `year` and `yearsNeeded`, and the two-digit year suffixes taken from it.
year =2011
yearsNeeded = 5
required_years = set(range(year - yearsNeeded, year))
required_years

# NOTE(review): a set has no guaranteed iteration order, so the loop and the
# list(...) indexing below are not guaranteed to see the years in ascending order.
for i in required_years:
    print(i)

# Characters 2-3 of the year string, i.e. the two-digit year; only the last
# expression of the cell is displayed.
str(list(required_years)[0])[2:4]
str(list(required_years)[-2])[2:4]


In [200]:
def loop_KS_ES(year, yearsNeeded, ipc, pathData, pathOutput):
    """Build and save the KS and ES tables of one evaluation year over all its reference years.

    Calls ``json2_KS_ES`` once per reference year (``year - yearsNeeded`` up to
    ``year - 1``, in ascending order), stacks the returned frames and writes

    * ``pathOutput/KS/<year>_<yy><yy>_<ipc>_KS_raw.csv``
    * ``pathOutput/ES/<year>_<yy><yy>_<ipc>_ES_raw.csv``

    where ``<yy><yy>`` are the two-digit first and last reference years. The
    ``KS`` and ``ES`` folders are created if they do not exist.

    Parameters
    ----------
    year : int
        Evaluation year.
    yearsNeeded : int
        Number of reference years before `year`; must be at least 1.
    ipc : str
        Main IPC class.
    pathData : str
        Folder containing one sub-folder of JSON files per year.
    pathOutput : str
        Root folder for the generated files.

    Raises
    ------
    ValueError
        If `yearsNeeded` is smaller than 1 (no reference year to process).
    """
    # An ordered list (not a set): iteration order and the first/last year used
    # in the file names must be chronological.
    required_years = list(range(year - yearsNeeded, year))
    if not required_years:
        raise ValueError("yearsNeeded must be at least 1")

    KS_parts, ES_parts = [], []
    for yearRef in required_years:
        df_KS_temp, df_ES_temp = json2_KS_ES(year, yearRef, ipc, pathData, pathOutput)
        KS_parts.append(df_KS_temp)
        ES_parts.append(df_ES_temp)
        print("ES and KS done for: " + f"{year}_{yearRef}_{ipc}")

    # Concatenate once instead of growing a frame inside the loop.
    df_KS = pd.concat(KS_parts, axis=0, ignore_index=True)
    df_ES = pd.concat(ES_parts, axis=0, ignore_index=True)

    # Two-digit first and last reference years, e.g. "0709" for 2007-2009.
    span = f"{str(required_years[0])[2:4]}{str(required_years[-1])[2:4]}"

    os.makedirs(pathOutput + '/KS', exist_ok=True)
    os.makedirs(pathOutput + '/ES', exist_ok=True)
    df_KS.to_csv(pathOutput + f'/KS/{year}_{span}_{ipc}_KS_raw.csv', index=False)
    df_ES.to_csv(pathOutput + f'/ES/{year}_{span}_{ipc}_ES_raw.csv', index=False)
    print("ES and KS done for: " + f"{year}_{ipc}")
    print("df_KS shape: ", df_KS.shape)
    print("df_ES shape: ", df_ES.shape)

In [None]:
# Manual run for one evaluation year / IPC class: first the toEval files, then
# the KS and ES tables built from the reference years.
# NOTE(review): `year`, `yearsNeeded` and `ipc` are assigned here but the two
# calls below pass their own literals (2016, 3, "G06F") - confirm which values
# are intended.
year =2011
yearsNeeded = 5
# NOTE(review): absolute, machine-specific paths.
pathData = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/data"
pathOutput = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/exemple données"
ipc="G06F"
json2toEval(2016, "G06F", pathData, pathOutput)
loop_KS_ES(2016, 3, "G06F", pathData, pathOutput)

Il reste maintenant à définir les listes de classes IPC et d'années à évaluer, puis à boucler sur chacune de leurs combinaisons.

In [194]:
def loopFinal(listIPC, listYearsEval, nbYearsRef, pathData, pathOutput):
    """Run the full extraction for every IPC class and every evaluation year.

    Each evaluation year is first validated with ``checkYears``; validation
    stops at the first year that fails and, in that case, nothing is produced.
    Otherwise ``json2toEval`` and then ``loop_KS_ES`` are called for every
    (IPC class, evaluation year) pair, IPC classes in the outer loop.

    Parameters
    ----------
    listIPC : iterable of str
        IPC classes to process.
    listYearsEval : iterable of int
        Evaluation years.
    nbYearsRef : int
        Number of reference years used for each evaluation year.
    pathData, pathOutput : str
        Input and output root folders, forwarded unchanged.

    Returns
    -------
    None
    """
    # all() short-circuits, so years after the first invalid one are not checked.
    every_year_valid = all(
        checkYears(evalYear, nbYearsRef, pathData) for evalYear in listYearsEval
    )
    if not every_year_valid:
        return

    for ipcClass in tqdm(listIPC):
        for evalYear in listYearsEval:
            json2toEval(evalYear, ipcClass, pathData, pathOutput)
            loop_KS_ES(evalYear, nbYearsRef, ipcClass, pathData, pathOutput)

In [203]:
# Run configuration for the full extraction; the trailing comments keep the
# larger settings for reference.
# NOTE(review): pathData and pathOutput are not defined in this cell - they must
# already exist in the kernel from an earlier cell.
listIPC =  ["G06F", "A61B", "H01L"] #["G06F", "G01N", "A61B", "B60L", "E21B", "F03D", "H01L", "H04W"]
listYearsEval = range(2014, 2017) #range(2011, 2017)
nbYearsRef = 3 #5

# Pass nbYearsRef itself so that editing the setting above actually takes effect.
loopFinal(listIPC, listYearsEval, nbYearsRef, pathData, pathOutput)

All 3 reference years for eval year 2014 present
All 3 reference years for eval year 2015 present
All 3 reference years for eval year 2016 present


  0%|          | 0/3 [00:00<?, ?it/s]

Create toEval, iterate through all patents of current year 2014


100%|██████████| 1000/1000 [00:00<00:00, 3113.91it/s]
100%|██████████| 83/83 [00:00<00:00, 3467.54it/s]
100%|██████████| 68/68 [00:00<00:00, 3247.88it/s]


toEval/toEval/2014_G06F done
toEval shape:  (68, 9)
text/2014_G06F Done
Nb secondary IPC (text size):  20
Create KS, iterate through patents of IPC G06F, of reference year 2011 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 3089.41it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2011 for evalYear 2014


100%|██████████| 121/121 [00:00<00:00, 2331.81it/s]
100%|██████████| 13/13 [00:00<00:00, 2160.81it/s]


ES and KS done for: 2014_2011_G06F
Create KS, iterate through patents of IPC G06F, of reference year 2012 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 3593.80it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2012 for evalYear 2014


100%|██████████| 132/132 [00:00<00:00, 3308.64it/s]
100%|██████████| 11/11 [00:00<00:00, 1828.38it/s]


ES and KS done for: 2014_2012_G06F
Create KS, iterate through patents of IPC G06F, of reference year 2013 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 3769.00it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2013 for evalYear 2014


100%|██████████| 113/113 [00:00<00:00, 1416.30it/s]
100%|██████████| 28/28 [00:00<00:00, 2340.57it/s]


ES and KS done for: 2014_2013_G06F
ES and KS done for: 2014_G06F
df_KS shape:  (358, 9)
df_ES shape:  (49, 10)
Create toEval, iterate through all patents of current year 2015


100%|██████████| 1000/1000 [00:00<00:00, 3433.44it/s]
100%|██████████| 84/84 [00:00<00:00, 3661.74it/s]
100%|██████████| 54/54 [00:00<00:00, 3184.92it/s]


toEval/toEval/2015_G06F done
toEval shape:  (54, 9)
text/2015_G06F Done
Nb secondary IPC (text size):  22
Create KS, iterate through patents of IPC G06F, of reference year 2012 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 3234.10it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2012 for evalYear 2015


100%|██████████| 132/132 [00:00<00:00, 3777.96it/s]
100%|██████████| 8/8 [00:00<00:00, 2004.81it/s]


ES and KS done for: 2015_2012_G06F
Create KS, iterate through patents of IPC G06F, of reference year 2013 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 3433.83it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2013 for evalYear 2015


100%|██████████| 113/113 [00:00<00:00, 3062.29it/s]
100%|██████████| 27/27 [00:00<00:00, 2082.57it/s]


ES and KS done for: 2015_2013_G06F
Create KS, iterate through patents of IPC G06F, of reference year 2014 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 2664.75it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2014 for evalYear 2015


100%|██████████| 83/83 [00:00<00:00, 2972.25it/s]
100%|██████████| 32/32 [00:00<00:00, 2468.24it/s]


ES and KS done for: 2015_2014_G06F
ES and KS done for: 2015_G06F
df_KS shape:  (314, 9)
df_ES shape:  (60, 10)
Create toEval, iterate through all patents of current year 2016


100%|██████████| 1000/1000 [00:00<00:00, 3398.60it/s]
100%|██████████| 92/92 [00:00<00:00, 3416.59it/s]
100%|██████████| 19/19 [00:00<00:00, 3176.62it/s]


toEval/toEval/2016_G06F done
toEval shape:  (19, 9)
text/2016_G06F Done
Nb secondary IPC (text size):  9
Create KS, iterate through patents of IPC G06F, of reference year 2013 for evalYear 2016


100%|██████████| 1000/1000 [00:00<00:00, 3831.74it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2013 for evalYear 2016


100%|██████████| 113/113 [00:00<00:00, 3776.82it/s]
100%|██████████| 2/2 [00:00<00:00, 2003.97it/s]


ES and KS done for: 2016_2013_G06F
Create KS, iterate through patents of IPC G06F, of reference year 2014 for evalYear 2016


100%|██████████| 1000/1000 [00:00<00:00, 3355.72it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2014 for evalYear 2016


100%|██████████| 83/83 [00:00<00:00, 2684.63it/s]
100%|██████████| 3/3 [00:00<00:00, 1503.51it/s]


ES and KS done for: 2016_2014_G06F
Create KS, iterate through patents of IPC G06F, of reference year 2015 for evalYear 2016


100%|██████████| 1000/1000 [00:00<00:00, 2497.21it/s]


Create ES, iterate through patents of secondary IPC G06F, of reference year 2015 for evalYear 2016


100%|██████████| 84/84 [00:00<00:00, 2105.64it/s]
100%|██████████| 2/2 [00:00<00:00, 668.26it/s]


ES and KS done for: 2016_2015_G06F
ES and KS done for: 2016_G06F

 33%|███▎      | 1/3 [00:05<00:11,  5.95s/it]


df_KS shape:  (254, 9)
df_ES shape:  (7, 10)
Create toEval, iterate through all patents of current year 2014


100%|██████████| 1000/1000 [00:00<00:00, 3213.72it/s]
100%|██████████| 27/27 [00:00<00:00, 3008.35it/s]
100%|██████████| 19/19 [00:00<00:00, 2381.99it/s]


toEval/toEval/2014_A61B done
toEval shape:  (19, 9)
text/2014_A61B Done
Nb secondary IPC (text size):  6
Create KS, iterate through patents of IPC A61B, of reference year 2011 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 3539.64it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2011 for evalYear 2014


100%|██████████| 11/11 [00:00<00:00, 2738.45it/s]
100%|██████████| 2/2 [00:00<00:00, 668.26it/s]


ES and KS done for: 2014_2011_A61B
Create KS, iterate through patents of IPC A61B, of reference year 2012 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 2662.94it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2012 for evalYear 2014


100%|██████████| 22/22 [00:00<00:00, 2451.96it/s]
0it [00:00, ?it/s]


ES and KS done for: 2014_2012_A61B
Create KS, iterate through patents of IPC A61B, of reference year 2013 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 3057.32it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2013 for evalYear 2014


100%|██████████| 35/35 [00:00<00:00, 2064.24it/s]
100%|██████████| 2/2 [00:00<00:00, 1002.58it/s]


ES and KS done for: 2014_2013_A61B
ES and KS done for: 2014_A61B
df_KS shape:  (63, 9)
df_ES shape:  (4, 10)
Create toEval, iterate through all patents of current year 2015


100%|██████████| 1000/1000 [00:00<00:00, 3047.65it/s]
100%|██████████| 26/26 [00:00<00:00, 2166.74it/s]
100%|██████████| 16/16 [00:00<00:00, 2281.30it/s]


toEval/toEval/2015_A61B done
toEval shape:  (16, 9)
text/2015_A61B Done
Nb secondary IPC (text size):  8
Create KS, iterate through patents of IPC A61B, of reference year 2012 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 3871.32it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2012 for evalYear 2015


100%|██████████| 22/22 [00:00<00:00, 2198.48it/s]
0it [00:00, ?it/s]


ES and KS done for: 2015_2012_A61B
Create KS, iterate through patents of IPC A61B, of reference year 2013 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 2777.49it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2013 for evalYear 2015


100%|██████████| 35/35 [00:00<00:00, 2925.25it/s]
100%|██████████| 3/3 [00:00<00:00, 1002.38it/s]


ES and KS done for: 2015_2013_A61B
Create KS, iterate through patents of IPC A61B, of reference year 2014 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 3643.07it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2014 for evalYear 2015


100%|██████████| 27/27 [00:00<00:00, 3386.45it/s]
100%|██████████| 1/1 [00:00<00:00, 1002.94it/s]


ES and KS done for: 2015_2014_A61B
ES and KS done for: 2015_A61B
df_KS shape:  (80, 9)
df_ES shape:  (4, 10)
Create toEval, iterate through all patents of current year 2016


100%|██████████| 1000/1000 [00:00<00:00, 3436.61it/s]
100%|██████████| 30/30 [00:00<00:00, 2726.52it/s]
100%|██████████| 3/3 [00:00<00:00, 1481.04it/s]


toEval/toEval/2016_A61B done
toEval shape:  (3, 9)
text/2016_A61B Done
Nb secondary IPC (text size):  6
Create KS, iterate through patents of IPC A61B, of reference year 2013 for evalYear 2016


100%|██████████| 1000/1000 [00:00<00:00, 3469.46it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2013 for evalYear 2016


100%|██████████| 35/35 [00:00<00:00, 2193.35it/s]
100%|██████████| 10/10 [00:00<00:00, 2005.60it/s]


ES and KS done for: 2016_2013_A61B
Create KS, iterate through patents of IPC A61B, of reference year 2014 for evalYear 2016


100%|██████████| 1000/1000 [00:00<00:00, 3287.62it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2014 for evalYear 2016


100%|██████████| 27/27 [00:00<00:00, 3007.95it/s]
100%|██████████| 8/8 [00:00<00:00, 1604.40it/s]


ES and KS done for: 2016_2014_A61B
Create KS, iterate through patents of IPC A61B, of reference year 2015 for evalYear 2016


100%|██████████| 1000/1000 [00:00<00:00, 3583.43it/s]


Create ES, iterate through patents of secondary IPC A61B, of reference year 2015 for evalYear 2016


100%|██████████| 26/26 [00:00<00:00, 3259.17it/s]
100%|██████████| 7/7 [00:00<00:00, 1403.51it/s]
 67%|██████▋   | 2/3 [00:11<00:05,  5.43s/it]

ES and KS done for: 2016_2015_A61B
ES and KS done for: 2016_A61B
df_KS shape:  (80, 9)
df_ES shape:  (20, 10)
Create toEval, iterate through all patents of current year 2014


100%|██████████| 1000/1000 [00:00<00:00, 3234.09it/s]
100%|██████████| 45/45 [00:00<00:00, 2820.40it/s]
100%|██████████| 44/44 [00:00<00:00, 2751.84it/s]


toEval/toEval/2014_H01L done
toEval shape:  (44, 9)
text/2014_H01L Done
Nb secondary IPC (text size):  9
Create KS, iterate through patents of IPC H01L, of reference year 2011 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 2284.23it/s]


Create ES, iterate through patents of secondary IPC H01L, of reference year 2011 for evalYear 2014


100%|██████████| 73/73 [00:00<00:00, 3177.73it/s]
0it [00:00, ?it/s]


ES and KS done for: 2014_2011_H01L
Create KS, iterate through patents of IPC H01L, of reference year 2012 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 3376.06it/s]


Create ES, iterate through patents of secondary IPC H01L, of reference year 2012 for evalYear 2014


100%|██████████| 57/57 [00:00<00:00, 3649.89it/s]
0it [00:00, ?it/s]


ES and KS done for: 2014_2012_H01L
Create KS, iterate through patents of IPC H01L, of reference year 2013 for evalYear 2014


100%|██████████| 1000/1000 [00:00<00:00, 3463.31it/s]


Create ES, iterate through patents of secondary IPC H01L, of reference year 2013 for evalYear 2014


100%|██████████| 55/55 [00:00<00:00, 3938.65it/s]
0it [00:00, ?it/s]


ES and KS done for: 2014_2013_H01L
ES and KS done for: 2014_H01L
df_KS shape:  (184, 9)
df_ES shape:  (0, 10)
Create toEval, iterate through all patents of current year 2015


100%|██████████| 1000/1000 [00:00<00:00, 3622.35it/s]
100%|██████████| 61/61 [00:00<00:00, 3630.50it/s]
100%|██████████| 52/52 [00:00<00:00, 3468.19it/s]


toEval/toEval/2015_H01L done
toEval shape:  (52, 9)
text/2015_H01L Done
Nb secondary IPC (text size):  26
Create KS, iterate through patents of IPC H01L, of reference year 2012 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 2808.36it/s]


Create ES, iterate through patents of secondary IPC H01L, of reference year 2012 for evalYear 2015


100%|██████████| 57/57 [00:00<00:00, 3175.10it/s]
100%|██████████| 7/7 [00:00<00:00, 1403.51it/s]


ES and KS done for: 2015_2012_H01L
Create KS, iterate through patents of IPC H01L, of reference year 2013 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 3114.07it/s]


Create ES, iterate through patents of secondary IPC H01L, of reference year 2013 for evalYear 2015


100%|██████████| 55/55 [00:00<00:00, 3667.46it/s]
100%|██████████| 6/6 [00:00<00:00, 1503.60it/s]


ES and KS done for: 2015_2013_H01L
Create KS, iterate through patents of IPC H01L, of reference year 2014 for evalYear 2015


100%|██████████| 1000/1000 [00:00<00:00, 3056.95it/s]


Create ES, iterate through patents of secondary IPC H01L, of reference year 2014 for evalYear 2015


100%|██████████| 45/45 [00:00<00:00, 2255.97it/s]
100%|██████████| 12/12 [00:00<00:00, 1719.27it/s]


ES and KS done for: 2015_2014_H01L
ES and KS done for: 2015_H01L
df_KS shape:  (155, 9)
df_ES shape:  (25, 10)
Create toEval, iterate through all patents of current year 2016


100%|██████████| 1000/1000 [00:00<00:00, 3162.70it/s]


Quelques tests

In [None]:
# Quick manual check: KS and ES frames for G06F, evaluation year 2016,
# reference year 2015. `test` is the (df_KS, df_ES) tuple returned by the call.
# NOTE(review): absolute, machine-specific paths.
pathData = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/data"
pathOutput = "C:/Users/edgar/OneDrive/Bureau/Ecole/HEC/A24/BrevetNLP/exemple données"
test = json2_KS_ES(2016, 2015, "G06F", pathData, pathOutput)

In [None]:
# Raise the column display width so the summary text is not truncated, then
# print the summary of application 14815898 from the KS frame (test[0]).
# The application number is compared as a string here.
pd.options.display.max_colwidth = 10000000
print(test[0][test[0]['application_number']== '14815898'].summary)



In [None]:
# Scratch: list the years 2011 to 2016 (the upper bound 2017 is excluded).
for i in range(2011, 2017):
    print(i)

In [None]:
# Scratch: display an ES output path.
# NOTE(review): loop_KS_ES names its ES files
# '<year>_<yy><yy>_<ipc>_ES_raw.csv'; this name has no IPC part, so it does not
# follow that pattern. The f-prefix has no placeholder to fill.
pathOutput + f'/ES/2016_1315_ES_raw.csv'

In [115]:
# Reload a KS table from disk; the file name follows the pattern written by
# loop_KS_ES for evaluation year 2016, reference years 2013-2015, class G06F.
df = pd.read_csv(pathOutput + f'/KS/2016_1315_G06F_KS_raw.csv', index_col=False)

In [118]:
# Set the pandas column display width to 100 characters.
pd.options.display.max_colwidth = 100
# df.head

In [None]:
# Show the summary of application 14815898 in the reloaded KS table.
# NOTE(review): the number is compared as an integer here, whereas the in-memory
# frame was queried with the string '14815898' - presumably read_csv parsed the
# column as integers; confirm the dtype.
df[df.application_number == 14815898].summary
# df[235:]
# pd.options.display.max_colwidth = 100000
# print(df[df.application_number == 14815898].summary)