In [5]:
import os
import pandas as pd

From the website https://www.cooperativepatentclassification.org/cpcSchemeAndDefinitions/bulk, I downloaded the CPC title list.

The list comes as one text file per CPC section (A–H and Y).

In [6]:
# Directory holding the downloaded CPC title-list text files.
f_path = "../input/CPC_Title_list/"

# Flatten the file names from every directory level under f_path into one list.
f = [name for _, _, names in os.walk(f_path) for name in names]

print(f)

['cpc-section-A_20220501.txt', 'cpc-section-B_20220501.txt', 'cpc-section-C_20220501.txt', 'cpc-section-D_20220501.txt', 'cpc-section-E_20220501.txt', 'cpc-section-F_20220501.txt', 'cpc-section-G_20220501.txt', 'cpc-section-H_20220501.txt', 'cpc-section-Y_20220501.txt']


In [9]:
# Load the training data (with fold assignments) and preview the first rows.
train_folds_path = '../input/train_folds.csv'
df = pd.read_csv(train_folds_path)
df.head()

Unnamed: 0,id,anchor,target,context,score,kfold
0,a5d73ee843b04779,triethylammonium salt,desired product,C07,0.25,0
1,018aaf30c323687f,gate insulator film,film oxide insulating layer,H01,0.5,0
2,0fe44dd61e389881,retinal tissue,patient,A61,0.0,0
3,80fbb8814987103f,base fuels,fuel generally,C10,0.5,0
4,f3a23414e228563d,electromagnetic input,received frequency,G01,0.5,0


In [12]:
# Distinct values of the 'context' column, in order of first appearance.
context_list = pd.unique(df['context'])
n_contexts = len(context_list)
print(n_contexts)

106


In [16]:
# One row per distinct context code, held in a single 'cpc' column.
context_df = pd.DataFrame({'cpc': context_list})
context_df.head()

Unnamed: 0,cpc
0,C07
1,H01
2,A61
3,C10
4,G01


In [32]:
# Split each code by position: first character -> section_code,
# the following two characters -> class_code.
cpc_codes = context_df['cpc'].str
context_df['section_code'] = cpc_codes.slice(stop=1)
context_df['class_code'] = cpc_codes.slice(start=1, stop=3)

context_df.head()

Unnamed: 0,cpc,section,class,section_code,class_code
0,C07,C,7,C,7
1,H01,H,1,H,1
2,A61,A,61,A,61
3,C10,C,10,C,10
4,G01,G,1,G,1


In [23]:
# Distinct section letters among the context codes, in order of first appearance.
section_code_list = pd.unique(context_df['section_code'])
section_code_list

array(['C', 'H', 'A', 'G', 'F', 'B', 'E', 'D'], dtype=object)

In [40]:
# Build a lookup from section letter to the description listed for it
# in that section's title file.
d_section = {}

for section_letter in section_code_list:
    # Each section has its own tab-separated, header-less title file.
    title_file = os.path.join(f_path, f'cpc-section-{section_letter}_20220501.txt')
    titles = pd.read_csv(title_file, sep='\t', header=None,
                         names=['cpc', 'subgroup', 'desc'])

    # Take the first row whose code equals the bare section letter.
    section_rows = titles.loc[titles['cpc'] == section_letter, 'desc']
    d_section[section_letter] = section_rows.values[0]
print(d_section)

{'C': 'CHEMISTRY; METALLURGY', 'H': 'ELECTRICITY', 'A': 'HUMAN NECESSITIES', 'G': 'PHYSICS', 'F': 'MECHANICAL ENGINEERING; LIGHTING; HEATING; WEAPONS; BLASTING', 'B': 'PERFORMING OPERATIONS; TRANSPORTING', 'E': 'FIXED CONSTRUCTIONS', 'D': 'TEXTILES; PAPER'}


In [44]:
# Create a dictionary of group names
# Maps every context code in context_df (e.g. 'C07') to the description listed
# for that code in the title file of its section.
d_group = {}

for cd in section_code_list:
    # Each section has its own tab-separated, header-less title file.
    cpc_df = pd.read_csv(os.path.join(f_path, f'cpc-section-{cd}_20220501.txt'), sep='\t', header=None,
                         names=['cpc', 'subgroup', 'desc'])

    # get list of relevant CPCs for this section
    # Filter on 'section_code', the column this notebook creates; no cell creates
    # a 'section' column, so filtering on it raises KeyError on a fresh kernel.
    in_section = context_df['section_code'] == cd
    section_cpc_list = context_df.loc[in_section, 'cpc'].unique()

    # get definition for each one
    for gp in section_cpc_list:
        # First row whose code matches exactly; raises IndexError if the code is absent.
        group_text = cpc_df.loc[cpc_df['cpc'] == gp, 'desc'].values[0]
        d_group[gp] = group_text
print(d_group)


{'C07': 'ORGANIC CHEMISTRY', 'C10': 'PETROLEUM, GAS OR COKE INDUSTRIES; TECHNICAL GASES CONTAINING CARBON MONOXIDE; FUELS; LUBRICANTS; PEAT', 'C03': 'GLASS; MINERAL OR SLAG WOOL', 'C23': 'COATING METALLIC MATERIAL; COATING MATERIAL WITH METALLIC MATERIAL; CHEMICAL SURFACE TREATMENT; DIFFUSION TREATMENT OF METALLIC MATERIAL; COATING BY VACUUM EVAPORATION, BY SPUTTERING, BY ION IMPLANTATION OR BY CHEMICAL VAPOUR DEPOSITION, IN GENERAL; INHIBITING CORROSION OF METALLIC MATERIAL OR INCRUSTATION IN GENERAL', 'C11': 'ANIMAL OR VEGETABLE OILS, FATS, FATTY SUBSTANCES OR WAXES; FATTY ACIDS THEREFROM; DETERGENTS; CANDLES', 'C01': 'INORGANIC CHEMISTRY', 'C09': 'DYES; PAINTS; POLISHES; NATURAL RESINS; ADHESIVES; COMPOSITIONS NOT OTHERWISE PROVIDED FOR; APPLICATIONS OF MATERIALS NOT OTHERWISE PROVIDED FOR', 'C12': 'BIOCHEMISTRY; BEER; SPIRITS; WINE; VINEGAR; MICROBIOLOGY; ENZYMOLOGY; MUTATION OR GENETIC ENGINEERING', 'C08': 'ORGANIC MACROMOLECULAR COMPOUNDS; THEIR PREPARATION OR CHEMICAL WORKING-UP

In [59]:
# Persist both lookup dictionaries as two-column CSV files under ../input/.

def _lookup_frame(mapping, code_col, text_col):
    """Turn a {code: text} dict into a frame with columns `code_col` and `text_col`."""
    return (
        pd.DataFrame.from_dict(mapping, orient='index')
        .reset_index()
        .rename(columns={'index': code_col, 0: text_col})
    )


section_df = _lookup_frame(d_section, 'section_code', 'section_text')
section_df.to_csv('../input/section_lookup_tb.csv', index=False)

group_df = _lookup_frame(d_group, 'group_code', 'group_text')
group_df.to_csv('../input/group_lookup_tb.csv', index=False)