In [1]:
import pandas as pd
import numpy as np
import gensim
from scipy.linalg import qr
import scipy.sparse as sp
from pandarallel import pandarallel
# Start pandarallel once for the whole notebook; the row-wise parallel_apply
# further down relies on this having been called.
N_WORKERS = 25
pandarallel.initialize(nb_workers=N_WORKERS, progress_bar=True)

INFO: Pandarallel will run on 25 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Identify the differences in CPCs between matched patent pairs

In [2]:
# Matched (ccmt_patent_id, patent_id) pairs together with their `cos` score.
matched_columns = ['ccmt_patent_id', 'patent_id', 'cos']
df_matched_expand = pd.read_parquet('data/matched_ccmt_other.parquet', columns=matched_columns)
# Keep only pairs with 0.01 <= cos <= 0.5 (Series.between includes both bounds by default).
cos_in_band = df_matched_expand['cos'].between(0.01, 0.5)
df_matched_expand = df_matched_expand[cos_in_band]
df_matched_expand.head()

Unnamed: 0,ccmt_patent_id,patent_id,cos
0,1187498,24371233,0.186756
1,1187498,25512219,0.191544
2,1187498,11782729,0.191563
3,1187498,25511142,0.195533
4,1187498,40387622,0.195809


In [None]:
## load the trained cpc2vec model (trained on the occurrence of CPC codes in patents);
## only the keyed vectors (.wv) are kept
CPC2VEC_MODEL_PATH = 'data/external/cpc_sg_ns_dim50.model'
cpc2vec = gensim.models.Word2Vec.load(CPC2VEC_MODEL_PATH).wv
# Rescale the vectors to unit length in place, so a plain dot product between
# two CPC vectors can be read as their cosine similarity.
cpc2vec.unit_normalize_all()
cpc2vec.vectors.shape

(253017, 50)

In [4]:
# Every patent id that appears on either side of a matched pair (a set, despite the name).
ccmt_side_ids = df_matched_expand['ccmt_patent_id'].tolist()
other_side_ids = df_matched_expand['patent_id'].tolist()
famidlist = set(ccmt_side_ids).union(other_side_ids)
len(famidlist)

1271058

In [None]:
# load cpc
# Read the (docdb_family_id, cpc_class_symbol) pairs and rename them to the
# column names used throughout this notebook.
cpc = pd.read_parquet(
    "../patstat/tls225.parquet",
    columns=['docdb_family_id', 'cpc_class_symbol'],
)
cpc.columns = ['patent_id', 'cpc']
# Strip the blanks inside the symbols. regex=False makes this a literal
# replacement regardless of the pandas version's default for `regex`.
cpc['cpc'] = cpc.cpc.str.replace(" ", "", regex=False)
# Keep only rows for patents that occur in a matched pair AND whose CPC symbol
# has a vector in cpc2vec.
in_matched_pairs = cpc.patent_id.isin(famidlist)
has_embedding = cpc.cpc.isin(cpc2vec.index_to_key)
# drop_duplicates() returns a new frame; calling it with inplace=True on a
# boolean-filtered frame can trigger SettingWithCopyWarning.
cpc = cpc.loc[in_matched_pairs & has_embedding].drop_duplicates()
cpc.head()

Unnamed: 0,patent_id,cpc
613644,569328,A61M1/32
613645,569328,A61M1/325
613646,569328,Y10S261/28
619771,574675,A61F7/02
619772,574675,A61F2007/0001


In [6]:
# (rows, columns) of the filtered CPC table and the number of distinct patents in it
(cpc.shape, cpc['patent_id'].nunique())

((7921721, 2), 1265550)

In [7]:
# Split the CPC table by symbol prefix:
#   y02  - rows whose symbol starts with 'Y02' or 'Y04'
#   cpc1 - rows whose symbol does not start with 'Y' at all
# NOTE: `cpc` is deleted at the end, so this cell cannot be re-run on its own.
cpc_prefix3 = cpc['cpc'].str[:3]
cpc_first_char = cpc['cpc'].str[0]
y02 = cpc.loc[cpc_prefix3.isin(['Y02', 'Y04'])].copy().reset_index(drop=True)
cpc1 = cpc.loc[cpc_first_char != 'Y'].copy().reset_index(drop=True)
del cpc

In [8]:
# Distinct (patent_id, first four characters of the CPC symbol) pairs.
cpc4d = (
    cpc1[['patent_id']]
    .assign(cpc4d=cpc1['cpc'].str[:4])
    .drop_duplicates()
)
cpc4d.head()

Unnamed: 0,patent_id,cpc4d
0,569328,A61M
2,574675,A61F
6,574675,A61L
8,1187498,C10K
9,1187498,F25J


In [10]:
from pandas.api.types import CategoricalDtype

# Ordered categorical dtypes over the sorted unique values; the integer code of
# a value is its position in the sorted categories, which is what later cells
# use as matrix row/column indices.
sorted_patent_ids = sorted(cpc1['patent_id'].unique())
sorted_cpc_symbols = sorted(cpc1['cpc'].unique())
pat_c = CategoricalDtype(sorted_patent_ids, ordered=True)
cpc_c = CategoricalDtype(sorted_cpc_symbols, ordered=True)
pat_c.categories, cpc_c.categories

(Index([  569328,   574675,  1187498,  1236482,  1236560,  1236571,  1239098,
         1239277,  1239357,  1239634,
        ...
        68699051, 68699111, 68840825, 68979654, 69143348, 69185062, 69322564,
        69323073, 69323075, 69416627],
       dtype='int64', length=1265470),
 Index(['A01B1/00', 'A01B1/02', 'A01B1/022', 'A01B1/026', 'A01B1/028',
        'A01B1/06', 'A01B1/065', 'A01B1/08', 'A01B1/10', 'A01B1/12',
        ...
        'H05K9/0084', 'H05K9/0086', 'H05K9/0088', 'H05K9/009', 'H05K9/0092',
        'H05K9/0094', 'H05K9/0096', 'H05K9/0098', 'H05K999/00', 'H05K999/99'],
       dtype='object', length=194709))

In [11]:
# Ordered categorical dtype over the sorted distinct 4-character CPC prefixes.
sorted_cpc4d = sorted(cpc4d['cpc4d'].unique().tolist())
cpc4d_c = CategoricalDtype(sorted_cpc4d, ordered=True)
cpc4d_c.categories

Index(['A01B', 'A01C', 'A01D', 'A01F', 'A01G', 'A01H', 'A01J', 'A01K', 'A01L',
       'A01M',
       ...
       'H04Q', 'H04R', 'H04S', 'H04W', 'H05B', 'H05C', 'H05F', 'H05G', 'H05H',
       'H05K'],
      dtype='object', length=653)

In [12]:
# Persist the category orderings so the integer codes can be mapped back to
# labels outside this notebook.
# NOTE: the two CPC category indexes are shown above with dtype 'object'; numpy
# stores object arrays via pickle, so reading those files back needs
# np.load(..., allow_pickle=True).
category_outputs = {
    'data/cpc4d_c.npy': cpc4d_c,
    'data/cpc_c.npy': cpc_c,
    'data/pat_c.npy': pat_c,
}
for out_path, cat_dtype in category_outputs.items():
    np.save(out_path, cat_dtype.categories)

In [13]:
# Integer positions of each (patent_id, cpc4d) pair within the sorted category
# lists: dfrow2 indexes patents, dfcol2 indexes 4-character CPC prefixes.
dfrow2 = cpc4d['patent_id'].astype(pat_c).cat.codes
dfcol2 = cpc4d['cpc4d'].astype(cpc4d_c).cat.codes
dfcol2

0           70
2           64
6           69
8          305
9          498
          ... 
6990573    548
6990575    489
6990576    551
6990577    610
6990590     61
Length: 2294532, dtype: int16

In [None]:
## create sparse matrix of pat*cpc
# Rows follow pat_c.categories, columns follow cpc4d_c.categories; one entry of
# value 1 is supplied per row of cpc4d.
n_patents = pat_c.categories.size
n_cpc4d = cpc4d_c.categories.size
unit_entries = np.ones(cpc4d.shape[0], dtype=int)
patcpc4dmat = sp.coo_matrix(
    (unit_entries, (dfrow2, dfcol2)),
    shape=(n_patents, n_cpc4d),
    dtype=int,
).tocsr()
patcpc4dmat.shape

(1265470, 653)

In [16]:
# Keep only the pairs where BOTH patents have at least one row in cpc1.
patents_with_cpc = cpc1.patent_id
ccmt_has_cpc = df_matched_expand.ccmt_patent_id.isin(patents_with_cpc)
other_has_cpc = df_matched_expand.patent_id.isin(patents_with_cpc)
df_matched = df_matched_expand.loc[ccmt_has_cpc & other_has_cpc].copy()
df_matched.shape

(2865880, 3)

In [17]:
# For each matched pair, pull the patent-by-cpc4d row of each side, so that row i
# of both matrices corresponds to row i of df_matched.
ccmt_row_codes = df_matched['ccmt_patent_id'].astype(pat_c).cat.codes
other_row_codes = df_matched['patent_id'].astype(pat_c).cat.codes
ccmtcpc4dmat = patcpc4dmat[ccmt_row_codes, :]
othercpc4dmat = patcpc4dmat[other_row_codes, :]
ccmtcpc4dmat.shape, othercpc4dmat.shape

((2865880, 653), (2865880, 653))

In [None]:
## make sure each matched pair has at least one 4-digit CPC code in common
# The element-wise product is non-zero only in columns set for both patents, so
# its row sum is the sum over the shared 4-digit codes.
# NOTE: this cell overwrites df_matched with the filtered frame; re-running it
# without first re-running the two cells above fails on a length mismatch.
shared_cpc4d = ccmtcpc4dmat.multiply(othercpc4dmat)
df_matched['samecpc4d'] = np.asarray(shared_cpc4d.sum(axis=1)).ravel()
df_matched = df_matched[df_matched['samecpc4d'] > 0].copy()
df_matched.head()

Unnamed: 0,ccmt_patent_id,patent_id,cos,samecpc4d
4,1187498,40387622,0.195809,1
5,1187498,9232695,0.196449,1
12,1236571,9935855,0.202074,1
14,1236571,24695090,0.213555,1
17,1236571,22832154,0.224277,1


## Identify the least similar CPC within each group

In [None]:
# One Python list of CPC symbols per patent_id, built from cpc1 and indexed by
# patent_id for lookup with .loc.
cpc_by_patent = cpc1.groupby('patent_id')['cpc']
cpcnest = cpc_by_patent.apply(list)
cpcnest.head()

patent_id
569328                                 [A61M1/32, A61M1/325]
574675     [A61F7/02, A61F2007/0001, A61F2007/0242, A61F2...
1187498    [C10K1/04, F25J3/0209, F25J3/0219, F25J3/0233,...
1236482                 [G11B25/043, G11B33/14, G11B33/1433]
1236560                                          [C23F11/18]
Name: cpc, dtype: object

In [None]:
# Expand every matched pair into one row per CPC symbol of its ccmt patent.
# The join key and join type are spelled out: without `on=`, merge joins on
# every column name the two frames share, so adding a common column to either
# frame later would silently change the join.
ccmt_cpc = cpc1.rename(columns={"patent_id": "ccmt_patent_id"})
df = df_matched.merge(ccmt_cpc, on="ccmt_patent_id", how="inner")
df.head()

Unnamed: 0,ccmt_patent_id,patent_id,cos,samecpc4d,cpc
0,1187498,40387622,0.195809,1,C10K1/04
1,1187498,40387622,0.195809,1,F25J3/0209
2,1187498,40387622,0.195809,1,F25J3/0219
3,1187498,40387622,0.195809,1,F25J3/0233
4,1187498,40387622,0.195809,1,F25J3/0238


In [None]:
def leastsimilar(row):
    """Best-match score of ``row.cpc`` against the CPCs of ``row.patent_id``.

    Looks up the list of CPC symbols stored in the global ``cpcnest`` for
    ``row.patent_id``, takes the dot product of each of their ``cpc2vec``
    vectors with the vector of ``row.cpc``, and returns the largest product.
    With the unit-normalized vectors loaded earlier in this notebook, that is
    the highest cosine similarity between ``row.cpc`` and any of those CPCs.

    Note that, despite the name, this returns a maximum: a low value means
    ``row.cpc`` has no close counterpart among the other patent's CPCs.

    Parameters
    ----------
    row : object exposing ``patent_id`` and ``cpc`` attributes
        (a DataFrame row when used with ``apply(..., axis=1)``).

    Returns
    -------
    The maximum dot product as a scalar.
    """
    other_patent_cpcs = cpcnest.loc[row.patent_id]
    other_vectors = cpc2vec[other_patent_cpcs]
    own_vector = cpc2vec[row.cpc]
    scores = other_vectors @ own_vector
    return scores.max()

In [None]:
# Score every (pair, ccmt CPC) row with leastsimilar, fanned out row-wise over
# the pandarallel workers, then attach the result as a new column.
best_match_scores = df.parallel_apply(leastsimilar, axis=1)
df['leastsimilar'] = best_match_scores

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=474458), Label(value='0 / 474458')…

In [41]:
# Persist the per-CPC scores (one row per matched pair and ccmt CPC symbol).
df.to_parquet(path='data/green_proj_cosangle.parquet')

In [43]:
# Persist the matched pairs that share at least one 4-digit CPC code.
df_matched.to_parquet(path='data/matched_ccmt_other_filtered.parquet')

In [44]:
# (rows, columns) of the filtered matched-pair table that was just written out
df_matched.shape

(1457968, 4)