In [None]:
# >>> Path configuration (auto-inserted) >>>
from pathlib import Path
import os

# Project root: read from the GBB_PROJECT_ROOT environment variable when set,
# otherwise the current working directory; resolved to an absolute path.
PROJECT_ROOT = Path(os.getenv("GBB_PROJECT_ROOT", ".")).resolve()
DATA = PROJECT_ROOT / "data"
PATSTAT = PROJECT_ROOT / "patstat"
PATTEXT = PROJECT_ROOT / "patent_text"
SAMPLEDATA = PROJECT_ROOT / "sampledata"

# Fallback to sampledata if primary paths not present
if not PATSTAT.exists() and (SAMPLEDATA / "patstat").exists():
    PATSTAT = SAMPLEDATA / "patstat"
if not PATTEXT.exists() and (SAMPLEDATA / "patent_text").exists():
    PATTEXT = SAMPLEDATA / "patent_text"
# The DATA fallback previously tested and re-assigned PROJECT_ROOT / "data"
# itself, so it could never fire; it now mirrors the two fallbacks above and
# only takes effect when the primary directory is missing.
if not DATA.exists() and (SAMPLEDATA / "data").exists():
    DATA = SAMPLEDATA / "data"

def P(*parts):
    """Join ``parts`` into a single path and return it as a plain string.

    Parameters
    ----------
    *parts
        Path segments (strings or path-like objects) forwarded to
        ``pathlib.Path``.

    Returns
    -------
    str
        String form of ``Path(*parts)``.
    """
    joined = Path(*parts)
    return str(joined)
# <<< Path configuration (auto-inserted) <<<


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the table read from clusangle_ccmtagg.parquet in the DATA directory
# (columns shown in the preview: ccmt_patent_id, cpc, prob).
df = pd.read_parquet(P(DATA, "clusangle_ccmtagg.parquet"))
df.head()

Unnamed: 0,ccmt_patent_id,cpc,prob
0,1187498,C10K1/04,0.804191
1,1187498,F25J2210/12,0.0
2,1187498,F25J2230/60,0.0
3,1187498,F25J2270/12,0.0
4,1187498,F25J2270/906,0.0


In [None]:
## `path` holds the GBB clusters (one row per CPC code)
# Rows labelled "c-1" are dropped right after loading.
path = (
    pd.read_parquet(P(DATA, "clusangle_outlier_hdbscan2.parquet"))
    .query('clus!="c-1"')
)
path.head()

Unnamed: 0,cpc,mean,max,sum,count,cpc1d,cpc4d,title,x,y,clus,clus2
0,A01B1/00,0.864104,0.877908,1.728208,2,A,A01B,Hand tools,3.285313,-2.539227,c10,c1
8,A01B39/18,0.837579,0.837579,0.837579,1,A,A01B,Other machines specially adapted for working s...,3.493809,-2.73465,c10,c1
9,A01B61/00,0.840016,0.840016,0.840016,1,A,A01B,"Devices for, or parts of, agricultural machine...",3.526913,-2.810171,c10,c1
11,A01B63/1013,0.820958,0.820958,0.820958,1,A,A01B,Lifting or adjusting devices or arrangements f...,2.154277,-5.402795,c10,c65
12,A01B69/008,0.72845,0.72845,0.72845,1,A,A01B,Steering of agricultural machines or implement...,3.437483,-2.810456,c10,c1


In [None]:
## `src` holds the source-field clusters (one row per CPC code)
# Rows labelled "s-1" are dropped right after loading.
src = (
    pd.read_parquet(P(DATA, "clusangle_outlier_hdbscan2_src2.parquet"))
    .query('clus!="s-1"')
)
src.head()

Unnamed: 0,cpc,mean,max,sum,count,cpc1d,cpc4d,title,x,y,clus,clus2
0,A01B1/02,0.226079,0.226079,0.226079,1,A,A01B,Hand tools -Spades; Shovels,3.300158,-2.519649,s112,s181
1,A01B1/022,0.213669,0.223653,0.427339,2,A,A01B,Hand tools -Spades; Shovels -Collapsible; exte...,3.284784,-2.528554,s112,s181
2,A01B13/00,0.252779,0.255628,0.505558,2,A,A01B,Ploughs or like machines for special purposes ...,3.451846,-2.750587,s112,s181
3,A01B13/025,0.23564,0.23564,0.23564,1,A,A01B,Ploughs or like machines for special purposes ...,3.539223,-2.750748,s112,s181
4,A01B13/08,0.233413,0.240909,0.933653,4,A,A01B,Ploughs or like machines for special purposes ...,3.56078,-2.725041,s112,s181


## Aggregate data to find associations

In [None]:
## associations between GBB and source fields
# Left side: distinct (patent, GBB cluster) pairs, obtained by matching each
# patent's CPC codes against the CPC codes listed in `path`.
# Right side: distinct (patent, source cluster) pairs, built the same way
# from `src`.
# Joining both on the patent id and counting distinct patents per
# (src, path) pair gives the co-occurrence table.
# NOTE(review): unlike the Y02 association cell further down, no `prob > 0`
# filter is applied to `df` here — confirm this is intended.
srcpath = (
    df.loc[:, ["ccmt_patent_id", "cpc"]]
    .merge(path.loc[:, ["cpc", "clus"]], on="cpc", how="inner")
    .loc[:, ["ccmt_patent_id", "clus"]]
    .rename(columns={"clus": "path"})
    .drop_duplicates()
    .merge(
        df.loc[:, ["ccmt_patent_id", "cpc"]]
        .merge(src.loc[:, ["cpc", "clus"]], on="cpc", how="inner")
        .loc[:, ["ccmt_patent_id", "clus"]]
        .rename(columns={"clus": "src"})
        .drop_duplicates(),
        on="ccmt_patent_id",
        how="inner",
    )
    .groupby(["src", "path"])["ccmt_patent_id"]
    .nunique()
    .reset_index()
)
srcpath.head()

Unnamed: 0,src,path,ccmt_patent_id
0,s0,c0,3
1,s0,c1,5
2,s0,c10,8
3,s0,c12,18
4,s0,c13,1


In [6]:
# rca = n_ij * N / (n_i. * n_.j), where n_ij is the patent count of a
# (src, path) pair, n_i. the total over the pair's src, n_.j the total over
# the pair's path and N the grand total. Values above 1 mean the pair occurs
# more often than the marginal totals alone would give.
# (presumably "revealed comparative advantage" — confirm the naming.)
# The aggregation is passed by name ("sum") rather than as the Python builtin:
# pandas >= 2.1 emits a FutureWarning for builtin callables in transform().
srcpath["rca"] = (
    srcpath.ccmt_patent_id
    * srcpath.ccmt_patent_id.sum()
    / (
        srcpath.groupby("src").ccmt_patent_id.transform("sum")
        * srcpath.groupby("path").ccmt_patent_id.transform("sum")
    )
)
srcpath.head()

Unnamed: 0,src,path,ccmt_patent_id,rca
0,s0,c0,3,1.33956
1,s0,c1,5,0.514582
2,s0,c10,8,0.231331
3,s0,c12,18,1.415693
4,s0,c13,1,0.242206


In [7]:
# Binary flag: 1 where rca is strictly greater than 1, otherwise 0.
srcpath["binrca"] = srcpath["rca"].gt(1).astype(int)

In [8]:
def hhi(series):
    """Herfindahl-Hirschman index of the distinct values in ``series``.

    Each distinct value's share is the number of times it occurs divided by
    the number of elements; the index is the sum of the squared shares.

    Parameters
    ----------
    series : array-like
        Values whose frequency distribution is measured.

    Returns
    -------
    numpy.float64
        Sum of squared shares (1.0 when every element is the same value).
    """
    counts = np.unique(series, return_counts=True)[1]
    shares = counts / counts.sum()
    return (shares * shares).sum()

In [9]:
# Per GBB cluster: `size` = number of rows (source clusters linked to it) and
# `hhi` = hhi() of its patent-count column; both are attached back to every row.
# NOTE(review): hhi() tallies how often each distinct count value occurs, so
# this measures the spread of count values rather than each source's share of
# patents — confirm this is the intended concentration measure.
srcpath = pd.merge(
    srcpath,
    srcpath.groupby("path")["ccmt_patent_id"].agg(["size", hhi]).reset_index(),
    how="left",
)
srcpath.head()

Unnamed: 0,src,path,ccmt_patent_id,rca,binrca,size,hhi
0,s0,c0,3,1.33956,1,98,0.116618
1,s0,c1,5,0.514582,0,149,0.077969
2,s0,c10,8,0.231331,0,164,0.036957
3,s0,c12,18,1.415693,1,153,0.053697
4,s0,c13,1,0.242206,0,103,0.121124


In [None]:
# Inverse of the HHI, followed by a histogram of its values.
srcpath["effhhi"] = 1 / srcpath["hhi"]
srcpath["effhhi"].plot.hist()

In [11]:
# Rank source clusters within each GBB cluster by patent count, largest first;
# ties share the average rank (hence fractional values such as 54.5).
srcpath["rnk"] = (
    srcpath.groupby("path")["ccmt_patent_id"]
    .rank(method="average", ascending=False)
)
srcpath.head()

Unnamed: 0,src,path,ccmt_patent_id,rca,binrca,size,hhi,effhhi,rnk
0,s0,c0,3,1.33956,1,98,0.116618,8.575,54.5
1,s0,c1,5,0.514582,0,149,0.077969,12.825534,74.5
2,s0,c10,8,0.231331,0,164,0.036957,27.05835,95.0
3,s0,c12,18,1.415693,1,153,0.053697,18.622912,50.0
4,s0,c13,1,0.242206,0,103,0.121124,8.256031,89.0


In [12]:
# Persist the source-field / GBB association table.
srcpath.to_parquet(P(DATA, "srcpath33.parquet"))

In [None]:
## associations between GBB and targets
# Read the family id and CPC symbol columns of tls225 and rename them,
# keep the symbols whose first three characters are Y02 or Y04, cut each
# symbol at the first "/" (keeping the part before it), and drop repeated rows.
y02 = pd.read_parquet(
    P(PATSTAT, "tls225.parquet"),
    columns=["docdb_family_id", "cpc_class_symbol"],
)
y02.columns = ["ccmt_patent_id", "y02"]
y02 = (
    y02.loc[lambda t: t["y02"].str[:3].isin(["Y02", "Y04"])]
    .assign(y02=lambda t: t["y02"].str.split("/").str.get(0))
    .drop_duplicates()
)
y02.head()

Unnamed: 0,ccmt_patent_id,y02
40,2985,Y02E 10
77,4031,Y02W 30
98,6071,Y02E 10
173,9985,Y02A 40
184,10027,Y02E 10


In [13]:
# Count distinct patents per (GBB cluster, Y02/Y04 group):
# rows of `df` with a positive `prob` are matched to GBB clusters via their
# CPC code, then to the Y02/Y04 groups via the patent id.
y02clus = (
    df.loc[df["prob"] > 0, ["ccmt_patent_id", "cpc"]]
    .merge(
        path.loc[:, ["cpc", "clus"]]
        .query('clus!="c-1"')
        .rename(columns={"clus": "path"}),
        on="cpc",
        how="inner",
    )
    .merge(y02, on="ccmt_patent_id", how="inner")
    .groupby(["path", "y02"])["ccmt_patent_id"]
    .nunique()
    .reset_index()
)
y02clus.head()

Unnamed: 0,path,y02,ccmt_patent_id
0,c0,Y02A 40,1
1,c0,Y02B 40,1
2,c0,Y02B 70,2
3,c0,Y02C 10,2
4,c0,Y02E 10,9


In [14]:
# Per GBB cluster: `size` = number of rows (Y02/Y04 groups linked to it) and
# `hhi` = hhi() of its patent-count column; both are attached back to every row.
# NOTE(review): hhi() tallies how often each distinct count value occurs, so
# this measures the spread of count values rather than each group's share of
# patents — confirm this is the intended concentration measure.
y02clus = pd.merge(
    y02clus,
    y02clus.groupby("path")["ccmt_patent_id"].agg(["size", hhi]).reset_index(),
    how="left",
)
y02clus.head()

Unnamed: 0,path,y02,ccmt_patent_id,size,hhi
0,c0,Y02A 40,1,14,0.204082
1,c0,Y02B 40,1,14,0.204082
2,c0,Y02B 70,2,14,0.204082
3,c0,Y02C 10,2,14,0.204082
4,c0,Y02E 10,9,14,0.204082


In [None]:
# Inverse of the HHI, followed by a histogram of its values.
y02clus["effhhi"] = 1 / y02clus["hhi"]
y02clus["effhhi"].plot.hist()

In [16]:
# rca = n_ij * N / (n_i. * n_.j), where n_ij is the patent count of a
# (path, y02) pair, n_i. the total over the pair's path, n_.j the total over
# the pair's y02 group and N the grand total. Values above 1 mean the pair
# occurs more often than the marginal totals alone would give.
# (presumably "revealed comparative advantage" — confirm the naming.)
# The aggregation is passed by name ("sum") rather than as the Python builtin:
# pandas >= 2.1 emits a FutureWarning for builtin callables in transform().
y02clus["rca"] = (
    y02clus.ccmt_patent_id
    * y02clus.ccmt_patent_id.sum()
    / (
        y02clus.groupby("path").ccmt_patent_id.transform("sum")
        * y02clus.groupby("y02").ccmt_patent_id.transform("sum")
    )
)
y02clus.head()

Unnamed: 0,path,y02,ccmt_patent_id,size,hhi,effhhi,rca
0,c0,Y02A 40,1,14,0.204082,4.9,0.153156
1,c0,Y02B 40,1,14,0.204082,4.9,1.021043
2,c0,Y02B 70,2,14,0.204082,4.9,0.137703
3,c0,Y02C 10,2,14,0.204082,4.9,0.218275
4,c0,Y02E 10,9,14,0.204082,4.9,0.156637


In [17]:
# Binary flag: 1 where rca is strictly greater than 1, otherwise 0.
y02clus["binrca"] = y02clus["rca"].gt(1).astype(int)

In [19]:
# Persist the GBB / Y02-Y04 association table.
y02clus.to_parquet(P(DATA, "y02path33.parquet"))

## Visualize associations

In [3]:
# Wide matrix of patent counts: one row per GBB cluster, one column per
# source cluster; pairs that never co-occur are filled with 0.
vizmat = pd.pivot(
    srcpath,
    index="path",
    columns="src",
    values="ccmt_patent_id",
).fillna(0)
vizmat

src,s0,s1,s10,s100,s101,s102,s103,s104,s105,s106,...,s90,s91,s92,s93,s94,s95,s96,s97,s98,s99
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c0,3.0,0.0,1.0,1.0,0.0,4.0,51.0,0.0,0.0,3.0,...,3.0,0.0,1.0,36.0,0.0,0.0,11.0,1.0,1.0,0.0
c1,5.0,0.0,4.0,2.0,3.0,15.0,352.0,5.0,0.0,9.0,...,4.0,2.0,7.0,71.0,0.0,0.0,21.0,4.0,1.0,1.0
c10,8.0,3.0,0.0,26.0,55.0,4.0,9.0,273.0,8.0,106.0,...,0.0,0.0,1.0,0.0,0.0,1.0,12.0,1.0,3.0,8.0
c11,0.0,0.0,20.0,3.0,3.0,9.0,25.0,2.0,0.0,6.0,...,11.0,1.0,0.0,1.0,0.0,0.0,59.0,3.0,27.0,0.0
c12,18.0,5.0,34.0,5.0,0.0,1.0,26.0,0.0,3.0,7.0,...,1.0,2.0,0.0,5.0,0.0,0.0,32.0,71.0,50.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c79,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c8,5.0,0.0,0.0,0.0,0.0,9.0,35.0,0.0,0.0,0.0,...,15.0,40.0,151.0,212.0,8.0,4.0,0.0,0.0,0.0,0.0
c80,0.0,0.0,0.0,0.0,19.0,0.0,0.0,11.0,8.0,4.0,...,1.0,2.0,0.0,0.0,1.0,0.0,11.0,0.0,0.0,0.0
c81,0.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,21.0,...,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0


In [None]:
import matplotlib.pyplot as plt

# Positions that sort the columns (source clusters) and the rows (GBB
# clusters) by descending total patent count.
idcpc = np.argsort(-vizmat.sum(axis=0))
idclus = np.argsort(-vizmat.sum(axis=1))

# log(count + 1) matrix, columns reordered first, then rows.
vizmat2 = np.log(vizmat + 1).to_numpy()[:, idcpc][idclus]

# Heatmap without axes, ticks or colour bar.
plt.figure(figsize=(30, 10))
sns.heatmap(vizmat2, cmap="Reds", square=True, cbar=False)
plt.axis("off")