In [None]:
# >>> Path configuration (auto-inserted) >>>
from pathlib import Path
import os

def first_existing_dir(primary: Path, fallback: Path) -> Path:
    """Pick the directory to use for one of the project's data locations.

    Parameters
    ----------
    primary : Path
        Preferred location.
    fallback : Path
        Location to use when ``primary`` is missing.

    Returns
    -------
    Path
        ``fallback`` when ``primary`` does not exist and ``fallback`` does;
        otherwise ``primary`` (even if it is missing, so that later reads fail
        with the primary path in the error message).
    """
    if not primary.exists() and fallback.exists():
        return fallback
    return primary


# Project root comes from the GBB_PROJECT_ROOT environment variable and defaults
# to the current working directory.
PROJECT_ROOT = Path(os.getenv("GBB_PROJECT_ROOT", ".")).resolve()
SAMPLEDATA = PROJECT_ROOT / "sampledata"

# Fallback to sampledata if primary paths not present.
# DATA previously "fell back" to PROJECT_ROOT / "data", i.e. to itself, so the
# fallback could never trigger; it now mirrors the PATSTAT / PATTEXT handling.
DATA = first_existing_dir(PROJECT_ROOT / "data", SAMPLEDATA / "data")
PATSTAT = first_existing_dir(PROJECT_ROOT / "patstat", SAMPLEDATA / "patstat")
PATTEXT = first_existing_dir(PROJECT_ROOT / "patent_text", SAMPLEDATA / "patent_text")

def P(*parts):
    """Join ``parts`` into one filesystem path and return it as a plain string.

    ``parts`` are forwarded unchanged to ``pathlib.Path``.
    """
    joined = Path(*parts)
    return str(joined)
# <<< Path configuration (auto-inserted) <<<


In [None]:
import pandas as pd
import numpy as np
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
import umap

## Extract source fields from over-represented CPC codes

In [2]:
# Load every column of the cosine-angle table, then derive two complements:
#   weight = 1 - cos
#   prob   = 1 - leastsimilar
pairs_path = DATA / "green_proj_cosangle.parquet"
df = pd.read_parquet(str(pairs_path)).assign(
    weight=lambda frame: 1 - frame["cos"],
    prob=lambda frame: 1 - frame["leastsimilar"],
)
df.head()

Unnamed: 0,ccmt_patent_id,patent_id,cos,samecpc4d,cpc,leastsimilar,leastwithin,weight,prob
0,1187498,40387622,0.195809,1,C10K1/04,0.715514,0.85337,0.804191,0.284486
1,1187498,40387622,0.195809,1,F25J3/0209,0.907879,0.963588,0.804191,0.092121
2,1187498,40387622,0.195809,1,F25J3/0219,0.928604,0.9731,0.804191,0.071396
3,1187498,40387622,0.195809,1,F25J3/0233,0.940602,0.98421,0.804191,0.059398
4,1187498,40387622,0.195809,1,F25J3/0238,0.923259,0.974872,0.804191,0.076741


In [3]:
# Sanity check: (rows, columns) of the loaded table after adding `weight` and `prob`.
df.shape

(11861440, 9)

In [None]:
## Use IQR outlier detection to identify significant presence
# Both quartiles of `prob` for every (ccmt_patent_id, patent_id) pair are taken
# in a single grouped pass; the previous version grouped the full frame twice
# and joined the two results with a merge.
pair_keys = ["ccmt_patent_id", "patent_id"]
df_iqr = (
    df.groupby(pair_keys)["prob"]
    .quantile([0.25, 0.75])
    .unstack()
    # keep the column names the former self-merge produced (Q1 -> prob_x, Q3 -> prob_y)
    .rename(columns={0.25: "prob_x", 0.75: "prob_y"})
    .reset_index()
)
# Upper fence of the IQR rule: Q3 + 1.5 * (Q3 - Q1).
df_iqr["thresh"] = df_iqr["prob_y"] + 1.5 * (df_iqr["prob_y"] - df_iqr["prob_x"])
df_iqr.head()

Unnamed: 0,ccmt_patent_id,patent_id,prob_x,prob_y,thresh
0,1187498,9232695,0.092724,0.137468,0.204584
1,1187498,40387622,0.070184,0.094831,0.131802
2,1236571,9935855,0.318573,0.362353,0.428023
3,1236571,17337750,0.325468,0.364993,0.424281
4,1236571,22832154,0.318065,0.419768,0.572322


In [5]:
# Attach each pair's threshold to its rows (joined on the columns the two frames share).
pair_thresholds = df_iqr[["ccmt_patent_id", "patent_id", "thresh"]]
df = df.merge(pair_thresholds)

# prob2 is (1 - weight) for rows whose prob is at or below the pair's threshold, else 0.
# NOTE(review): rows *above* the upper fence are the ones zeroed out here — confirm
# that this is the intended direction for "significant presence".
at_or_below_fence = df["prob"] <= df["thresh"]
df["prob2"] = np.where(at_or_below_fence, 1 - df["weight"], 0)
df.head()

Unnamed: 0,ccmt_patent_id,patent_id,cos,samecpc4d,cpc,leastsimilar,leastwithin,weight,prob,thresh,prob2
0,1187498,40387622,0.195809,1,C10K1/04,0.715514,0.85337,0.804191,0.284486,0.131802,0.0
1,1187498,40387622,0.195809,1,F25J3/0209,0.907879,0.963588,0.804191,0.092121,0.131802,0.195809
2,1187498,40387622,0.195809,1,F25J3/0219,0.928604,0.9731,0.804191,0.071396,0.131802,0.195809
3,1187498,40387622,0.195809,1,F25J3/0233,0.940602,0.98421,0.804191,0.059398,0.131802,0.195809
4,1187498,40387622,0.195809,1,F25J3/0238,0.923259,0.974872,0.804191,0.076741,0.131802,0.195809


In [None]:
## aggregate from observed pairs
# For every (ccmt_patent_id, cpc) keep the largest prob2 seen across its rows,
# and expose it under the column name `prob`.
max_prob2 = df.groupby(["ccmt_patent_id", "cpc"])["prob2"].max()
df_agg = max_prob2.reset_index().rename(columns={"prob2": "prob"})
df_agg.head()

Unnamed: 0,ccmt_patent_id,cpc,prob
0,1187498,C10K1/04,0.0
1,1187498,F25J2210/12,0.196449
2,1187498,F25J2230/60,0.196449
3,1187498,F25J2270/12,0.196449
4,1187498,F25J2270/906,0.196449


In [None]:
## exclude GBB parts from source fields
# Load the (cpc, clus) assignments and keep every row whose cluster is not "c-1".
# The resulting codes are the ones removed from df_cpc_agg further down.
greencpc = pd.read_parquet(
    str(DATA / "clusangle_outlier_hdbscan2.parquet"),
    columns=["cpc", "clus"],
).loc[lambda frame: frame["clus"] != "c-1"]
greencpc.head()

Unnamed: 0,cpc,clus
0,A01B1/00,c10
8,A01B39/18,c10
9,A01B61/00,c10
11,A01B63/1013,c10
12,A01B69/008,c10


In [None]:
## aggregate across patents
# Per CPC code, summarise the strictly positive `prob` values (mean / max / sum / count)
# and join the result with `layout` on their shared columns.
# NOTE(review): `layout` is not defined anywhere in the visible notebook, so this
# cell relies on kernel state from elsewhere — load it explicitly before this cell.
df_cpc_agg = (
    df_agg.loc[df_agg["prob"] > 0]
    .groupby("cpc")["prob"]
    .agg(["mean", "max", "sum", "count"])
    .reset_index()
    .merge(layout)
)
df_cpc_agg.head()

Unnamed: 0,cpc,mean,max,sum,count,cpc1d,cpc4d,title,x,y
0,A01B1/02,0.226079,0.226079,0.226079,1,A,A01B,Hand tools -Spades; Shovels,3.300158,-2.519649
1,A01B1/022,0.213669,0.223653,0.427339,2,A,A01B,Hand tools -Spades; Shovels -Collapsible; exte...,3.284784,-2.528554
2,A01B13/00,0.252779,0.255628,0.505558,2,A,A01B,Ploughs or like machines for special purposes ...,3.451846,-2.750587
3,A01B13/025,0.23564,0.23564,0.23564,1,A,A01B,Ploughs or like machines for special purposes ...,3.539223,-2.750748
4,A01B13/08,0.233413,0.240909,0.933653,4,A,A01B,Ploughs or like machines for special purposes ...,3.56078,-2.725041


In [22]:
# Drop every CPC code that appears in `greencpc`; copy so later column
# assignments write to an independent frame.
is_green = df_cpc_agg["cpc"].isin(greencpc["cpc"])
df_cpc_agg = df_cpc_agg.loc[~is_green].copy()

### Cluster frequently occurring, semantically similar CPC codes into source fields

In [None]:
## load trained cpc2vec model
# Only the keyed vectors (`.wv`) of the saved Word2Vec model are kept.
model_path = DATA / "external/cpc_sg_ns_dim50.model"
cpc2vec = gensim.models.Word2Vec.load(str(model_path)).wv
cpc2vec.vectors.shape

(253017, 50)

In [15]:
# Look up one embedding row per remaining CPC code, in the frame's row order.
cpc_codes = df_cpc_agg["cpc"].tolist()
ebd = cpc2vec[cpc_codes]
ebd.shape

(75636, 50)

In [None]:
## Use UMAP to reduce the embeddings to 5 dimensions, which simplifies clustering (following the HDBSCAN guidance)

In [17]:
# UMAP is stochastic: without a fixed seed each re-run produces a different
# 5-D embedding and therefore different clusters downstream. Pin random_state
# so the saved cluster assignments can be reproduced.
mapper = umap.UMAP(metric="cosine", n_neighbors=15, n_components=5, random_state=42)

In [18]:
# Fit the UMAP reducer on the CPC embedding matrix `ebd`.
mapper.fit(ebd)

In [19]:
# Reduced coordinates learned by the fitted reducer; used as the clustering input below.
ebd2 = mapper.embedding_
ebd2.shape

(75636, 5)

In [None]:
## Use hdbscan to cluster the frequently appearing codes
# min_cluster_size=100 is passed to HDBSCAN; the fit runs on the reduced embedding `ebd2`.
clusterer = hdbscan.HDBSCAN(min_cluster_size=100)
clusterer.fit(ebd2)

In [23]:
# Attach each row's cluster label as a string id prefixed with "s"
# (labels are assigned positionally, one per row), then count the distinct ids.
cluster_ids = pd.Series(clusterer.labels_, index=df_cpc_agg.index).astype(str)
df_cpc_agg["clus"] = "s" + cluster_ids
df_cpc_agg["clus"].nunique()

194

In [42]:
# Persist the per-CPC statistics together with their source-field cluster ids.
df_cpc_agg.to_parquet(str(DATA / "clusangle_outlier_hdbscan2_src2.parquet"))