In [None]:
# >>> Path configuration (auto-inserted) >>>
from pathlib import Path
import os

# Project root: taken from the GBB_PROJECT_ROOT environment variable,
# otherwise the current working directory.
PROJECT_ROOT = Path(os.getenv("GBB_PROJECT_ROOT", ".")).resolve()
SAMPLEDATA = PROJECT_ROOT / "sampledata"


def _primary_or_sample(name):
    """Return PROJECT_ROOT/<name>, or SAMPLEDATA/<name> when only the sample copy exists."""
    primary = PROJECT_ROOT / name
    sample = SAMPLEDATA / name
    if not primary.exists() and sample.exists():
        return sample
    return primary


# Fallback to sampledata if primary paths not present.
# The DATA fallback used to re-test PROJECT_ROOT / "data" (the very path that was
# just found missing), so it could never trigger; it now mirrors the other two.
DATA = _primary_or_sample("data")
PATSTAT = _primary_or_sample("patstat")
PATTEXT = _primary_or_sample("patent_text")


def P(*parts):
    """Join ``parts`` into a single path and return it as a plain string."""
    return str(Path(*parts))
# <<< Path configuration (auto-inserted) <<<


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read person records from tls206 and keep only rows whose psn_sector is COMPANY;
# the sector column is dropped once it has served as the filter.
firmidname = (
    pd.read_parquet(
        P(PATSTAT, "tls206.parquet"),
        columns=["person_id", "psn_id", "psn_name", "psn_sector"],
    )
    .query("psn_sector=='COMPANY'")
    .drop(columns=["psn_sector"])
)
firmidname.head()

Unnamed: 0,person_id,psn_id,psn_name
1,50015511,24480361,PRINCETON GAMMA TECH INSTRUMENTS
4,50015514,24494825,PROCESS QUERY SYSTEMS
7,50015517,24500379,PRODUCTIVE RESEARCH
9,50015519,24522295,PRONERVE
11,50015521,24531532,PROTECTWISE


In [None]:
# Person/application link columns from tls207.
tls207 = pd.read_parquet(P(PATSTAT, "tls207.parquet"), columns=["person_id", "appln_id"])
tls207.head()

Unnamed: 0,person_id,appln_id
0,1,1
1,1,7
2,1,46
3,1,775
4,1,1192


In [None]:
# Application table (tls201): application id, DOCDB family id and filing year.
tls201 = pd.read_parquet(
    P(PATSTAT, "tls201.parquet"),
    columns=["appln_id", "docdb_family_id", "appln_filing_year"],
)
tls201.head()

Unnamed: 0,appln_id,docdb_family_id,appln_filing_year
0,0,0,9999
1,1,8554171,2000
2,2,27517085,1992
3,3,7915918,2000
4,4,22889365,2000


In [5]:
# One row per (firm, patent family): applications are linked to persons via appln_id,
# persons to company psn_id via person_id, and `year` is the earliest filing year
# among the family's applications linked to that psn_id.
# NOTE: psn_id is renamed to `person_id` here, so downstream `person_id` means psn_id.
firmpat = (
    tls201.merge(tls207, on="appln_id")
    .merge(firmidname[["person_id", "psn_id"]].drop_duplicates(), on="person_id")
    .groupby(["psn_id", "docdb_family_id"])
    .agg(year=("appln_filing_year", "min"))
    .reset_index()
    .rename(columns={"psn_id": "person_id"})
)
firmpat.head()

Unnamed: 0,person_id,docdb_family_id,year
0,2,20222983,1999
1,17,3593923,1975
2,19,51792190,2015
3,20,38567323,1959
4,21,3857377,9999


In [6]:
# Bucket the first-filing year into ten-year periods labelled by their start year.
# Anything outside 1985-2014 (including the 9999 placeholder year) gets period 0.
firmpat["period"] = np.select(
    [
        firmpat["year"].between(1985, 1994),
        firmpat["year"].between(1995, 2004),
        firmpat["year"].between(2005, 2014),
    ],
    [1985, 1995, 2005],
    default=0,
)
# Attach each firm's number of distinct patent families (all periods) as `totalpat`.
firmpat = firmpat.merge(
    firmpat.groupby("person_id")
    .agg(totalpat=("docdb_family_id", "nunique"))
    .reset_index()
)
firmpat.head()

Unnamed: 0,person_id,docdb_family_id,year,period,totalpat
0,2,20222983,1999,1995,1
1,17,3593923,1975,0,1
2,19,51792190,2015,0,1
3,20,38567323,1959,0,1
4,21,3857377,9999,0,1


In [7]:
# Keep only firms with at least 100 distinct patent families, then report the
# remaining table shape and the number of firms.
firmpat = firmpat.query('totalpat>=100')
(firmpat.shape, firmpat["person_id"].nunique())

((26426589, 5), 32909)

In [8]:
# Checkpoint the filtered firm-family table to 'firmpat.parquet' (path is relative
# to the notebook's working directory).
firmpat.to_parquet('firmpat.parquet')

In [2]:
# Reload the checkpoint written by the previous cell (execution counts restart here,
# so this presumably ran in a fresh kernel — the import cell must be run first).
firmpat = pd.read_parquet('firmpat.parquet')
firmpat.head()

Unnamed: 0,person_id,docdb_family_id,year,period,totalpat
4254,26821,9910158,2002,1995,147
4255,26821,9911704,2001,1995,147
4256,26821,9911708,2001,1995,147
4257,26821,9911711,2001,1995,147
4258,26821,9911961,2001,1995,147


In [3]:
# Sanity check: number of distinct firms with totalpat >= 100 in the reloaded table.
firmpat.query('totalpat>=100').person_id.nunique()

32909

In [4]:
# Re-apply the totalpat >= 100 threshold to the reloaded table and count the firms.
# The same filter was already applied before 'firmpat.parquet' was written.
firmpat = firmpat.query('totalpat>=100')
firmpat.person_id.nunique()

32909

In [5]:
# Write the table again as 'firmpat2.parquet', this time without the DataFrame index.
firmpat.to_parquet('firmpat2.parquet',index=False)

## Start here: resume from the saved `firmpat2.parquet` checkpoint

In [2]:
# Load the firm-family table (person_id, docdb_family_id, year, period, totalpat)
# from the 'firmpat2.parquet' checkpoint.
firmpat = pd.read_parquet('firmpat2.parquet')
firmpat.head()

Unnamed: 0,person_id,docdb_family_id,year,period,totalpat
0,26821,9910158,2002,1995,147
1,26821,9911704,2001,1995,147
2,26821,9911708,2001,1995,147
3,26821,9911711,2001,1995,147
4,26821,9911961,2001,1995,147


In [3]:
# Distinct patent families per firm and period; the count column is named `wt`.
fpc = (
    firmpat.groupby(["person_id", "period"])
    .agg(wt=("docdb_family_id", "nunique"))
    .reset_index()
)
fpc.head()

Unnamed: 0,person_id,period,wt
0,26821,1995,88
1,26821,2005,59
2,27530,0,278
3,27530,2005,15
4,27652,0,100


In [4]:
# Save the per-firm, per-period family counts to 'firm_period_patcount.parquet'.
fpc.to_parquet('firm_period_patcount.parquet')

In [None]:
# CPC classifications per DOCDB family (tls225): full symbol plus its 4-character class.
tls225 = pd.read_parquet(
    P(PATSTAT, "tls225.parquet"),
    columns=["docdb_family_id", "cpc_class_symbol", "cpc4d"],
)
tls225.head()

Unnamed: 0,docdb_family_id,cpc_class_symbol,cpc4d
0,76,E04C 1/42,E04C
1,184,B23Q 16/005,B23Q
2,277,F16C 33/4664,F16C
3,277,F16C 33/52,F16C
4,292,G01F 1/07,G01F


In [6]:
# Attach every CPC symbol of a family to its (firm, family, period) row;
# families without any CPC entry drop out of this inner join.
firm_cpc = (
    firmpat[["person_id", "docdb_family_id", "period"]]
    .drop_duplicates()
    .merge(tls225, on="docdb_family_id")
    .rename(columns={"cpc_class_symbol": "cpc"})
)
firm_cpc.head()

Unnamed: 0,person_id,docdb_family_id,period,cpc,cpc4d
0,26821,9910158,1995,H01L 41/0993,H01L
1,26821,9910158,1995,H01L 41/338,H01L
2,26821,9911708,1995,H04R 1/26,H04R
3,26821,9911708,1995,H04R 1/403,H04R
4,26821,9911708,1995,H04R 3/12,H04R


In [7]:
# Save the firm / family / period / CPC table to 'firm_pat_cpc.parquet'.
firm_cpc.to_parquet('firm_pat_cpc.parquet')

In [8]:
# Restrict to CPC symbols whose first three characters are Y02 or Y04, label each
# row with the part of the symbol before "/" (e.g. "Y02E 10"), and count distinct
# families per (firm, period, label).
firmperiody02 = (
    firm_cpc.loc[firm_cpc["cpc"].str[:3].isin(["Y02", "Y04"])]
    .assign(y02=lambda rows: rows["cpc"].str.split("/").str.get(0))
    .groupby(["person_id", "period", "y02"])
    .agg(num_pat_ccmt=("docdb_family_id", "nunique"))
    .reset_index()
)
firmperiody02.head()

Unnamed: 0,person_id,period,y02,num_pat_ccmt
0,27974,0,Y02E 10,1
1,27974,0,Y02P 70,12
2,28258,2005,Y02W 90,1
3,35524,0,Y02B 10,2
4,35524,0,Y02D 70,1


In [9]:
# Save the per-firm, per-period Y02/Y04 family counts to 'firmperiody02.parquet'.
firmperiody02.to_parquet('firmperiody02.parquet')

In [10]:
# Load the CPC-to-cluster table and drop rows labelled "c-1"
# (presumably the HDBSCAN noise/outlier label — confirm against the clustering step).
# Earlier optional steps (stripping spaces from `cpc`, a per-cluster CPC count and
# prefixing `clus` with "c") were left disabled in this cell.
cluscpc = (
    pd.read_parquet('../data/clusangle_outlier_hdbscan2.parquet')
    .query('clus!="c-1"')
)
cluscpc.head()

Unnamed: 0,cpc,mean,max,sum,count,cpc1d,cpc4d,title,x,y,clus,clus2
0,A01B1/00,0.864104,0.877908,1.728208,2,A,A01B,Hand tools,3.285313,-2.539227,c10,c1
8,A01B39/18,0.837579,0.837579,0.837579,1,A,A01B,Other machines specially adapted for working s...,3.493809,-2.73465,c10,c1
9,A01B61/00,0.840016,0.840016,0.840016,1,A,A01B,"Devices for, or parts of, agricultural machine...",3.526913,-2.810171,c10,c1
11,A01B63/1013,0.820958,0.820958,0.820958,1,A,A01B,Lifting or adjusting devices or arrangements f...,2.154277,-5.402795,c10,c65
12,A01B69/008,0.72845,0.72845,0.72845,1,A,A01B,Steering of agricultural machines or implement...,3.437483,-2.810456,c10,c1


In [11]:
# rnk: rank of each CPC within its cluster by `max` (1 = largest; ties share the
#      average rank).
# rca: max * (grand total of max) / (cluster total of max) / (CPC total of max).
# The aggregation is passed as the string "sum": handing the Python builtin `sum`
# to GroupBy.transform is deprecated in recent pandas and emits a FutureWarning.
# `assign` builds a new frame instead of writing columns into the filtered result
# of `.query(...)`, which can trigger SettingWithCopyWarning.
cluscpc = cluscpc.assign(
    rnk=cluscpc.groupby("clus")["max"].rank(ascending=False),
    rca=(
        cluscpc["max"]
        * cluscpc["max"].sum()
        / cluscpc.groupby("clus")["max"].transform("sum")
        / cluscpc.groupby("cpc")["max"].transform("sum")
    ),
)
##note the filtering!
# (optional filter, currently disabled)
#cluscpc = cluscpc.query('count>5')
cluscpc.head()

Unnamed: 0,cpc,mean,max,sum,count,cpc1d,cpc4d,title,x,y,clus,clus2,rnk,rca
0,A01B1/00,0.864104,0.877908,1.728208,2,A,A01B,Hand tools,3.285313,-2.539227,c10,c1,351.0,12.292547
8,A01B39/18,0.837579,0.837579,0.837579,1,A,A01B,Other machines specially adapted for working s...,3.493809,-2.73465,c10,c1,995.0,12.292547
9,A01B61/00,0.840016,0.840016,0.840016,1,A,A01B,"Devices for, or parts of, agricultural machine...",3.526913,-2.810171,c10,c1,934.0,12.292547
11,A01B63/1013,0.820958,0.820958,0.820958,1,A,A01B,Lifting or adjusting devices or arrangements f...,2.154277,-5.402795,c10,c65,1323.0,12.292547
12,A01B69/008,0.72845,0.72845,0.72845,1,A,A01B,Steering of agricultural machines or implement...,3.437483,-2.810456,c10,c1,2340.5,12.292547


In [12]:
# Number of distinct CPC symbols that carry a cluster label.
cluscpc.cpc.nunique()

30675

In [13]:
# Load the CPC-to-source-cluster table and drop rows labelled "s-1"
# (presumably the noise/outlier label — confirm against the clustering step).
srccpc = (
    pd.read_parquet('../data/clusangle_outlier_hdbscan2_src2.parquet')
    .query('clus!="s-1"')
)
srccpc.head()

Unnamed: 0,cpc,mean,max,sum,count,cpc1d,cpc4d,title,x,y,clus,clus2
0,A01B1/02,0.226079,0.226079,0.226079,1,A,A01B,Hand tools -Spades; Shovels,3.300158,-2.519649,s112,s181
1,A01B1/022,0.213669,0.223653,0.427339,2,A,A01B,Hand tools -Spades; Shovels -Collapsible; exte...,3.284784,-2.528554,s112,s181
2,A01B13/00,0.252779,0.255628,0.505558,2,A,A01B,Ploughs or like machines for special purposes ...,3.451846,-2.750587,s112,s181
3,A01B13/025,0.23564,0.23564,0.23564,1,A,A01B,Ploughs or like machines for special purposes ...,3.539223,-2.750748,s112,s181
4,A01B13/08,0.233413,0.240909,0.933653,4,A,A01B,Ploughs or like machines for special purposes ...,3.56078,-2.725041,s112,s181


In [14]:
# Distinct families per (firm, period, CPC symbol). Spaces are removed from the
# symbol afterwards so it matches the space-free `cpc` keys of the cluster tables.
firmperiodcpc = (
    firm_cpc.drop_duplicates(subset=["person_id", "period", "docdb_family_id", "cpc"])
    .groupby(["person_id", "period", "cpc"])["docdb_family_id"]
    .nunique()
    .reset_index()
    .assign(cpc=lambda counts: counts["cpc"].str.replace(" ", ""))
)
firmperiodcpc.head()

Unnamed: 0,person_id,period,cpc,docdb_family_id
0,26821,1995,A63J5/04,1
1,26821,1995,B06B1/0603,3
2,26821,1995,B41J2/1607,3
3,26821,1995,B41J2/1623,3
4,26821,1995,B41J2/1632,2


In [15]:
# Save the per-firm, per-period, per-CPC family counts to 'firm_cpc_period.parquet'.
firmperiodcpc.to_parquet('firm_cpc_period.parquet')

In [16]:
# Label each (firm, period, CPC) count with the cluster of its CPC symbol;
# CPC symbols without a cluster label drop out of the inner join.
firmperiodclus = firmperiodcpc.merge(
    cluscpc[["clus", "cpc"]],
    on="cpc",
    how="inner",
)
firmperiodclus.head()

Unnamed: 0,person_id,period,cpc,docdb_family_id,clus
0,26821,1995,F16C32/06,3,c10
1,565933,0,F16C32/06,1,c10
2,712151,1985,F16C32/06,1,c10
3,713524,0,F16C32/06,1,c10
4,713531,0,F16C32/06,2,c10


In [17]:
# Per (firm, period, cluster): sum of the per-CPC family counts over the cluster's CPCs.
# NOTE(review): a family carrying several CPCs of the same cluster is counted once
# per CPC in this sum — confirm that is intended.
firmperiodclusrca = (
    firmperiodclus.groupby(["person_id", "period", "clus"])["docdb_family_id"]
    .sum()
    .reset_index()
)
# rca = count * (period total) / (firm-period total) / (cluster-period total).
# Aggregations are named by string: passing the builtin `sum` to GroupBy.transform
# is deprecated in recent pandas and emits a FutureWarning.
firmperiodclusrca["rca"] = (
    firmperiodclusrca.groupby("period")["docdb_family_id"].transform("sum")
    * firmperiodclusrca["docdb_family_id"]
    / firmperiodclusrca.groupby(["person_id", "period"])["docdb_family_id"].transform("sum")
    / firmperiodclusrca.groupby(["clus", "period"])["docdb_family_id"].transform("sum")
)
# binrca flags rows whose rca is at least 1.
firmperiodclusrca["binrca"] = np.where(firmperiodclusrca["rca"] >= 1, 1, 0)
firmperiodclusrca.head()

Unnamed: 0,person_id,period,clus,docdb_family_id,rca,binrca
0,26821,1995,c10,6,3.589151,1
1,26821,1995,c13,6,43.738779,1
2,26821,1995,c23,1,2.678196,1
3,26821,1995,c57,2,7.088781,1
4,26821,1995,c78,13,3.160562,1


In [18]:
# Number of clusters per (firm, period) in which the firm has binrca >= 1.
firmperiodcluscnt = (
    firmperiodclusrca.query('binrca>=1')
    .groupby(["person_id", "period"])
    .agg(num_clus=("clus", "nunique"))
    .reset_index()
)
firmperiodcluscnt.head()

Unnamed: 0,person_id,period,num_clus
0,26821,1995,5
1,26821,2005,2
2,27530,0,3
3,27530,2005,1
4,27652,0,10


In [19]:
# Save the per-firm, per-period cluster counts to 'firmperiodclus.parquet'.
firmperiodcluscnt.to_parquet('firmperiodclus.parquet')

In [20]:
# Label each (firm, period, CPC) count with the source cluster of its CPC symbol;
# CPC symbols without a source-cluster label drop out of the inner join.
firmperiodsrc = firmperiodcpc.merge(
    srccpc[["clus", "cpc"]],
    on="cpc",
    how="inner",
)
firmperiodsrc.head()

Unnamed: 0,person_id,period,cpc,docdb_family_id,clus
0,26821,1995,B06B1/0603,3,s114
1,26821,2005,B06B1/0603,1,s114
2,130605,2005,B06B1/0603,1,s114
3,130609,2005,B06B1/0603,1,s114
4,406138,2005,B06B1/0603,1,s114


In [21]:
# Per (firm, period, source cluster): sum of the per-CPC family counts; the cluster
# column is renamed to `src` to keep it apart from the `clus` labels used elsewhere.
# NOTE(review): a family carrying several CPCs of the same source cluster is counted
# once per CPC in this sum — confirm that is intended.
firmperiodsrcrca = (
    firmperiodsrc.rename(columns={"clus": "src"})
    .groupby(["person_id", "period", "src"])["docdb_family_id"]
    .sum()
    .reset_index()
)
# rca = count * (period total) / (firm-period total) / (src-period total).
# Aggregations are named by string: passing the builtin `sum` to GroupBy.transform
# is deprecated in recent pandas and emits a FutureWarning.
firmperiodsrcrca["rca"] = (
    firmperiodsrcrca.groupby("period")["docdb_family_id"].transform("sum")
    * firmperiodsrcrca["docdb_family_id"]
    / firmperiodsrcrca.groupby(["person_id", "period"])["docdb_family_id"].transform("sum")
    / firmperiodsrcrca.groupby(["src", "period"])["docdb_family_id"].transform("sum")
)
# binrca flags rows whose rca is at least 1.
firmperiodsrcrca["binrca"] = np.where(firmperiodsrcrca["rca"] >= 1, 1, 0)
firmperiodsrcrca.head()

Unnamed: 0,person_id,period,src,docdb_family_id,rca,binrca
0,26821,1995,s104,3,3.762498,1
1,26821,1995,s111,6,6.692257,1
2,26821,1995,s114,60,273.465079,1
3,26821,1995,s130,1,0.261601,0
4,26821,1995,s132,4,6.874619,1


In [22]:
# Number of source clusters per (firm, period) in which the firm has binrca >= 1.
firmperiodsrccnt = (
    firmperiodsrcrca.query('binrca>=1')
    .groupby(["person_id", "period"])
    .agg(num_src=("src", "nunique"))
    .reset_index()
)
firmperiodsrccnt.head()

Unnamed: 0,person_id,period,num_src
0,26821,1995,9
1,26821,2005,4
2,27530,0,6
3,27530,2005,1
4,27652,0,14


In [23]:
# Save the per-firm, per-period source-cluster counts to 'firmperiodsrc.parquet'.
firmperiodsrccnt.to_parquet('firmperiodsrc.parquet')

In [24]:
# Load the src -> path link table, excluding rows whose path is "c-1" or whose
# src is "s-1".
cpc4dclus = (
    pd.read_parquet('../data/srcpath33.parquet')
    .query('path!="c-1" and src!="s-1"')
)
cpc4dclus.head()

Unnamed: 0,src,path,ccmt_patent_id,rca,binrca,size,hhi,effhhi,rnk
0,s0,c0,3,1.33956,1,98,0.116618,8.575,54.5
1,s0,c1,5,0.514582,0,149,0.077969,12.825534,74.5
2,s0,c10,8,0.231331,0,164,0.036957,27.05835,95.0
3,s0,c12,18,1.415693,1,153,0.053697,18.622912,50.0
4,s0,c13,1,0.242206,0,103,0.121124,8.256031,89.0


In [25]:
# Keep only src -> path links flagged binrca == 1, then report how many distinct
# paths and sources remain.
# (A rank-based alternative, rnk < effhhi, was left disabled in this cell.)
cpc4dclus2 = cpc4dclus.loc[cpc4dclus["binrca"].eq(1)]
(cpc4dclus2["path"].nunique(), cpc4dclus2["src"].nunique())

(82, 193)

In [26]:
# For every source cluster a firm holds in a period (binrca == 1), list the paths
# linked to that source and flag with `exist` whether the firm also holds that path
# as a cluster (binrca >= 1) in the same period: 1.0 if yes, 0.0 if no.
# The left join below matches on the shared columns person_id, period and path.
firmperiodsrcpath = (
    firmperiodsrcrca.loc[firmperiodsrcrca["binrca"] == 1, ["person_id", "src", "period"]]
    .merge(cpc4dclus2[["src", "path"]].drop_duplicates(), on="src")
    .merge(
        firmperiodclusrca.rename(columns={"clus": "path"})
        .query('binrca>=1')[["person_id", "path", "period"]]
        .drop_duplicates()
        .assign(exist=1),
        how="left",
    )
    .fillna({"exist": 0})
)
firmperiodsrcpath.head()

Unnamed: 0,person_id,src,period,path,exist
0,26821,s104,1995,c10,1.0
1,26821,s104,1995,c14,0.0
2,26821,s104,1995,c16,0.0
3,26821,s104,1995,c17,0.0
4,26821,s104,1995,c23,1.0


In [27]:
# Save the firm / period / src / path table with its `exist` flag to
# 'firmperiodsrcpath.parquet'.
firmperiodsrcpath.to_parquet('firmperiodsrcpath.parquet')

In [28]:
# Load the path -> y02 link table, excluding path "c-1", and keep only links
# flagged binrca == 1.
clus2y02 = (
    pd.read_parquet('../data/y02path33.parquet')
    .query('path!="c-1"')
    .loc[lambda links: links["binrca"] == 1]
)
clus2y02.head()

Unnamed: 0,path,y02,ccmt_patent_id,size,hhi,effhhi,rca,binrca
1,c0,Y02B 40,1,14,0.204082,4.9,1.021043,1
6,c0,Y02E 60,471,14,0.204082,4.9,9.649674,1
7,c0,Y02E 70,1,14,0.204082,4.9,1.111619,1
9,c0,Y02P 70,81,14,0.204082,4.9,1.384047,1
17,c1,Y02B 40,17,30,0.084444,11.842105,2.814342,1


In [29]:
# For every cluster a firm holds in a period (binrca >= 1), treated here as a
# `path`, list the y02 labels linked to that path.
firmperiody02clus = (
    firmperiodclusrca.query("binrca>=1")
    .rename(columns={"clus": "path"})[["person_id", "path", "period"]]
    .drop_duplicates()
    .merge(clus2y02[["path", "y02"]].drop_duplicates(), on="path")
)
firmperiody02clus.head()

Unnamed: 0,person_id,path,period,y02
0,26821,c10,1995,Y02B 50
1,26821,c10,1995,Y02T 10
2,26821,c10,1995,Y02T 30
3,26821,c10,1995,Y02T 90
4,26821,c10,1995,Y04S 10


In [30]:
# Save the firm / path / period / y02 table to 'firmperiody02path.parquet', without the index.
firmperiody02clus.to_parquet('firmperiody02path.parquet',index=False)

In [31]:
# Number of distinct paths per (firm, period, y02 label); the count stays in the
# column named `path`.
firmperiody02cluscnt = (
    firmperiody02clus.groupby(["person_id", "period", "y02"])
    .agg(path=("path", "nunique"))
    .reset_index()
)
firmperiody02cluscnt.head()

Unnamed: 0,person_id,period,y02,path
0,26821,1995,Y02A 10,2
1,26821,1995,Y02A 20,1
2,26821,1995,Y02A 30,1
3,26821,1995,Y02A 50,1
4,26821,1995,Y02A 90,2


In [32]:
# Save the per-(firm, period, y02) path counts to 'firmperiody02pathcnt.parquet', without the index.
firmperiody02cluscnt.to_parquet('firmperiody02pathcnt.parquet',index=False)

In [33]:
# Firm-held sources (binrca == 1) are expanded to their linked paths, restricted to
# paths the firm also holds as clusters (binrca >= 1) in the same period, and then
# joined to the firm's (path, y02) rows. Multi-column joins use the shared columns
# person_id, path and period.
firmperiody02srcpath = firmperiody02clus.merge(
    firmperiodsrcrca.loc[firmperiodsrcrca["binrca"] == 1, ["person_id", "src", "period"]]
    .merge(cpc4dclus2[["src", "path"]].drop_duplicates(), on="src")
    .merge(
        firmperiodclusrca.query("binrca>=1")[["person_id", "clus", "period"]]
        .drop_duplicates()
        .rename(columns={"clus": "path"})
    )
)
firmperiody02srcpath.head()

Unnamed: 0,person_id,path,period,y02,src
0,26821,c10,1995,Y02B 50,s104
1,26821,c10,1995,Y02B 50,s165
2,26821,c10,1995,Y02B 50,s29
3,26821,c10,1995,Y02B 50,s38
4,26821,c10,1995,Y02T 10,s104


In [34]:
# Save the firm / path / period / y02 / src table to 'firmperiody02srcpath.parquet'.
firmperiody02srcpath.to_parquet('firmperiody02srcpath.parquet')

In [35]:
# Number of distinct sources per (firm, period, y02 label); the count stays in the
# column named `src`.
firmperiody02srcpathcnt = (
    firmperiody02srcpath.groupby(["person_id", "period", "y02"])
    .agg(src=("src", "nunique"))
    .reset_index()
)
firmperiody02srcpathcnt.head()

Unnamed: 0,person_id,period,y02,src
0,26821,1995,Y02A 10,4
1,26821,1995,Y02A 20,4
2,26821,1995,Y02A 30,5
3,26821,1995,Y02A 50,4
4,26821,1995,Y02A 90,5


In [36]:
# Save the per-(firm, period, y02) source counts to 'firmperiody02srcpathcnt.parquet'.
firmperiody02srcpathcnt.to_parquet('firmperiody02srcpathcnt.parquet')

In [37]:
# Inspection only: re-display the first rows of the path -> y02 link table.
clus2y02.head()

Unnamed: 0,path,y02,ccmt_patent_id,size,hhi,effhhi,rca,binrca
1,c0,Y02B 40,1,14,0.204082,4.9,1.021043,1
6,c0,Y02E 60,471,14,0.204082,4.9,9.649674,1
7,c0,Y02E 70,1,14,0.204082,4.9,1.111619,1
9,c0,Y02P 70,81,14,0.204082,4.9,1.384047,1
17,c1,Y02B 40,17,30,0.084444,11.842105,2.814342,1


In [38]:
# Inspection only: re-display the first rows of the firm / period / src RCA table.
firmperiodsrcrca.head()

Unnamed: 0,person_id,period,src,docdb_family_id,rca,binrca
0,26821,1995,s104,3,3.762498,1
1,26821,1995,s111,6,6.692257,1
2,26821,1995,s114,60,273.465079,1
3,26821,1995,s130,1,0.261601,0
4,26821,1995,s132,4,6.874619,1


In [39]:
# Link y02 labels to firms through sources only: each (path, y02) link is joined to
# every firm-held source (binrca == 1) that is linked to that path. Unlike
# firmperiody02srcpath, there is no requirement that the firm also holds the path.
# (clus2y02 was already restricted to binrca == 1 when it was loaded.)
firmperiody02src = (
    clus2y02.query('binrca==1')[["path", "y02"]]
    .drop_duplicates()
    .merge(
        firmperiodsrcrca.loc[firmperiodsrcrca["binrca"] == 1, ["person_id", "src", "period"]]
        .merge(cpc4dclus2[["src", "path"]].drop_duplicates(), on="src"),
        on="path",
    )
)
firmperiody02src.head()

Unnamed: 0,path,y02,person_id,src,period
0,c0,Y02B 40,26821,s114,1995
1,c0,Y02B 40,26821,s114,2005
2,c0,Y02B 40,43121,s114,0
3,c0,Y02B 40,88263,s114,2005
4,c0,Y02B 40,130552,s114,2005


In [40]:
# NOTE(review): this repeats the merge chain that builds firmperiody02srcpath
# earlier in the notebook, under a different name — confirm whether both are needed.
# Firm-held sources (binrca == 1) are expanded to their linked paths, restricted to
# paths the firm also holds as clusters (binrca >= 1), then joined to the firm's
# (path, y02) rows on the shared columns person_id, path and period.
firmperiody02clussrc = firmperiody02clus.merge(
    firmperiodsrcrca.loc[firmperiodsrcrca["binrca"] == 1, ["person_id", "src", "period"]]
    .merge(cpc4dclus2[["src", "path"]].drop_duplicates(), on="src")
    .merge(
        firmperiodclusrca.query("binrca>=1")[["person_id", "clus", "period"]]
        .drop_duplicates()
        .rename(columns={"clus": "path"})
    )
)
firmperiody02clussrc.head()

Unnamed: 0,person_id,path,period,y02,src
0,26821,c10,1995,Y02B 50,s104
1,26821,c10,1995,Y02B 50,s165
2,26821,c10,1995,Y02B 50,s29
3,26821,c10,1995,Y02B 50,s38
4,26821,c10,1995,Y02T 10,s104


In [41]:
# Number of distinct sources per (firm, period, y02 label) in the source-only
# linkage; the count stays in the column named `src`.
firmperiody02srccnt = (
    firmperiody02src.groupby(["person_id", "period", "y02"])
    .agg(src=("src", "nunique"))
    .reset_index()
)
firmperiody02srccnt.head()

Unnamed: 0,person_id,period,y02,src
0,26821,1995,Y02A 10,8
1,26821,1995,Y02A 20,8
2,26821,1995,Y02A 30,9
3,26821,1995,Y02A 40,8
4,26821,1995,Y02A 50,8


In [42]:
# Save the per-(firm, period, y02) source counts to 'firmperiody02srccnt.parquet'.
firmperiody02srccnt.to_parquet('firmperiody02srccnt.parquet')