In [None]:
# >>> Path configuration (auto-inserted) >>>
from pathlib import Path
import os

PROJECT_ROOT = Path(os.getenv("GBB_PROJECT_ROOT", ".")).resolve()
DATA = PROJECT_ROOT / "data"
PATSTAT = PROJECT_ROOT / "patstat"
PATTEXT = PROJECT_ROOT / "patent_text"
SAMPLEDATA = PROJECT_ROOT / "sampledata"

# Fallback to sampledata if primary paths not present
if not PATSTAT.exists() and (SAMPLEDATA / "patstat").exists():
    PATSTAT = SAMPLEDATA / "patstat"
if not PATTEXT.exists() and (SAMPLEDATA / "patent_text").exists():
    PATTEXT = SAMPLEDATA / "patent_text"
if not DATA.exists() and (PROJECT_ROOT / "data").exists():
    DATA = PROJECT_ROOT / "data"

def P(*parts):
    return str(Path(*parts))
# <<< Path configuration (auto-inserted) <<<


In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from matplotlib import cm

In [None]:
## load association of source fields and gbbs
srcpath = pd.read_parquet(str(DATA / "srcpath33.parquet"))
srcpath = srcpath[(srcpath.binrca==1)].copy()
srcpath['divsrc'] = srcpath.groupby('src')['path'].transform('nunique')
srcpath['divpath'] = srcpath.groupby('path')['src'].transform('nunique')

In [None]:
## load gbbs
pathcpc = pd.read_parquet(str(DATA / "clusangle_outlier_hdbscan2.parquet"),columns=['clus','cpc']).query('clus!="c-1"').rename(columns={'clus':'path'})
pathcpc['cpc'] = pathcpc['cpc'].str.replace(' ','')
pathcpc.head()

Unnamed: 0,path,cpc
0,c10,A01B1/00
8,c10,A01B39/18
9,c10,A01B61/00
11,c10,A01B63/1013
12,c10,A01B69/008


In [None]:
## load source fields
srccpc = pd.read_parquet(str(DATA / "clusangle_outlier_hdbscan2_src2.parquet"),columns=['clus','cpc']).query('clus!="s-1"').rename(columns={'clus':'src'})
srccpc['cpc'] = srccpc['cpc'].str.replace(' ','')
srccpc.head()

Unnamed: 0,src,cpc
0,s112,A01B1/02
1,s112,A01B1/022
2,s112,A01B13/00
3,s112,A01B13/025
4,s112,A01B13/08


In [None]:
## load primary country of firms
firmctry = pd.read_parquet('firmctry.parquet')
firmctry.head()

In [None]:
## load firm-cpc for firms with at least 500 patent families in total
firmpatcpc2 = pd.read_parquet('firmpatcpc.parquet').query('totalpat>=500')
firmpatcpc2.head()

Unnamed: 0,psn_id,docdb_family_id,totalpat,cpc
0,22321642,8554171,19095,G06K7/0013
1,22321642,8554171,19095,G06K7/0021
2,22321642,8554171,19095,G06K7/0043
3,22321642,8554171,19095,G06K7/0069
4,22321642,8554171,19095,G06K19/07739


## best potential collaborator of firms

In [None]:
## infer demand of GBBs from sources fields of firms
firmpathdemand = firmpatcpc2.merge(firmctry[firmctry.psn_sector.str.contains('COMPANY')][['psn_id']].drop_duplicates()).merge(srccpc).merge(srcpath[['src','path']]).groupby(['path','psn_id'])['docdb_family_id'].nunique().reset_index()
firmpathdemand['rca'] = firmpathdemand.docdb_family_id*firmpathdemand.docdb_family_id.sum()/firmpathdemand.groupby('path')['docdb_family_id'].transform('sum')/firmpathdemand.groupby('psn_id')['docdb_family_id'].transform('sum')
firmpathdemand.sort_values('rca',ascending=False).head(10)

Unnamed: 0,path,psn_id,docdb_family_id,rca
184351,c4,23043472,111,30.089194
182134,c4,3660408,536,22.507143
184625,c4,26422140,9,22.493706
182420,c4,5225004,104,21.953316
300238,c61,2640557,50,21.287143
361582,c72,2640557,50,20.942009
184146,c4,22186153,21,20.663509
300245,c61,2807687,120,20.502151
302359,c61,27550284,91,20.332996
361589,c72,2807687,120,20.169744


In [None]:
## The supply of GBBs of firms
firmpathsupply = firmpatcpc2.merge(pathcpc).groupby(['path','psn_id'])['docdb_family_id'].nunique().reset_index()
firmpathsupply['rca'] = firmpathsupply.docdb_family_id*firmpathsupply.docdb_family_id.sum()/firmpathsupply.groupby('path')['docdb_family_id'].transform('sum')/firmpathsupply.groupby('psn_id')['docdb_family_id'].transform('sum')
firmpathsupply.sort_values('rca',ascending=False).head(10)

Unnamed: 0,path,psn_id,docdb_family_id,rca
105956,c48,26643050,16,674.944936
79363,c35,6572778,3,642.317169
89119,c39,14136083,1,624.830947
89177,c39,15817930,2,624.830947
105678,c48,19751647,6,569.484789
160130,c7,22191828,205,498.519363
121085,c54,13177520,2,497.3049
96454,c43,12747199,146,494.786977
105957,c48,26644160,5,474.570658
105848,c48,23282717,8,433.893173


In [None]:
## Use RCA>=1 to filter significantly overrepresented GBBs
firmpathsupply['binrca'] = np.where((firmpathsupply['rca']>=1)&(firmpathsupply['docdb_family_id']>10),1,0)
firmpathdemand['binrca'] = np.where((firmpathdemand['rca']>=1)&(firmpathdemand['docdb_family_id']>10),1,0)

In [None]:
## Turn df into matrix for easier calc
demandmat = firmpathdemand.pivot(index='psn_id',columns='path',values='binrca').fillna(0)
supplymat = firmpathsupply.pivot(index='psn_id',columns='path',values='binrca').fillna(0)
supplymat2 = firmpathsupply.pivot(index='psn_id',columns='path',values='docdb_family_id').fillna(0)
demandmat.shape,supplymat.shape,supplymat2.shape

((5940, 82), (7179, 82), (7179, 82))

In [None]:
## reshape to the same form
idx = pd.Index(set(demandmat.index.tolist()).union(set(supplymat.index.tolist())))
demandmat = demandmat.reindex(index=idx,fill_value=0)
supplymat = supplymat.reindex(index=idx,fill_value=0)
supplymat2 = supplymat2.reindex(index=idx,fill_value=0)
demandmat.shape,supplymat.shape

((7190, 82), (7190, 82))

In [None]:
## demand of GBBs without firms's own supply
unmet = demandmat * (1 - supplymat)

In [None]:
## complementarity between firms
complement = supplymat @ unmet.T
complement2 = supplymat2 @ unmet.T

In [30]:
firmctry = firmctry.merge(firmpatcpc2[['psn_id','totalpat']].drop_duplicates(),how='left')
firmctry.head()

Unnamed: 0,psn_id,psn_name,psn_sector,ctry_code,totalpat
0,2,',COMPANY,RU,
1,17,' * CONPROJECT' HANDELSVERTRETUNG UND TECHN BU...,COMPANY,AT,
2,19,' 1C' [RU/RU],COMPANY,,
3,20,- A. VIGORELLI,COMPANY,IT,
4,21,"'' AEROSOLS ANTWERPIA'', ZOERSELBAAN, 25, WEST...",COMPANY,,


In [31]:
firmctry.shape,firmctry.psn_id.nunique()

((4434878, 5), 4434878)

In [None]:
## Long form match of supply and demand of GBB at firm level, with external country info
dfcomplement = (
    pd.melt(
        complement.reset_index(),
        id_vars="index",
        var_name="demand",
        value_name="n_gbb",
    ).rename(columns={'index':'supply'})
    .merge(pd.melt(
        np.log1p(complement2).reset_index(names='supply'),
        id_vars="supply",
        var_name="demand",
        value_name="log_n_pat",
    ))
    .merge(firmctry.rename(columns={"psn_id": 'supply', "psn_name": "supplyfirm","psn_sector":'supplytype','ctry_code':'supplyctry','totalpat':'supply_n_pat'}))
    .merge(firmctry.rename(columns={"psn_id": "demand", "psn_name": "demandfirm","psn_sector":'demandtype','ctry_code':'demandctry','totalpat':'demand_n_pat'}))
    .sort_values(['n_gbb','log_n_pat'],ascending=False)
)
dfcomplement.head()

Unnamed: 0,supply,demand,n_gbb,log_n_pat,supplyfirm,supplytype,supplyctry,supply_n_pat,demandfirm,demandtype,demandctry,demand_n_pat
43029140,2017893,20835176,36.0,7.502738,BATTELLE MEMORIAL INSTITUTE,GOV NON-PROFIT,US,3238.0,MITSUBISHI KASEI KOGYO,COMPANY,JP,1466.0
47661229,20834907,10679891,34.0,9.518854,MITSUBISHI HEAVY INDUSTRIES,COMPANY,JP,99855.0,GUTEHOFFNUNGSHUETTE OBERHAUSEN,COMPANY,DE,655.0
43025439,31492134,20835176,34.0,8.679992,UNION CARBIDE CORPORATION,COMPANY,US,22481.0,MITSUBISHI KASEI KOGYO,COMPANY,JP,1466.0
43030900,4614828,20835176,33.0,9.862926,CHINESE ACADEMY OF SCIENCES,GOV NON-PROFIT UNIVERSITY,CN,142927.0,MITSUBISHI KASEI KOGYO,COMPANY,JP,1466.0
3715980,4614828,22186127,32.0,10.022559,CHINESE ACADEMY OF SCIENCES,GOV NON-PROFIT UNIVERSITY,CN,142927.0,NIPPON CARBIDE KOGYO,COMPANY,JP,632.0


In [None]:
## Consider only firms for the dmeand side
dfcomplement = dfcomplement[dfcomplement.demandtype.str.contains('COMPANY')]
dfcomplement.shape

(42758930, 12)

In [45]:
dfcomplement.to_parquet('firm_complement.parquet',index=False)

In [None]:
## Sort by number of complement GBBs and total number of complementary patents, as the best potential collaborator
topcomplement = dfcomplement.sort_values(['n_gbb','log_n_pat'],ascending=False).groupby('demand').head(1)
topcomplement.head()

Unnamed: 0,supply,demand,n_gbb,log_n_pat,supplyfirm,supplytype,supplyctry,supply_n_pat,demandfirm,demandtype,demandctry,demand_n_pat
43029140,2017893,20835176,36.0,7.502738,BATTELLE MEMORIAL INSTITUTE,GOV NON-PROFIT,US,3238.0,MITSUBISHI KASEI KOGYO,COMPANY,JP,1466.0
47661229,20834907,10679891,34.0,9.518854,MITSUBISHI HEAVY INDUSTRIES,COMPANY,JP,99855.0,GUTEHOFFNUNGSHUETTE OBERHAUSEN,COMPANY,DE,655.0
3715980,4614828,22186127,32.0,10.022559,CHINESE ACADEMY OF SCIENCES,GOV NON-PROFIT UNIVERSITY,CN,142927.0,NIPPON CARBIDE KOGYO,COMPANY,JP,632.0
809460,2017893,23986621,32.0,7.458186,BATTELLE MEMORIAL INSTITUTE,GOV NON-PROFIT,US,3238.0,PHILIP MORRIS,COMPANY,US,1369.0
39687519,20834907,11527004,31.0,9.552226,MITSUBISHI HEAVY INDUSTRIES,COMPANY,JP,99855.0,HEFEI GENERAL MACHINERY RESEARCH INSTITUTE,COMPANY,CN,519.0


In [48]:
topcomplement[['supply','demand']].nunique()

supply     181
demand    5947
dtype: int64

In [None]:
## Aggregate to the type level
typecomplement = topcomplement.groupby('supplytype').agg({'supply':'count'}).reset_index().sort_values(['supply'],ascending=False)
typecomplement

Unnamed: 0,supplytype,supply
0,COMPANY,3967
3,GOV NON-PROFIT,836
4,GOV NON-PROFIT UNIVERSITY,621
5,UNIVERSITY,499
2,COMPANY UNIVERSITY,17
1,COMPANY GOV NON-PROFIT,7


In [None]:
## Aggregate to the country level
cntrycomplement = topcomplement.query('supplyctry!="  " and demandctry!="  "').groupby(['demandctry','supplyctry']).agg({'supply':'count','n_gbb':'mean','log_n_pat':'mean'}).reset_index().sort_values(['demandctry','supply','n_gbb','log_n_pat'],ascending=False)
cntrycomplement.head()

Unnamed: 0,demandctry,supplyctry,supply,n_gbb,log_n_pat
182,ZA,US,2,25.5,8.153242
181,ZA,KR,1,21.0,6.901737
175,US,JP,424,13.90566,8.62706
171,US,CN,290,16.131034,8.472486
180,US,US,171,18.076023,7.916348


In [51]:
cntrycomplement['ratio'] = cntrycomplement['supply']/cntrycomplement.groupby('demandctry')['supply'].transform('sum')
cntrycomplement.head()

Unnamed: 0,demandctry,supplyctry,supply,n_gbb,log_n_pat,ratio
182,ZA,US,2,25.5,8.153242,0.666667
181,ZA,KR,1,21.0,6.901737,0.333333
175,US,JP,424,13.90566,8.62706,0.385805
171,US,CN,290,16.131034,8.472486,0.263876
180,US,US,171,18.076023,7.916348,0.155596


In [52]:
cntrycomplement.query('demandctry=="DE"')

Unnamed: 0,demandctry,supplyctry,supply,n_gbb,log_n_pat,ratio
51,DE,JP,240,16.320833,8.894615,0.441176
47,DE,CN,103,17.029126,8.462634,0.189338
55,DE,US,72,18.222222,7.966936,0.132353
48,DE,DE,70,12.928571,9.388865,0.128676
52,DE,KR,27,15.37037,7.124268,0.049632
50,DE,GB,15,12.8,7.344841,0.027574
53,DE,NL,5,13.4,8.11929,0.009191
49,DE,FR,4,15.75,8.635108,0.007353
46,DE,CH,4,12.75,7.949041,0.007353
54,DE,TW,3,12.333333,8.377919,0.005515


In [53]:
cntrycomplement.query('supplyctry=="DE"')

Unnamed: 0,demandctry,supplyctry,supply,n_gbb,log_n_pat,ratio
172,US,DE,96,12.416667,9.336825,0.087352
163,TW,DE,13,11.692308,9.939829,0.095588
155,SU,DE,2,3.5,10.43727,0.153846
149,SE,DE,4,13.25,9.052748,0.085106
144,SA,DE,1,5.0,8.277412,0.333333
128,NL,DE,3,11.666667,9.133663,0.096774
126,MY,DE,1,13.0,10.553623,1.0
116,KR,DE,31,11.096774,9.564577,0.147619
105,JP,DE,249,13.506024,9.306542,0.161061
98,IT,DE,1,8.0,7.927685,0.025


In [54]:
cntrycomplement.query('demandctry=="US"')

Unnamed: 0,demandctry,supplyctry,supply,n_gbb,log_n_pat,ratio
175,US,JP,424,13.90566,8.62706,0.385805
171,US,CN,290,16.131034,8.472486,0.263876
180,US,US,171,18.076023,7.916348,0.155596
172,US,DE,96,12.416667,9.336825,0.087352
176,US,KR,54,13.944444,6.948261,0.049136
174,US,GB,19,14.368421,7.036053,0.017288
173,US,FR,17,14.058824,8.310157,0.015469
177,US,NL,12,10.25,8.372642,0.010919
169,US,AU,10,17.0,7.024092,0.009099
170,US,CH,4,15.25,8.232832,0.00364


In [55]:
cntrycomplement.query('demandctry=="CN"')

Unnamed: 0,demandctry,supplyctry,supply,n_gbb,log_n_pat,ratio
33,CN,JP,502,12.665339,8.205342,0.55531
30,CN,DE,123,12.495935,9.319574,0.136062
39,CN,US,106,18.698113,8.032475,0.117257
29,CN,CN,74,14.364865,8.092201,0.081858
34,CN,KR,53,13.962264,7.105711,0.058628
35,CN,NL,14,11.5,8.157095,0.015487
32,CN,GB,12,16.0,7.407478,0.013274
31,CN,FR,7,13.571429,8.351459,0.007743
28,CN,CH,5,16.4,8.264687,0.005531
38,CN,TW,5,12.0,8.285671,0.005531


In [56]:
cntrycomplement.to_parquet('cntrycomplement.parquet',index=False)

## Prepare data for regression of copatenting

In [None]:
df_copat  = pd.read_parquet('firm_cogreenpat.parquet').query('supply!=demand').assign(copatgreen=1)
df_copat.head()

Unnamed: 0,supply,demand,docdb_family_id,copatgreen
1,42082,11898188,1,1
2,42082,12487574,1,1
4,43121,43160,21,1
5,43121,7006447,1,1
6,43121,9392577,1,1


In [58]:
df_reg = dfcomplement.merge(df_copat[['supply','demand','copatgreen']],how='left')
df_reg['copatgreen'].fillna(0,inplace=True)
df_reg['samecntry'] = np.where(df_reg.supplyctry==df_reg.demandctry,1,0)
df_reg['sametype'] = np.where(df_reg.supplytype.str.contains('COMPANY'),1,0)
df_reg.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_reg['copatgreen'].fillna(0,inplace=True)


Unnamed: 0,supply,demand,n_gbb,log_n_pat,supplyfirm,supplytype,supplyctry,supply_n_pat,demandfirm,demandtype,demandctry,demand_n_pat,copatgreen,samecntry,sametype
0,2017893,20835176,36.0,7.502738,BATTELLE MEMORIAL INSTITUTE,GOV NON-PROFIT,US,3238.0,MITSUBISHI KASEI KOGYO,COMPANY,JP,1466.0,0.0,0,0
1,20834907,10679891,34.0,9.518854,MITSUBISHI HEAVY INDUSTRIES,COMPANY,JP,99855.0,GUTEHOFFNUNGSHUETTE OBERHAUSEN,COMPANY,DE,655.0,0.0,0,1
2,31492134,20835176,34.0,8.679992,UNION CARBIDE CORPORATION,COMPANY,US,22481.0,MITSUBISHI KASEI KOGYO,COMPANY,JP,1466.0,0.0,0,1
3,4614828,20835176,33.0,9.862926,CHINESE ACADEMY OF SCIENCES,GOV NON-PROFIT UNIVERSITY,CN,142927.0,MITSUBISHI KASEI KOGYO,COMPANY,JP,1466.0,0.0,0,0
4,4614828,22186127,32.0,10.022559,CHINESE ACADEMY OF SCIENCES,GOV NON-PROFIT UNIVERSITY,CN,142927.0,NIPPON CARBIDE KOGYO,COMPANY,JP,632.0,0.0,0,0


In [59]:
df_reg['log_supply_totalpat'] = np.log1p(df_reg.supply_n_pat)
df_reg['log_demand_totalpat'] = np.log1p(df_reg.demand_n_pat)
df_reg['copatgreen'] = df_reg['copatgreen'].astype(int)

In [60]:
df_reg.to_parquet('complement_reg.parquet',index=False)

In [62]:
df_reg.shape,df_reg[['supply','demand']].nunique(),df_reg.copatgreen.sum()

((42758930, 17),
 supply    7190
 demand    5947
 dtype: int64,
 np.int64(25525))