In [None]:
# >>> Path configuration (auto-inserted) >>>
from pathlib import Path
import os

# Root of the project; overridable through the GBB_PROJECT_ROOT environment
# variable, otherwise the current working directory.
PROJECT_ROOT = Path(os.getenv("GBB_PROJECT_ROOT", ".")).resolve()
SAMPLEDATA = PROJECT_ROOT / "sampledata"


def first_existing(primary, fallback):
    """Return `primary`, unless it is missing while `fallback` exists.

    Parameters
    ----------
    primary : pathlib.Path
        Preferred location.
    fallback : pathlib.Path
        Location used only when `primary` does not exist and `fallback` does.

    Returns
    -------
    pathlib.Path
        `fallback` in the one case described above, otherwise `primary`
        (even when neither path exists).
    """
    if not primary.exists() and fallback.exists():
        return fallback
    return primary


# Fallback to sampledata if primary paths not present.
# The DATA check previously compared PROJECT_ROOT / "data" with itself, so it
# could never trigger; it now falls back to sampledata/data like the others.
DATA = first_existing(PROJECT_ROOT / "data", SAMPLEDATA / "data")
PATSTAT = first_existing(PROJECT_ROOT / "patstat", SAMPLEDATA / "patstat")
PATTEXT = first_existing(PROJECT_ROOT / "patent_text", SAMPLEDATA / "patent_text")

def P(*parts):
    """Join `parts` into a single filesystem path and return it as a string.

    The components are combined by `pathlib.Path`, so the result uses the
    platform's separator; calling with no arguments yields ".".
    """
    joined = Path(*parts)
    return str(joined)
# <<< Path configuration (auto-inserted) <<<


In [1]:
import pandas as pd
import numpy as np

In [2]:
# `src` counts per (person_id, period, y02), restricted to the 1995 period.
# Note the variable name says "cpc4dcnt" while the file read is the "srccnt" one.
firmperiody02cpc4dcnt = (
    pd.read_parquet('firmperiody02srccnt.parquet')
    .query("period==1995")
    .copy()
)
firmperiody02cpc4dcnt.head()

Unnamed: 0,person_id,period,y02,src
0,26821,1995,Y02A 10,8
1,26821,1995,Y02A 20,8
2,26821,1995,Y02A 30,9
3,26821,1995,Y02A 40,8
4,26821,1995,Y02A 50,8


In [3]:
# `path` counts per (person_id, period, y02), restricted to the 1995 period.
# Note the variable name says "cluscnt" while the file read is the "pathcnt" one.
firmperiody02cluscnt = (
    pd.read_parquet('firmperiody02pathcnt.parquet')
    .query("period==1995")
    .copy()
)
firmperiody02cluscnt.head()

Unnamed: 0,person_id,period,y02,path
0,26821,1995,Y02A 10,2
1,26821,1995,Y02A 20,1
2,26821,1995,Y02A 30,1
3,26821,1995,Y02A 50,1
4,26821,1995,Y02A 90,2


In [4]:
# Counts from the "srcpathcnt" file for 1995. Its `src` column is renamed to
# `relsrc` so it does not collide with the `src` column loaded from the
# "srccnt" file when the tables are merged later.
firmperiody02cluscpc4dcnt = (
    pd.read_parquet('firmperiody02srcpathcnt.parquet')
    .rename(columns={'src': 'relsrc'})
    .query("period==1995")
    .copy()
)
firmperiody02cluscpc4dcnt.head()

Unnamed: 0,person_id,period,y02,relsrc
0,26821,1995,Y02A 10,4
1,26821,1995,Y02A 20,4
2,26821,1995,Y02A 30,5
3,26821,1995,Y02A 50,4
4,26821,1995,Y02A 90,5


In [5]:
# Per-firm, per-class patent counts for every period.
firm_counts = pd.read_parquet("firmperiody02.parquet")

# Relabel each row with the period ten years earlier. After the merge the
# "_2" columns therefore carry the count recorded for `period + 10`.
later_counts = firm_counts.assign(period=firm_counts.period - 10)

# Outer merge keeps firm/class pairs present in only one of the two periods;
# the count missing on the other side becomes 0 after fillna.
firmperiody02 = (
    firm_counts.merge(
        later_counts,
        on=["person_id", "period", "y02"],
        how="outer",
        suffixes=["", "_2"],
    )
    .query("period==1995")
    .fillna(0)
)
firmperiody02.head()

Unnamed: 0,person_id,period,y02,num_pat_ccmt,num_pat_ccmt_2
13,42082,1995,Y02B 90,1.0,0.0
14,42082,1995,Y02D 10,4.0,0.0
15,42082,1995,Y02D 30,1.0,0.0
16,42082,1995,Y02D 50,20.0,0.0
17,42082,1995,Y02D 70,2.0,0.0


In [6]:
# One row per y02 class with its `docdb_family_id` count.
y02cnt = pd.read_parquet(path='y02cnt.parquet')
y02cnt.head()

Unnamed: 0,y02,docdb_family_id
0,Y02A 10,1526
1,Y02A 20,6939
2,Y02A 30,7708
3,Y02A 40,19550
4,Y02A 50,15620


In [7]:
# Firms observed in 1995 with wt of at least 10.
firm_sizes = pd.read_parquet('firm_period_patcount.parquet').query('period==1995 and wt>=10')

# Class-level totals, stamped with the same period so the frames share a column.
class_totals = y02cnt.rename(columns={'docdb_family_id': 'ccmtcnt'}).assign(period=1995)

# No key is passed, so the merge joins on the shared column names. In the
# displayed output that is only `period`, which is constant here, so each firm
# is paired with every y02 class.
df = firm_sizes.merge(class_totals)
df.head()

Unnamed: 0,person_id,period,wt,y02,ccmtcnt
0,26821,1995,88,Y02A 10,1526
1,26821,1995,88,Y02A 20,6939
2,26821,1995,88,Y02A 30,7708
3,26821,1995,88,Y02A 40,19550
4,26821,1995,88,Y02A 50,15620


In [8]:
# Number of distinct firms in the base frame.
df['person_id'].nunique()

12810

In [9]:
# Attach each count table in turn with a left merge. No keys are passed, so
# every merge joins on the column names the two frames share (person_id,
# period and y02 in the previews displayed above).
for counts in (
    firmperiody02cpc4dcnt,
    firmperiody02cluscnt,
    firmperiody02cluscpc4dcnt,
    firmperiody02,
):
    df = df.merge(counts, how='left')

# Drop rows lacking an identifying field, then treat any count that a left
# merge left missing as 0.
df = df.dropna(subset=['person_id', 'wt', 'period', 'y02']).fillna(0)
df.head()

Unnamed: 0,person_id,period,wt,y02,ccmtcnt,src,path,relsrc,num_pat_ccmt,num_pat_ccmt_2
0,26821,1995,88,Y02A 10,1526,8.0,2.0,4.0,0.0,0.0
1,26821,1995,88,Y02A 20,6939,8.0,1.0,4.0,0.0,0.0
2,26821,1995,88,Y02A 30,7708,9.0,1.0,5.0,0.0,0.0
3,26821,1995,88,Y02A 40,19550,8.0,0.0,0.0,0.0,0.0
4,26821,1995,88,Y02A 50,15620,8.0,1.0,4.0,0.0,0.0


In [10]:
# Frame dimensions alongside the number of distinct firms and y02 classes.
(
    df.shape,
    df[['person_id', 'y02']].nunique(),
)

((627690, 10),
 person_id    12810
 y02             49
 dtype: int64)

In [11]:
# Rows with a positive count in both num_pat_ccmt and num_pat_ccmt_2.
df.query("num_pat_ccmt > 0 and num_pat_ccmt_2 > 0").shape

(12438, 10)

In [12]:
# Rows whose num_pat_ccmt is zero.
df.query("num_pat_ccmt == 0").shape

(600571, 10)

In [13]:
# 0/1 indicators marking whether the corresponding count column is positive.
for flag, count_col in (
    ('haspath', 'path'),
    ('hassrc', 'src'),
    ('hasrelsrc', 'relsrc'),
):
    df[flag] = (df[count_col] > 0).astype(int)
df.head()

Unnamed: 0,person_id,period,wt,y02,ccmtcnt,src,path,relsrc,num_pat_ccmt,num_pat_ccmt_2,haspath,hassrc,hasrelsrc
0,26821,1995,88,Y02A 10,1526,8.0,2.0,4.0,0.0,0.0,1,1,1
1,26821,1995,88,Y02A 20,6939,8.0,1.0,4.0,0.0,0.0,1,1,1
2,26821,1995,88,Y02A 30,7708,9.0,1.0,5.0,0.0,0.0,1,1,1
3,26821,1995,88,Y02A 40,19550,8.0,0.0,0.0,0.0,0.0,0,1,0
4,26821,1995,88,Y02A 50,15620,8.0,1.0,4.0,0.0,0.0,1,1,1


In [14]:
# Natural log of num_pat_ccmt, with zero counts mapped to 0.
# np.where evaluates both of its branches, so calling np.log on the raw column
# took log(0) for every zero count and emitted a RuntimeWarning. Replacing the
# zeros with 1 before the log (log 1 = 0) gives the same values, warning-free.
# NOTE: a count of exactly 1 also maps to 0, exactly as it did before.
nonzero_counts = df['num_pat_ccmt'].where(df['num_pat_ccmt'] != 0, 1)
df['log_num_pat_ccmt'] = np.log(nonzero_counts)
df.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,person_id,period,wt,y02,ccmtcnt,src,path,relsrc,num_pat_ccmt,num_pat_ccmt_2,haspath,hassrc,hasrelsrc,log_num_pat_ccmt
0,26821,1995,88,Y02A 10,1526,8.0,2.0,4.0,0.0,0.0,1,1,1,0.0
1,26821,1995,88,Y02A 20,6939,8.0,1.0,4.0,0.0,0.0,1,1,1,0.0
2,26821,1995,88,Y02A 30,7708,9.0,1.0,5.0,0.0,0.0,1,1,1,0.0
3,26821,1995,88,Y02A 40,19550,8.0,0.0,0.0,0.0,0.0,0,1,0,0.0
4,26821,1995,88,Y02A 50,15620,8.0,1.0,4.0,0.0,0.0,1,1,1,0.0


In [15]:
# Persist the assembled frame; a later cell reads this file back.
df.to_parquet(path='firm_green_data.parquet')

In [None]:
# Histogram of log10(num_pat_ccmt_2), limited to rows where the count is positive.
positive_later_counts = df.loc[df.num_pat_ccmt_2 > 0, 'num_pat_ccmt_2']
np.log10(positive_later_counts).plot.hist()

In [17]:
# Disabled alternative starting point: reload the saved frame instead of
# rebuilding it in the cells above.
#   df = pd.read_parquet('firm_green_data.parquet').query('period==1995 and wt>=10')
# Dimensions of the frame currently in memory.
df.shape

(627690, 14)

In [18]:
# Distinct firms in the assembled frame.
df['person_id'].nunique()

12810

In [None]:
# Scatter of relsrc (vertical axis) against src (horizontal axis).
df.plot.scatter(x='src', y='relsrc')

In [20]:
# Summary statistics (count, mean, std, quantiles, extremes) for the numeric columns.
df.describe()

Unnamed: 0,person_id,period,wt,ccmtcnt,src,path,relsrc,num_pat_ccmt,num_pat_ccmt_2,haspath,hassrc,hasrelsrc,log_num_pat_ccmt
count,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0,627690.0
mean,17667940.0,1995.0,407.605386,7479.632653,6.16797,1.033708,3.281626,0.369037,0.652884,0.480449,0.87429,0.460398,0.03831
std,10245190.0,0.0,2876.830753,10159.319687,6.702848,1.568026,5.609748,12.207755,28.285111,0.499618,0.331523,0.49843,0.301277
min,26821.0,1995.0,10.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8373490.0,1995.0,35.0,1373.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,18884540.0,1995.0,74.0,4086.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,26960900.0,1995.0,168.0,7754.0,9.0,2.0,5.0,0.0,0.0,1.0,1.0,1.0,0.0
max,39713290.0,1995.0,135257.0,53226.0,70.0,20.0,66.0,5899.0,17330.0,1.0,1.0,1.0,8.682538


In [2]:
# Replace the in-memory frame with the contents of the saved parquet file.
# NOTE(review): this cell carries execution count In [2], and the rows shown
# beneath it have src/relsrc values that differ from the frame built earlier
# in the notebook. The displayed output therefore came from a different
# version of the file. Restart and run all cells before trusting the exports
# that follow.
df = pd.read_parquet(path='firm_green_data.parquet')
df.head()

Unnamed: 0,person_id,period,wt,y02,ccmtcnt,src,path,relsrc,num_pat_ccmt,num_pat_ccmt_2,haspath,hassrc,hasrelsrc,log_num_pat_ccmt
0,26821,1995,88,Y02A 10,1526,11.0,2.0,7.0,0.0,0.0,1,1,1,0.0
1,26821,1995,88,Y02A 20,6939,13.0,1.0,7.0,0.0,0.0,1,1,1,0.0
2,26821,1995,88,Y02A 30,7708,10.0,1.0,5.0,0.0,0.0,1,1,1,0.0
3,26821,1995,88,Y02A 40,19550,11.0,0.0,0.0,0.0,0.0,0,1,0,0.0
4,26821,1995,88,Y02A 50,15620,9.0,1.0,7.0,0.0,0.0,1,1,1,0.0


In [21]:
# Export the full frame for Stata.
# NOTE(review): to_stata also writes the DataFrame index as a variable unless
# write_index=False is passed; confirm that is wanted.
df.to_stata(path='firm_green_data.dta')

In [22]:
# Export the subset with no patents counted in num_pat_ccmt: 1995 rows with
# wt of at least 10 and a zero num_pat_ccmt.
entry_rows = df.query('period == 1995 and wt >= 10 and num_pat_ccmt == 0')
entry_rows.to_stata('firm_green_entry.dta')