In [1]:
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import re
import json
import pickle
import fastparquet
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2
import logging
import itertools

  from pandas.core import datetools


- Based on: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/3a-RunReg-1016.ipynb
- Regression models in: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/2-RegPrep-1016.ipynb

In [12]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)
logger.addHandler(logging.FileHandler('Logs/reg_{0}.log'.format(datetime.datetime.now().\
                                                            strftime("%Y-%m-%d"), 'a')))
print = logging.info
print('good day to you madam fiona')
print('started')
print(datetime.datetime.now())

INFO:root:good day to you madam fiona
INFO:root:started
INFO:root:2018-11-27 16:07:05.027461


In [13]:
pathdir = "DataStore/2018-11/Reg1127/"
reg_f = "reg_model_1127.pkl"

regs = pickle.load(open(pathdir+reg_f, "rb"))

In [14]:
# Add list of equations
# Only normed values
regs["model_names"] = regs["model_names"].loc[regs["model_names"].apply(lambda x: x[:2] == "N ")]
# naics_ind = regs["model_names"].loc[regs["model_names"].apply(lambda x: ("Sim PC" in x) or ("Int PC" in x))].index.tolist()
naics_ind = regs["model_names"].index.tolist()
print(len(naics_ind))

regs[("docvecs", "naics_name")] = regs["docvecs"].loc[naics_ind]
# regs[("ldavecs", "naics_name")] = regs["ldavecs"].loc[naics_ind]
# for c in ["num_common_cited", "tp_pct_common_cited"]:
#     regs[(c, "naics_name")] = regs[c].loc[regs[c].index.isin(naics_ind)]

# PC indices
# Exclude Sim PC and Primclass Match
pc_ind = regs["model_names"].loc[regs["model_names"].apply(lambda x: (x[:-2] != "PC") and ("PC M-" not in x))].index.tolist()
print(len(pc_ind))

regs[("docvecs", "primclass")] = regs["docvecs"].loc[regs["docvecs"].index.isin(pc_ind)].dropna()
# regs[("ldavecs", "primclass")] = regs["ldavecs"].loc[regs["ldavecs"].index.isin(pc_ind)].dropna()
# Use whatever is available for num_common_cited
# for c in ["num_common_cited", "tp_pct_common_cited"]:
#     regs[(c, "primclass")] = regs[c].loc[regs[c].index.isin(pc_ind)]

# Where files are located
files = {"naics_name": "DataStore/2018-11/naics_name_sim_claims_1120.parq",
         "primclass": "DataStore/2018-11/primclass_sim_claims_1120.parq",
}


INFO:root:15
INFO:root:14


In [15]:
regs[("docvecs", "primclass")].sample(5)

16    norm_sim_claims_docvecs ~ C(inv_msa_match) + C...
27    norm_sim_claims_docvecs ~ C(primclass_match) +...
19    norm_sim_claims_docvecs ~ C(inv_msa_match) + C...
17    norm_sim_claims_docvecs ~ C(inv_msa_match) + C...
28    norm_sim_claims_docvecs ~ C(common_npc_match) ...
dtype: object

In [16]:

info_dict = {'$N$':lambda x: "{0:d}".format(int(x.nobs)),
'Adjusted $R^2$':lambda x: "{:.2f}".format(x.rsquared_adj)}
def get_fit(formula, grouped_data, group_col, cov_type = "HC1", return_fit = False):
    summ = []
    tables = {}
    
    # If formula uses mean similarity, use grant year above 1980
    if "mean_sim_" in formula:
        grouped_data = grouped_data.loc[(grouped_data["tp_gyear"] >= 1980)]
    
    # Remove missing values used in formula
    col_used = re.findall('\((.*?)\)',formula)
    # Intersect with grouped_data columns
    col_used = list(set(col_used).intersection(set(list(grouped_data.columns))))
    print(col_used)
    
    grouped_data = grouped_data.dropna(how="any", subset=col_used).copy().reset_index(drop=True)
    
    # Get length of data
    print(("Length of data", len(grouped_data)))
                                    
    # Group and then get results
    grouped_data = grouped_data.groupby(group_col)
    
    for n,g in grouped_data:
        try:
            if cov_type == "HC1":
                fit = smf.ols(formula = formula, data = g, missing="drop").fit(cov_type="HC1")
            else:
                # Drop missing in the grouped data first
                # Cluster uses primary class so must have primary class FE
                cols_used2 = list(set(col_used).intersection(set(list(grouped_data.columns))))+["tp_primclass_FE"]
                g = g.dropna(subset=cols_used2, how="any").copy().reset_index(drop=True)
                fit = smf.ols(formula = formula, data = g).fit(cov_type="cluster",
                                                                              cov_kwds={'groups': g["tp_primclass_FE"]})
            # Get results tables
            tables[n] = fit.summary2().tables
            # Append results
            summ.append(fit)
        except Exception as e:
            print(n)
            logging.exception("Regression error")
            pass
    # Get full results output
    # Dataframe of full results
    res_no_stars = summary2.summary_col(summ, stars = False, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict).tables[0]
    res_stars = summary2.summary_col(summ, stars = True, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict).tables[0]
    
    # Get partial results
    # 1. Get relevant variables from index of full results: UPDATED
    regressors = [v for v in res_no_stars.index.unique() if ("sim_" in v) | ("match" in v) | ("common_" in v)]
    # 2. Make sure regressors come last
    regressors = regressors+["Intercept"]
    # 3. Get results with regressors
    part_res_no_stars = summary2.summary_col(summ, stars = False, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict, regressor_order = regressors).tables[0]
    part_res_stars = summary2.summary_col(summ, stars = True, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict, regressor_order = regressors).tables[0]
    
    # 4. Get index of where Intercept is and add 2 (to include standard error)
    last_ind = list(part_res_stars.index).index("Intercept")+2
    
    # 5. Get partial results
    part_res_no_stars = pd.concat([part_res_no_stars.iloc[:last_ind], part_res_no_stars.iloc[-2::]])
    part_res_stars = pd.concat([part_res_stars.iloc[:last_ind], part_res_stars.iloc[-2::]])
    
    if return_fit == True:
        return summ, tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars
    else:
        return tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars
    
def get_results(sample, ks, cov = "HC1"):
    rs = fastparquet.ParquetFile(files[sample]).to_pandas()
    try:
        rs = rs.drop(['mean_sim_ldavecs_pc_msa_v',
           'mean_sim_docvecs_pc_msa_v', 'mean_sim_docvecs_pc_v', 'mean_sim_ldavecs_pc_v',
           'norm_mean_sim_ldavecs_pc_msa_v', 'norm_mean_sim_docvecs_pc_msa_v',
           'norm_mean_sim_docvecs_pc_v', 'norm_mean_sim_ldavecs_pc_v'],1)
        
    except:
        pass
    print(rs.columns)
    print(len(rs))
    
    # Drop extreme values
#     print(len(rs))
#     rs = rs.loc[((rs["norm_mean_sim_docvecs_pc_msa"] >= -5) & (rs["norm_mean_sim_docvecs_pc_msa"] <= 5)) | 
#                (rs["norm_mean_sim_docvecs_pc_msa"].isnull())]
#     rs = rs.loc[((rs["norm_mean_sim_docvecs_pc"] >= -5) & (rs["norm_mean_sim_docvecs_pc"] <= 5)) |
#                (rs["norm_mean_sim_docvecs_pc"].isnull())]
#     print(len(rs))
    
    print((ks, sample, "started"))
    print(datetime.datetime.now())
    # 3. Define output
    samp_out = {}
    formulas = list(regs[(ks, sample)])
    formulas_ind = list(regs[(ks, sample)].index)
    for i, j in zip(formulas_ind, formulas):
        print((ks, sample, i, j))
        print(datetime.datetime.now())
        try:
            out = get_fit(j, rs, "year_group", cov, return_fit = False)
            samp_out[i] = {}
            samp_out[i]["model"] = j
            samp_out[i]["tables"] = out[0]
            samp_out[i]["res_no_stars"] = out[1]
            samp_out[i]["res_stars"] = out[2]
            samp_out[i]["part_res_no_stars"] = out[3]
            samp_out[i]["part_res_stars"] = out[4]
        except Exception as e:
            logging.exception("error here")
            pass
        print("finished")
        print(datetime.datetime.now())

    # Define outfile
    o_f = "reg_{0}_{1}_out_{2}_1127.pkl".format(ks, sample, cov)
    pickle.dump(samp_out, open(pathdir+o_f, "wb")) 

In [None]:
samples_l = ["naics_name", "primclass"]
# ks_l = ["tp_pct_common_cited", "docvecs", "ldavecs"]
ks_l = ["docvecs"]

for sample, ks in itertools.product(samples_l, ks_l):
    try:
#         get_results(sample, ks, cov="cluster")
        get_results(sample, ks, cov="HC1")
    except Exception as e:
        logging.exception("error here")
        pass

INFO:root:Index(['tp', 'op', 'sim_claims_docvecs', 'norm_sim_claims_docvecs',
       'sim_docvecs', 'sim_ldavecs', 'tp_gyear', 'tp_naics_name',
       'op_naics_name', 'op_primclass', 'op_inv_msa', 'inv_msa_match',
       'primclass_match', 'norm_sim_ldavecs', 'norm_sim_docvecs', 'year_group',
       'common_est_inv', 'common_pat_inv', 'lawyer_match', 'num_common_cited',
       'norm_num_common_cited', 'tp_pct_common_cited',
       'norm_tp_pct_common_cited', 'common_cited_match', 'common_npc_match',
       'mean_sim_docvecs_pc', 'mean_sim_docvecs_pc_msa',
       'norm_mean_sim_docvecs_pc', 'norm_mean_sim_docvecs_pc_msa',
       'num_common_npc', 'norm_num_common_npc', 'tp_primclass_FE',
       'tp_inv_msa_FE', 'tp_examiner_FE', 'tp_lawyer_FE', 'examiner_match'],
      dtype='object')
INFO:root:1498184
INFO:root:('docvecs', 'naics_name', 'started')
INFO:root:2018-11-27 16:07:17.559022
INFO:root:('docvecs', 'naics_name', 15, 'norm_sim_claims_docvecs ~ C(inv_msa_match) + C(tp_gyear) + C(

________
### Results
#### HC1

In [14]:
tab = {}
samples_l = ["naics_name", "primclass"]
# samples_l = ["naics_name"]
ks_l = ["tp_pct_common_cited", "docvecs", "ldavecs", ]
# ks_l = ["tp_pct_common_cited", "docvecs"]

for sample, ks in itertools.product(samples_l, ks_l):
    o_f = "reg_{0}_{1}_out_HC1_1016.pkl".format(ks, sample)

    res = pickle.load(open(pathdir+o_f, "rb"))

    res_out = pd.DataFrame()

    for k in res.keys():
        lks = res[k]["model"].split(" ~ ")[0]

        # Selecting portion of results without intercept
        cdf = res[k]["part_res_stars"].reset_index()
        ic_ind = cdf.loc[cdf["index"] == "Intercept"].index[0]
        # Include N & R^2
        cdf = cdf.iloc[pd.np.r_[0:ic_ind,ic_ind+2:len(cdf)]]
        cdf["Model"] = regs["model_names"][k]
        cdf["Model Num"] = k
        cdf["LKS"] = lks

        res_out = res_out.append(cdf)

    tab[ks, sample] = res_out
    del(res_out)
        
full_tab = tab

In [15]:
r = pd.DataFrame()
for k,v in tab.items():
    print(k)
    display(v)
    v["samp"] = str(k)
    v = v.reset_index(drop=True)
    r = pd.concat([r,v],axis=0)

INFO:root:('tp_pct_common_cited', 'naics_name')


Unnamed: 0,1975-85,1985-95,1995-05,2005-15,LKS,Model,Model Num,index
0,0.0321***,0.0737***,0.0802***,0.0689***,norm_tp_pct_common_cited,N PC FE-Year FE,60,C(inv_msa_match)[T.True]
1,(0.0056),(0.0075),(0.0064),(0.0050),norm_tp_pct_common_cited,N PC FE-Year FE,60,
4,188668,273463,384536,548061,norm_tp_pct_common_cited,N PC FE-Year FE,60,$N$
5,0.00,0.00,0.00,0.00,norm_tp_pct_common_cited,N PC FE-Year FE,60,Adjusted $R^2$
0,0.0345***,0.0693***,0.0742***,0.0626***,norm_tp_pct_common_cited,N PC FE-Sim PC,61,C(inv_msa_match)[T.True]
1,(0.0083),(0.0072),(0.0061),(0.0048),norm_tp_pct_common_cited,N PC FE-Sim PC,61,
2,0.0370***,0.0637***,0.0679***,0.0735***,norm_tp_pct_common_cited,N PC FE-Sim PC,61,norm_mean_sim_docvecs_pc
3,(0.0074),(0.0063),(0.0048),(0.0043),norm_tp_pct_common_cited,N PC FE-Sim PC,61,
6,100251,273455,384521,548046,norm_tp_pct_common_cited,N PC FE-Sim PC,61,$N$
7,0.00,0.00,0.00,0.00,norm_tp_pct_common_cited,N PC FE-Sim PC,61,Adjusted $R^2$


INFO:root:('docvecs', 'naics_name')


Unnamed: 0,1975-85,1985-95,1995-05,2005-15,LKS,Model,Model Num,index
0,0.0309***,0.0585***,0.0596***,0.0555***,norm_sim_docvecs,N PC FE-Year FE,60,C(inv_msa_match)[T.True]
1,(0.0053),(0.0043),(0.0034),(0.0030),norm_sim_docvecs,N PC FE-Year FE,60,
4,192773,280962,437405,563881,norm_sim_docvecs,N PC FE-Year FE,60,$N$
5,0.06,0.07,0.08,0.05,norm_sim_docvecs,N PC FE-Year FE,60,Adjusted $R^2$
0,0.0293***,0.0375***,0.0335***,0.0300***,norm_sim_docvecs,N PC FE-Sim PC,61,C(inv_msa_match)[T.True]
1,(0.0070),(0.0041),(0.0033),(0.0029),norm_sim_docvecs,N PC FE-Sim PC,61,
2,0.2899***,0.3015***,0.2966***,0.3018***,norm_sim_docvecs,N PC FE-Sim PC,61,norm_mean_sim_docvecs_pc
3,(0.0044),(0.0027),(0.0022),(0.0021),norm_sim_docvecs,N PC FE-Sim PC,61,
6,102330,280954,437386,563865,norm_sim_docvecs,N PC FE-Sim PC,61,$N$
7,0.11,0.11,0.12,0.08,norm_sim_docvecs,N PC FE-Sim PC,61,Adjusted $R^2$


INFO:root:('ldavecs', 'naics_name')


Unnamed: 0,1975-85,1985-95,1995-05,2005-15,LKS,Model,Model Num,index
0,0.0451***,0.0596***,0.0964***,0.0760***,norm_sim_ldavecs,N PC FE-Year FE,60,C(inv_msa_match)[T.True]
1,(0.0050),(0.0041),(0.0033),(0.0030),norm_sim_ldavecs,N PC FE-Year FE,60,
4,193104,281358,438028,564393,norm_sim_ldavecs,N PC FE-Year FE,60,$N$
5,0.10,0.09,0.12,0.14,norm_sim_ldavecs,N PC FE-Year FE,60,Adjusted $R^2$
0,0.0298***,0.0226***,0.0458***,0.0270***,norm_sim_ldavecs,N PC FE-Sim PC,61,C(inv_msa_match)[T.True]
1,(0.0061),(0.0037),(0.0030),(0.0027),norm_sim_ldavecs,N PC FE-Sim PC,61,
2,0.4647***,0.4555***,0.4627***,0.4597***,norm_sim_ldavecs,N PC FE-Sim PC,61,norm_mean_sim_ldavecs_pc
3,(0.0032),(0.0020),(0.0016),(0.0014),norm_sim_ldavecs,N PC FE-Sim PC,61,
6,102459,281350,438009,564377,norm_sim_ldavecs,N PC FE-Sim PC,61,$N$
7,0.27,0.26,0.29,0.30,norm_sim_ldavecs,N PC FE-Sim PC,61,Adjusted $R^2$


INFO:root:('tp_pct_common_cited', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.1163***,0.1693***,0.1512***,0.1360***,N PC FE-Year FE,60,norm_tp_pct_common_cited
1,,(0.0097),(0.0095),(0.0070),(0.0057),N PC FE-Year FE,60,norm_tp_pct_common_cited
4,$N$,166703,244415,351637,504227,N PC FE-Year FE,60,norm_tp_pct_common_cited
5,Adjusted $R^2$,0.01,0.01,0.01,0.01,N PC FE-Year FE,60,norm_tp_pct_common_cited
0,C(inv_msa_match)[T.True],0.1222***,0.1693***,0.1512***,0.1360***,N PC FE-Sim PC,61,norm_tp_pct_common_cited
1,,(0.0142),(0.0095),(0.0070),(0.0057),N PC FE-Sim PC,61,norm_tp_pct_common_cited
2,norm_mean_sim_docvecs_pc,0.0378***,0.0302**,0.0231*,-0.0106,N PC FE-Sim PC,61,norm_tp_pct_common_cited
3,,(0.0136),(0.0122),(0.0122),(0.0141),N PC FE-Sim PC,61,norm_tp_pct_common_cited
6,$N$,89062,244412,351637,504227,N PC FE-Sim PC,61,norm_tp_pct_common_cited
7,Adjusted $R^2$,0.01,0.01,0.01,0.01,N PC FE-Sim PC,61,norm_tp_pct_common_cited


INFO:root:('docvecs', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0525***,0.0799***,0.0774***,0.0571***,N PC FE-Year FE,60,norm_sim_docvecs
1,,(0.0064),(0.0051),(0.0038),(0.0032),N PC FE-Year FE,60,norm_sim_docvecs
4,$N$,170564,251218,400729,518334,N PC FE-Year FE,60,norm_sim_docvecs
5,Adjusted $R^2$,0.06,0.07,0.07,0.05,N PC FE-Year FE,60,norm_sim_docvecs
0,C(inv_msa_match)[T.True],0.0559***,0.0800***,0.0775***,0.0568***,N PC FE-Sim PC,61,norm_sim_docvecs
1,,(0.0089),(0.0051),(0.0038),(0.0032),N PC FE-Sim PC,61,norm_sim_docvecs
2,norm_mean_sim_docvecs_pc,0.2456***,0.1795***,0.1702***,0.1592***,N PC FE-Sim PC,61,norm_sim_docvecs
3,,(0.0129),(0.0078),(0.0076),(0.0085),N PC FE-Sim PC,61,norm_sim_docvecs
6,$N$,90998,251215,400729,518334,N PC FE-Sim PC,61,norm_sim_docvecs
7,Adjusted $R^2$,0.07,0.07,0.08,0.06,N PC FE-Sim PC,61,norm_sim_docvecs


INFO:root:('ldavecs', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0547***,0.0631***,0.0612***,0.0427***,N PC FE-Year FE,60,norm_sim_ldavecs
1,,(0.0057),(0.0046),(0.0036),(0.0031),N PC FE-Year FE,60,norm_sim_ldavecs
4,$N$,170780,251507,401106,518628,N PC FE-Year FE,60,norm_sim_ldavecs
5,Adjusted $R^2$,0.17,0.17,0.19,0.19,N PC FE-Year FE,60,norm_sim_ldavecs
0,C(inv_msa_match)[T.True],0.0556***,0.0637***,0.0619***,0.0422***,N PC FE-Sim PC,61,norm_sim_ldavecs
1,,(0.0078),(0.0046),(0.0035),(0.0031),N PC FE-Sim PC,61,norm_sim_ldavecs
2,norm_mean_sim_ldavecs_pc,0.3786***,0.3218***,0.3445***,0.2763***,N PC FE-Sim PC,61,norm_sim_ldavecs
3,,(0.0106),(0.0072),(0.0070),(0.0074),N PC FE-Sim PC,61,norm_sim_ldavecs
6,$N$,91102,251504,401106,518628,N PC FE-Sim PC,61,norm_sim_ldavecs
7,Adjusted $R^2$,0.19,0.18,0.20,0.19,N PC FE-Sim PC,61,norm_sim_ldavecs


In [16]:
r.to_csv(pathdir+"reg_pairs_out_HC1_1016.csv")

#### Clustered

In [37]:
tab = {}
samples_l = ["naics_name", "primclass"]
# samples_l = ["naics_name"]
ks_l = ["tp_pct_common_cited", "docvecs", "ldavecs", ]
# ks_l = ["docvecs"]

for sample, ks in itertools.product(samples_l, ks_l):
        o_f = "reg_{0}_{1}_out_cluster_1016.pkl".format(ks, sample)
        
        res = pickle.load(open(pathdir+o_f, "rb"))

        res_out = pd.DataFrame()

        for k in res.keys():
            lks = res[k]["model"].split(" ~ ")[0]
            
            # Selecting portion of results without intercept
            cdf = res[k]["part_res_stars"].reset_index()
            ic_ind = cdf.loc[cdf["index"] == "Intercept"].index[0]
            # Include N & R^2
            cdf = cdf.iloc[pd.np.r_[0:ic_ind,ic_ind+2:len(cdf)]]
            cdf["Model"] = regs["model_names"][k]
            cdf["Model Num"] = k
            cdf["LKS"] = lks
        
            res_out = res_out.append(cdf)
        
        tab[ks, sample] = res_out
        del(res_out)
        
full_tab = tab

In [38]:
r = pd.DataFrame()
for k,v in tab.items():
    print(k)
    display(v)
    v["samp"] = str(k)
    v = v.reset_index(drop=True)
    r = pd.concat([r,v],axis=0)

INFO:root:('tp_pct_common_cited', 'naics_name')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0000,0.0004***,0.0006***,0.0005***,PC FE-Year FE,0,tp_pct_common_cited
1,,(0.0001),(0.0001),(0.0001),(0.0001),PC FE-Year FE,0,tp_pct_common_cited
4,$N$,38445,105898,185859,327759,PC FE-Year FE,0,tp_pct_common_cited
5,Adjusted $R^2$,0.00,0.00,0.00,0.00,PC FE-Year FE,0,tp_pct_common_cited
0,C(common_pat_inv)[T.True],0.0565,0.0392*,0.0693***,0.0911***,PC FE-Inv M,1,tp_pct_common_cited
1,,(0.0415),(0.0229),(0.0180),(0.0237),PC FE-Inv M,1,tp_pct_common_cited
4,$N$,38445,105898,185859,327759,PC FE-Inv M,1,tp_pct_common_cited
5,Adjusted $R^2$,0.01,0.01,0.03,0.04,PC FE-Inv M,1,tp_pct_common_cited
0,C(lawyer_match)[T.True],0.0054,0.0075***,0.0097***,0.0064***,PC FE-Lawyer M,2,tp_pct_common_cited
1,,(0.0045),(0.0029),(0.0030),(0.0020),PC FE-Lawyer M,2,tp_pct_common_cited


INFO:root:('docvecs', 'naics_name')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0057***,0.0109***,0.0093***,0.0082***,PC FE-Year FE,0,sim_docvecs
1,,(0.0018),(0.0011),(0.0009),(0.0007),PC FE-Year FE,0,sim_docvecs
4,$N$,38445,105898,185859,327759,PC FE-Year FE,0,sim_docvecs
5,Adjusted $R^2$,0.08,0.08,0.09,0.05,PC FE-Year FE,0,sim_docvecs
0,C(common_pat_inv)[T.True],0.0934**,0.1441***,0.1353***,0.1401***,PC FE-Inv M,1,sim_docvecs
1,,(0.0417),(0.0178),(0.0145),(0.0131),PC FE-Inv M,1,sim_docvecs
4,$N$,38445,105898,185859,327759,PC FE-Inv M,1,sim_docvecs
5,Adjusted $R^2$,0.08,0.08,0.08,0.05,PC FE-Inv M,1,sim_docvecs
0,C(lawyer_match)[T.True],0.0141,0.0516***,0.0538***,0.0408***,PC FE-Lawyer M,2,sim_docvecs
1,,(0.0157),(0.0072),(0.0070),(0.0071),PC FE-Lawyer M,2,sim_docvecs


INFO:root:('ldavecs', 'naics_name')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0132***,0.0176***,0.0243***,0.0179***,PC FE-Year FE,0,sim_ldavecs
1,,(0.0032),(0.0019),(0.0017),(0.0020),PC FE-Year FE,0,sim_ldavecs
4,$N$,38445,105898,185859,327759,PC FE-Year FE,0,sim_ldavecs
5,Adjusted $R^2$,0.14,0.12,0.12,0.14,PC FE-Year FE,0,sim_ldavecs
0,C(common_pat_inv)[T.True],0.2204**,0.2625***,0.2570***,0.2765***,PC FE-Inv M,1,sim_ldavecs
1,,(0.0867),(0.0346),(0.0246),(0.0219),PC FE-Inv M,1,sim_ldavecs
4,$N$,38445,105898,185859,327759,PC FE-Inv M,1,sim_ldavecs
5,Adjusted $R^2$,0.14,0.12,0.12,0.14,PC FE-Inv M,1,sim_ldavecs
0,C(lawyer_match)[T.True],0.0788***,0.1118***,0.0800***,0.0624***,PC FE-Lawyer M,2,sim_ldavecs
1,,(0.0258),(0.0134),(0.0094),(0.0102),PC FE-Lawyer M,2,sim_ldavecs


INFO:root:('tp_pct_common_cited', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0014***,0.0019***,0.0019***,0.0016***,PC FE-Year FE,0,tp_pct_common_cited
1,,(0.0005),(0.0003),(0.0002),(0.0002),PC FE-Year FE,0,tp_pct_common_cited
4,$N$,36511,101363,176538,308704,PC FE-Year FE,0,tp_pct_common_cited
5,Adjusted $R^2$,0.00,0.01,0.01,0.01,PC FE-Year FE,0,tp_pct_common_cited
0,C(common_pat_inv)[T.True],0.0788**,0.0794***,0.0993***,0.1295***,PC FE-Inv M,1,tp_pct_common_cited
1,,(0.0338),(0.0174),(0.0133),(0.0155),PC FE-Inv M,1,tp_pct_common_cited
4,$N$,36511,101363,176538,308704,PC FE-Inv M,1,tp_pct_common_cited
5,Adjusted $R^2$,0.01,0.02,0.04,0.06,PC FE-Inv M,1,tp_pct_common_cited
0,C(lawyer_match)[T.True],0.0292**,0.0219***,0.0241***,0.0327***,PC FE-Lawyer M,2,tp_pct_common_cited
1,,(0.0124),(0.0046),(0.0045),(0.0047),PC FE-Lawyer M,2,tp_pct_common_cited


INFO:root:('docvecs', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0066***,0.0100***,0.0108***,0.0072***,PC FE-Year FE,0,sim_docvecs
1,,(0.0023),(0.0013),(0.0010),(0.0008),PC FE-Year FE,0,sim_docvecs
4,$N$,36511,101363,176538,308704,PC FE-Year FE,0,sim_docvecs
5,Adjusted $R^2$,0.07,0.08,0.08,0.06,PC FE-Year FE,0,sim_docvecs
0,C(common_pat_inv)[T.True],0.1333***,0.1273***,0.1191***,0.1133***,PC FE-Inv M,1,sim_docvecs
1,,(0.0266),(0.0152),(0.0120),(0.0083),PC FE-Inv M,1,sim_docvecs
4,$N$,36511,101363,176538,308704,PC FE-Inv M,1,sim_docvecs
5,Adjusted $R^2$,0.07,0.08,0.08,0.06,PC FE-Inv M,1,sim_docvecs
0,C(lawyer_match)[T.True],0.0466***,0.0548***,0.0468***,0.0416***,PC FE-Lawyer M,2,sim_docvecs
1,,(0.0174),(0.0061),(0.0077),(0.0062),PC FE-Lawyer M,2,sim_docvecs


INFO:root:('ldavecs', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0096***,0.0133***,0.0120***,0.0088***,PC FE-Year FE,0,sim_ldavecs
1,,(0.0032),(0.0025),(0.0016),(0.0016),PC FE-Year FE,0,sim_ldavecs
4,$N$,36511,101363,176538,308704,PC FE-Year FE,0,sim_ldavecs
5,Adjusted $R^2$,0.21,0.20,0.21,0.19,PC FE-Year FE,0,sim_ldavecs
0,C(common_pat_inv)[T.True],0.1804***,0.1942***,0.1677***,0.1575***,PC FE-Inv M,1,sim_ldavecs
1,,(0.0322),(0.0179),(0.0140),(0.0134),PC FE-Inv M,1,sim_ldavecs
4,$N$,36511,101363,176538,308704,PC FE-Inv M,1,sim_ldavecs
5,Adjusted $R^2$,0.22,0.20,0.21,0.19,PC FE-Inv M,1,sim_ldavecs
0,C(lawyer_match)[T.True],0.0670***,0.0620***,0.0599***,0.0529***,PC FE-Lawyer M,2,sim_ldavecs
1,,(0.0165),(0.0107),(0.0106),(0.0087),PC FE-Lawyer M,2,sim_ldavecs


In [39]:
r.to_csv(pathdir+"reg_pairs_out_cluster_1016.csv")