In [1]:
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import re
import json
import pickle
import fastparquet
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2
import logging
import itertools

  from pandas.core import datetools


In this regression:
- Remove all primary class match observations and compare only similarity across patents from different technology fields

In [2]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)
logger.addHandler(logging.FileHandler('Logs/reg_{0}.log'.format(datetime.datetime.now().\
                                                            strftime("%Y-%m-%d"), 'a')))
print = logging.info
print('good day to you madam fiona')
print('started')
print(datetime.datetime.now())

INFO:root:good day to you madam fiona
INFO:root:started
INFO:root:2018-10-07 12:04:08.377995


In [3]:
pathdir = "DataStore/2018-10/Reg0930/"
reg_f = "reg_model_1002.pkl"

regs = pickle.load(open(pathdir+reg_f, "rb"))

In [4]:
# Add list of equations
# Drop Inv FE
naics_ind = regs["model_names"].loc[regs["model_names"].apply(lambda x: ("Sim PC" in x) or ("Int PC" in x))].index.tolist()
print(len(naics_ind))

regs[("docvecs", "naics_name")] = regs["docvecs"].loc[naics_ind]
regs[("ldavecs", "naics_name")] = regs["ldavecs"].loc[naics_ind]
for c in ["num_common_cited", "tp_pct_common_cited"]:
    regs[(c, "naics_name")] = regs[c].loc[regs[c].index.isin(naics_ind)]

# PC indices
# PC MSA not including Primclass Match
pc_msa_ind = regs["model_names"].loc[regs["model_names"].apply(lambda x: ("PC MSA" in x) & ("All FE" not in x))].index.tolist()

# Combine
pc_ind = pc_msa_ind
print(len(pc_ind))

regs[("docvecs", "primclass")] = regs["docvecs"].loc[regs["docvecs"].index.isin(pc_ind)].dropna()
regs[("ldavecs", "primclass")] = regs["ldavecs"].loc[regs["ldavecs"].index.isin(pc_ind)].dropna()
# Use whatever is available for num_common_cited
for c in ["num_common_cited", "tp_pct_common_cited"]:
    regs[(c, "primclass")] = regs[c].loc[regs[c].index.isin(pc_ind)]

# Where files are located
files = {"naics_name": "DataStore/2018-10/Reg0930/naics_name_all_1004.parq",
         "primclass": "DataStore/2018-10/Reg0930/primclass_all_1004.parq",
}


INFO:root:32
INFO:root:12


In [6]:

info_dict = {'$N$':lambda x: "{0:d}".format(int(x.nobs)),
'Adjusted $R^2$':lambda x: "{:.2f}".format(x.rsquared_adj)}
def get_fit(formula, grouped_data, group_col, cov_type, return_fit = False):
    summ = []
    tables = {}
    
    # If formula uses mean similarity, use grant year above 1980
    if "mean_sim_" in formula:
        grouped_data = grouped_data.loc[(grouped_data["tp_gyear"] >= 1980)]
    
    # Remove missing values used in formula
    col_used = re.findall('\((.*?)\)',formula)
    # Intersect with grouped_data columns
    col_used = list(set(col_used).intersection(set(list(grouped_data.columns))))
    grouped_data = grouped_data.dropna(how="any", subset=col_used)
                                    # Should take care of gyear > 1980 issues
    
    # Group and then get results
    grouped_data = grouped_data.groupby(group_col)
    
    for n,g in grouped_data:
        try:
            fit = smf.ols(formula = formula, data = g, missing="drop").fit(cov_type=cov_type)
            # Get results tables
            tables[n] = fit.summary2().tables
            # Append results
            summ.append(fit)
        except Exception as e:
            print(n)
            logging.exception("Regression error")
            pass
    # Get full results output
    # Dataframe of full results
    res_no_stars = summary2.summary_col(summ, stars = False, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict).tables[0]
    res_stars = summary2.summary_col(summ, stars = True, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict).tables[0]
    
    # Get partial results
    # 1. Get relevant variables from index of full results: UPDATED
    regressors = [v for v in res_no_stars.index.unique() if ("sim_" in v) | ("match" in v) | ("common_" in v)]
    # 2. Make sure regressors come last
    regressors = regressors+["Intercept"]
    # 3. Get results with regressors
    part_res_no_stars = summary2.summary_col(summ, stars = False, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict, regressor_order = regressors).tables[0]
    part_res_stars = summary2.summary_col(summ, stars = True, \
    model_names = ["{0}".format(n) for n in tables.keys()],\
        info_dict = info_dict, regressor_order = regressors).tables[0]
    
    # 4. Get index of where Intercept is and add 2 (to include standard error)
    last_ind = list(part_res_stars.index).index("Intercept")+2
    
    # 5. Get partial results
    part_res_no_stars = pd.concat([part_res_no_stars.iloc[:last_ind], part_res_no_stars.iloc[-2::]])
    part_res_stars = pd.concat([part_res_stars.iloc[:last_ind], part_res_stars.iloc[-2::]])
    
    if return_fit == True:
        return summ, tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars
    else:
        return tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars
    
def get_results(sample, ks):
    rs = fastparquet.ParquetFile(files[sample]).to_pandas()
    try:
        rs = rs.drop(['mean_sim_ldavecs_pc_msa_v',
           'mean_sim_docvecs_pc_msa_v', 'mean_sim_docvecs_pc_v', 'mean_sim_ldavecs_pc_v',
           'norm_mean_sim_ldavecs_pc_msa_v', 'norm_mean_sim_docvecs_pc_msa_v',
           'norm_mean_sim_docvecs_pc_v', 'norm_mean_sim_ldavecs_pc_v'],1)
        
    except:
        pass
    print(rs.columns)
    print(len(rs))
    
    # Drop extreme values
    print(len(rs))
    rs = rs.loc[(rs["norm_mean_sim_docvecs_pc_msa"] >= -5) & (rs["norm_mean_sim_docvecs_pc_msa"] <= 5)]
    rs = rs.loc[(rs["norm_mean_sim_docvecs_pc"] >= -5) & (rs["norm_mean_sim_docvecs_pc"] <= 5)]
    print(len(rs))
    
    print((ks, sample, "started"))
    print(datetime.datetime.now())
    # 3. Define output
    samp_out = {}
    formulas = list(regs[(ks, sample)])
    formulas_ind = list(regs[(ks, sample)].index)
    cov = "HC1"
    for i, j in zip(formulas_ind, formulas):
        print((ks, sample, i, j))
        print(datetime.datetime.now())
        try:
            out = get_fit(j, rs, "year_group", cov, return_fit = False)
            samp_out[i] = {}
            samp_out[i]["model"] = j
            samp_out[i]["tables"] = out[0]
            samp_out[i]["res_no_stars"] = out[1]
            samp_out[i]["res_stars"] = out[2]
            samp_out[i]["part_res_no_stars"] = out[3]
            samp_out[i]["part_res_stars"] = out[4]
        except Exception as e:
            logging.exception("error here")
            pass
        print("finished")
        print(datetime.datetime.now())
    
    # Define outfile
    o_f = "reg_{0}_{1}_out_1003.pkl".format(ks, sample)
    pickle.dump(samp_out, open(pathdir+o_f, "wb")) 

In [None]:
samples_l = ["naics_name", "primclass"]
# ks_l = ["tp_pct_common_cited", "docvecs", "ldavecs", ]
ks_l = ["docvecs"]

for sample, ks in itertools.product(samples_l, ks_l):
    try:
        get_results(sample, ks)
    except Exception as e:
        logging.exception("error here")
        pass

INFO:root:Index(['tp', 'op', 'sim_docvecs', 'sim_ldavecs', 'tp_gyear', 'tp_naics_name',
       'tp_primclass', 'tp_inv_msa', 'op_naics_name', 'op_primclass',
       'op_inv_msa', 'inv_msa_match', 'primclass_match', 'norm_sim_ldavecs',
       'norm_sim_docvecs', 'year_group', 'common_est_inv', 'common_pat_inv',
       'lawyer_match', 'num_common_cited', 'norm_num_common_cited',
       'tp_pct_common_cited', 'norm_tp_pct_common_cited', 'common_cited_match',
       'mean_sim_docvecs_pc', 'mean_sim_ldavecs_pc', 'mean_sim_docvecs_pc_msa',
       'mean_sim_ldavecs_pc_msa', 'norm_mean_sim_docvecs_pc',
       'norm_mean_sim_ldavecs_pc', 'norm_mean_sim_docvecs_pc_msa',
       'norm_mean_sim_ldavecs_pc_msa', 'sd_sim_docvecs_pc',
       'sd_sim_ldavecs_pc', 'sd_sim_docvecs_pc_msa', 'sd_sim_ldavecs_pc_msa',
       'pc_msa_greater_0', 'pc_msa_less_0'],
      dtype='object')
INFO:root:1498184
INFO:root:1498184
INFO:root:1214679
INFO:root:('docvecs', 'naics_name', 'started')
INFO:root:2018-10-07 12:0

### Results

In [8]:
tab = {}
samples_l = ["naics_name", "primclass"]
# samples_l = ["naics_name"]
# ks_l = ["tp_pct_common_cited", "docvecs", "ldavecs", ]
ks_l = ["docvecs"]

for sample, ks in itertools.product(samples_l, ks_l):
        o_f = "reg_{0}_{1}_out_1003.pkl".format(ks, sample)
        
        res = pickle.load(open(pathdir+o_f, "rb"))

        res_out = pd.DataFrame()

        for k in res.keys():
            lks = res[k]["model"].split(" ~ ")[0]
            
            # Selecting portion of results without intercept
            cdf = res[k]["part_res_stars"].reset_index()
            ic_ind = cdf.loc[cdf["index"] == "Intercept"].index[0]
            # Include N & R^2
            cdf = cdf.iloc[pd.np.r_[0:ic_ind,ic_ind+2:len(cdf)]]
            cdf["Model"] = regs["model_names"][k]
            cdf["Model Num"] = k
            cdf["LKS"] = lks
        
            res_out = res_out.append(cdf)
        
        tab[ks, sample] = res_out
        del(res_out)
        
full_tab = tab

In [10]:
r = pd.DataFrame()
for k,v in tab.items():
    print(k)
    display(v)
    v["samp"] = str(k)
    v = v.reset_index(drop=True)
    r = pd.concat([r,v],axis=0)

INFO:root:('docvecs', 'naics_name')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS,samp
0,C(common_est_inv)[T.1.0],0.0057**,0.0026**,0.0036***,0.0053***,All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
1,,(0.0026),(0.0013),(0.0007),(0.0005),All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
2,C(common_pat_inv)[T.True],0.0503,0.0921***,0.0911***,0.0944***,All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
3,,(0.0446),(0.0213),(0.0122),(0.0113),All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
4,C(inv_msa_match)[T.True],0.0023,0.0053***,0.0041***,0.0037***,All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
5,,(0.0016),(0.0009),(0.0006),(0.0005),All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
6,C(lawyer_match)[T.True],0.0040,0.0221***,0.0351***,0.0258***,All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
7,,(0.0155),(0.0066),(0.0055),(0.0046),All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
8,C(primclass_match)[T.True],0.0018,0.0037**,0.0032***,0.0019**,All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"
9,,(0.0030),(0.0017),(0.0012),(0.0009),All FE-Sim PC,13,sim_docvecs,"('docvecs', 'naics_name')"


INFO:root:('docvecs', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS,samp
0,C(common_est_inv)[T.1.0],0.0031,0.0078***,0.0065***,0.0068***,Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
1,,(0.0025),(0.0012),(0.0008),(0.0005),Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
2,C(common_pat_inv)[T.True],0.1114***,0.0987***,0.1032***,0.0895***,Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
3,,(0.0237),(0.0129),(0.0085),(0.0069),Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
4,C(inv_msa_match)[T.True],0.0014,0.0034***,0.0044***,0.0022***,Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
5,,(0.0020),(0.0011),(0.0008),(0.0006),Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
6,C(lawyer_match)[T.True],0.0271*,0.0314***,0.0286***,0.0261***,Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
7,,(0.0145),(0.0057),(0.0050),(0.0042),Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
8,mean_sim_docvecs_pc_msa,0.1188***,0.1183***,0.1500***,0.1698***,Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"
9,,(0.0136),(0.0082),(0.0069),(0.0064),Inv FE-Sim PC MSA,23,sim_docvecs,"('docvecs', 'primclass')"


In [11]:
r.to_csv(pathdir+"reg_pairs_out_1003.csv")