In [1]:
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import re
import json
import pickle
import fastparquet
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2
import logging
import itertools

  from pandas.core import datetools


- Based on: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/3a-RunReg-1016.ipynb
- Regression models in: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/2-RegPrep-1016.ipynb

In [12]:
# Configure the root logger: INFO level, console output (via basicConfig) plus a
# dated log file under Logs/.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)

# One log file per calendar day, opened in append mode.
# (The original passed 'a' to str.format by mistake; it is now given to FileHandler.)
log_path = 'Logs/reg_{0}.log'.format(datetime.datetime.now().strftime("%Y-%m-%d"))

# Re-running this cell must not attach a second handler for the same file,
# otherwise every message is written to the log twice.
log_path_abs = os.path.abspath(log_path)
if not any(isinstance(h, logging.FileHandler) and h.baseFilename == log_path_abs
           for h in logger.handlers):
    logger.addHandler(logging.FileHandler(log_path, mode='a'))

# NOTE: this deliberately shadows the built-in print so that every later
# print(...) call in the notebook is routed to the log (console + file).
print = logging.info
print('good day to you madam fiona')
print('started')
print(datetime.datetime.now())

INFO:root:good day to you madam fiona
INFO:root:started
INFO:root:2018-11-27 16:07:05.027461


In [13]:
# Directory holding this run's inputs/outputs and the pickled regression models.
pathdir = "DataStore/2018-11/Reg1127/"
reg_f = "reg_model_1127.pkl"

# Load the regression-model container.
# NOTE(security): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open(pathdir+reg_f, "rb") as reg_fh:
    regs = pickle.load(reg_fh)

In [14]:
# Build the list of regression equations to run for each sample.

# Keep only the models on normed values: model names that start with "N ".
all_model_names = regs["model_names"]
normed_mask = all_model_names.apply(lambda name: name[:2] == "N ")
regs["model_names"] = all_model_names.loc[normed_mask]

# NAICS sample: use every remaining model.
# Alternative selection kept for reference:
# naics_ind = regs["model_names"].loc[regs["model_names"].apply(lambda x: ("Sim PC" in x) or ("Int PC" in x))].index.tolist()
naics_ind = regs["model_names"].index.tolist()
print(len(naics_ind))

regs[("docvecs", "naics_name")] = regs["docvecs"].loc[naics_ind]
# Disabled alternatives (other similarity measures):
# regs[("ldavecs", "naics_name")] = regs["ldavecs"].loc[naics_ind]
# for c in ["num_common_cited", "tp_pct_common_cited"]:
#     regs[(c, "naics_name")] = regs[c].loc[regs[c].index.isin(naics_ind)]

# Primary-class (PC) sample: intended to exclude "Sim PC" and primclass-match models.
# NOTE(review): name[:-2] is the name WITHOUT its last two characters, so the first
# test only rejects a name of the form "PC" + two characters. If the intent was
# "name ends with PC", this should be name[-2:] -- confirm against the model names.
pc_mask = regs["model_names"].apply(
    lambda name: not ((name[:-2] == "PC") or ("PC M-" in name))
)
pc_ind = regs["model_names"].loc[pc_mask].index.tolist()
print(len(pc_ind))

docvec_formulas = regs["docvecs"]
in_pc_sample = docvec_formulas.index.isin(pc_ind)
regs[("docvecs", "primclass")] = docvec_formulas.loc[in_pc_sample].dropna()
# Disabled alternatives (other similarity measures):
# regs[("ldavecs", "primclass")] = regs["ldavecs"].loc[regs["ldavecs"].index.isin(pc_ind)].dropna()
# Use whatever is available for num_common_cited
# for c in ["num_common_cited", "tp_pct_common_cited"]:
#     regs[(c, "primclass")] = regs[c].loc[regs[c].index.isin(pc_ind)]

# Parquet file backing each sample.
files = {
    "naics_name": "DataStore/2018-11/naics_name_sim_claims_1120.parq",
    "primclass": "DataStore/2018-11/primclass_sim_claims_1120.parq",
}


INFO:root:15
INFO:root:14


In [15]:
# Peek at a few primclass formulas; seeded so a re-run shows the same rows.
regs[("docvecs", "primclass")].sample(5, random_state=seed)

16    norm_sim_claims_docvecs ~ C(inv_msa_match) + C...
27    norm_sim_claims_docvecs ~ C(primclass_match) +...
19    norm_sim_claims_docvecs ~ C(inv_msa_match) + C...
17    norm_sim_claims_docvecs ~ C(inv_msa_match) + C...
28    norm_sim_claims_docvecs ~ C(common_npc_match) ...
dtype: object

In [16]:

# Extra summary rows appended to every regression table: each entry maps a row
# label to a callable that formats one statistic of a fitted model.
info_dict = {
    # Number of observations, rendered as an integer.
    '$N$': lambda fit: format(int(fit.nobs), "d"),
    # Adjusted R-squared, rendered with two decimals.
    'Adjusted $R^2$': lambda fit: format(fit.rsquared_adj, ".2f"),
}
def get_fit(formula, grouped_data, group_col, cov_type = "HC1", return_fit = False):
    """Fit `formula` by OLS separately for each group of `group_col` and tabulate the fits.

    Parameters
    ----------
    formula : str
        Formula handed to ``smf.ols``.
    grouped_data : pandas.DataFrame
        Input data. When the formula contains "mean_sim_", only rows with
        ``tp_gyear >= 1980`` are used. Rows with missing values in any
        parenthesised formula term that is a column (e.g. ``C(col)``) are dropped.
    group_col : str
        Column to group by; one regression is fitted per group.
    cov_type : str, default "HC1"
        "HC1" fits with HC1 covariance; any other value fits with covariance
        clustered on the ``tp_primclass_FE`` column.
    return_fit : bool, default False
        If true, the list of fitted results is returned as an extra first element.

    Returns
    -------
    tuple
        ``(tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars)``,
        preceded by ``summ`` (list of fitted results) when `return_fit` is true.
        ``tables`` maps each group key to ``fit.summary2().tables``. The ``res_*``
        frames are the first table of ``summary2.summary_col`` without/with stars.
        The ``part_res_*`` frames keep the rows up to and including the Intercept
        (and the row after it) plus the last two rows of the table.

    Raises
    ------
    ValueError
        If no group could be fitted, or if "Intercept" is not in the result table.
    """
    # If formula uses mean similarity, use grant year 1980 and above
    if "mean_sim_" in formula:
        grouped_data = grouped_data.loc[(grouped_data["tp_gyear"] >= 1980)]

    # Columns that appear inside parentheses in the formula, e.g. C(col).
    # Raw string: '\(' is an invalid escape sequence in a normal string literal.
    col_used = re.findall(r'\((.*?)\)', formula)
    # Intersect with the data columns (sorted so the log line is deterministic)
    col_used = sorted(set(col_used).intersection(grouped_data.columns))
    logging.info(col_used)

    # Remove rows with missing values in those columns
    grouped_data = grouped_data.dropna(how="any", subset=col_used).copy().reset_index(drop=True)

    # Get length of data
    logging.info(("Length of data", len(grouped_data)))

    # Columns needed for the clustered fit: every identifier in the formula that
    # is a data column, plus the cluster variable. Dropping missing values on all
    # of them up front keeps the `groups` vector aligned with the rows the model
    # actually uses. (The original looked up `.columns` on the GroupBy object
    # here, which is not a DataFrame.)
    formula_tokens = dict.fromkeys(re.findall(r'[A-Za-z_]\w*', formula))
    cluster_cols = [c for c in formula_tokens if c in grouped_data.columns]
    if "tp_primclass_FE" not in cluster_cols:
        cluster_cols.append("tp_primclass_FE")

    summ = []
    tables = {}
    # One regression per group; a failing group is logged and skipped.
    for n, g in grouped_data.groupby(group_col):
        try:
            if cov_type == "HC1":
                fit = smf.ols(formula = formula, data = g, missing="drop").fit(cov_type="HC1")
            else:
                # Cluster uses primary class so must have primary class FE
                g = g.dropna(subset=cluster_cols, how="any").copy().reset_index(drop=True)
                fit = smf.ols(formula = formula, data = g).fit(
                    cov_type="cluster", cov_kwds={'groups': g["tp_primclass_FE"]})
            # Get results tables
            tables[n] = fit.summary2().tables
            # Append results
            summ.append(fit)
        except Exception:
            logging.info(n)
            logging.exception("Regression error")

    if not summ:
        # Fail with a clear message instead of an obscure error inside summary_col.
        raise ValueError("No regression could be fitted for formula: {0}".format(formula))

    # `tables` and `summ` are filled together, so the keys line up with the fits.
    model_names = ["{0}".format(n) for n in tables.keys()]

    def _summary_table(stars, **kwargs):
        # First table of the side-by-side summary of all group fits.
        return summary2.summary_col(summ, stars = stars, model_names = model_names,
                                    info_dict = info_dict, **kwargs).tables[0]

    # Dataframes of full results
    res_no_stars = _summary_table(False)
    res_stars = _summary_table(True)

    # Get partial results
    # 1. Get relevant variables from index of full results
    regressors = [v for v in res_no_stars.index.unique()
                  if ("sim_" in v) or ("match" in v) or ("common_" in v)]
    # 2. Intercept goes right after the regressors of interest
    regressors = regressors+["Intercept"]
    # 3. Get results with regressors ordered first
    part_res_no_stars = _summary_table(False, regressor_order = regressors)
    part_res_stars = _summary_table(True, regressor_order = regressors)

    # 4. Get index of where Intercept is and add 2 (to include standard error)
    last_ind = list(part_res_stars.index).index("Intercept")+2

    # 5. Keep everything up to the Intercept block plus the last two rows
    part_res_no_stars = pd.concat([part_res_no_stars.iloc[:last_ind], part_res_no_stars.iloc[-2:]])
    part_res_stars = pd.concat([part_res_stars.iloc[:last_ind], part_res_stars.iloc[-2:]])

    if return_fit:
        return summ, tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars
    return tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars
    
def get_results(sample, ks, cov = "HC1"):
    """Run every regression formula registered for (`ks`, `sample`) and pickle the results.

    Parameters
    ----------
    sample : str
        Key into the module-level ``files`` dict (path of the sample's parquet file).
    ks : str
        First element of the ``regs[(ks, sample)]`` key that holds the formulas.
    cov : str, default "HC1"
        Covariance type forwarded to ``get_fit``; also part of the output file name.

    Returns
    -------
    None
        The results are written to
        ``pathdir + "reg_{ks}_{sample}_out_{cov}_1127.pkl"`` as a dict keyed by the
        formula index; each value holds the formula ("model") and the five outputs
        of ``get_fit``. Formulas whose fit raises are logged and left out.
    """
    rs = fastparquet.ParquetFile(files[sample]).to_pandas()

    # Drop the "_v" variants of the mean-similarity columns where present.
    # errors="ignore" drops whichever of them exist; the original bare
    # try/except dropped nothing at all as soon as one column was missing.
    unused_cols = ['mean_sim_ldavecs_pc_msa_v',
           'mean_sim_docvecs_pc_msa_v', 'mean_sim_docvecs_pc_v', 'mean_sim_ldavecs_pc_v',
           'norm_mean_sim_ldavecs_pc_msa_v', 'norm_mean_sim_docvecs_pc_msa_v',
           'norm_mean_sim_docvecs_pc_v', 'norm_mean_sim_ldavecs_pc_v']
    rs = rs.drop(unused_cols, axis=1, errors="ignore")
    logging.info(rs.columns)
    logging.info(len(rs))

    # Disabled: drop extreme values
#     rs = rs.loc[((rs["norm_mean_sim_docvecs_pc_msa"] >= -5) & (rs["norm_mean_sim_docvecs_pc_msa"] <= 5)) |
#                (rs["norm_mean_sim_docvecs_pc_msa"].isnull())]
#     rs = rs.loc[((rs["norm_mean_sim_docvecs_pc"] >= -5) & (rs["norm_mean_sim_docvecs_pc"] <= 5)) |
#                (rs["norm_mean_sim_docvecs_pc"].isnull())]

    logging.info((ks, sample, "started"))
    logging.info(datetime.datetime.now())

    # Results keyed by the formula's index in regs[(ks, sample)]
    samp_out = {}
    result_keys = ["tables", "res_no_stars", "res_stars", "part_res_no_stars", "part_res_stars"]
    for i, j in regs[(ks, sample)].items():
        logging.info((ks, sample, i, j))
        logging.info(datetime.datetime.now())
        try:
            out = get_fit(j, rs, "year_group", cov, return_fit = False)
            entry = {"model": j}
            entry.update(zip(result_keys, out))
            samp_out[i] = entry
        except Exception:
            # Best effort: log the failure and continue with the next formula.
            logging.exception("error here")
        logging.info("finished")
        logging.info(datetime.datetime.now())

    # Write all results for this (ks, sample, cov) combination
    o_f = "reg_{0}_{1}_out_{2}_1127.pkl".format(ks, sample, cov)
    with open(pathdir+o_f, "wb") as out_fh:
        pickle.dump(samp_out, out_fh)

In [None]:
# Run the regressions for every (sample, similarity measure) combination.
samples_l = ["naics_name", "primclass"]
# Other measures that can be enabled: ["tp_pct_common_cited", "docvecs", "ldavecs"]
ks_l = ["docvecs"]

for sample in samples_l:
    for ks in ks_l:
        try:
            # Clustered alternative: get_results(sample, ks, cov="cluster")
            get_results(sample, ks, cov="HC1")
        except Exception:
            # Log and continue with the next combination.
            logging.exception("error here")

INFO:root:Index(['tp', 'op', 'sim_claims_docvecs', 'norm_sim_claims_docvecs',
       'sim_docvecs', 'sim_ldavecs', 'tp_gyear', 'tp_naics_name',
       'op_naics_name', 'op_primclass', 'op_inv_msa', 'inv_msa_match',
       'primclass_match', 'norm_sim_ldavecs', 'norm_sim_docvecs', 'year_group',
       'common_est_inv', 'common_pat_inv', 'lawyer_match', 'num_common_cited',
       'norm_num_common_cited', 'tp_pct_common_cited',
       'norm_tp_pct_common_cited', 'common_cited_match', 'common_npc_match',
       'mean_sim_docvecs_pc', 'mean_sim_docvecs_pc_msa',
       'norm_mean_sim_docvecs_pc', 'norm_mean_sim_docvecs_pc_msa',
       'num_common_npc', 'norm_num_common_npc', 'tp_primclass_FE',
       'tp_inv_msa_FE', 'tp_examiner_FE', 'tp_lawyer_FE', 'examiner_match'],
      dtype='object')
INFO:root:1498184
INFO:root:('docvecs', 'naics_name', 'started')
INFO:root:2018-11-27 16:07:17.559022
INFO:root:('docvecs', 'naics_name', 15, 'norm_sim_claims_docvecs ~ C(inv_msa_match) + C(tp_gyear) + C(

________
### Results
#### HC1 (heteroskedasticity-robust standard errors)

In [19]:
# Collect, per (ks, sample), the starred partial results of every model into one
# long table with the Intercept rows removed.
tab = {}
samples_l = ["naics_name", "primclass"]
# samples_l = ["naics_name"]
ks_l = ["docvecs"]
# ks_l = ["tp_pct_common_cited", "docvecs"]

for sample, ks in itertools.product(samples_l, ks_l):
    o_f = "reg_{0}_{1}_out_HC1_1127.pkl".format(ks, sample)

    # Results written by get_results for this combination
    with open(pathdir+o_f, "rb") as res_fh:
        res = pickle.load(res_fh)

    model_frames = []
    for k in res.keys():
        # Left-hand side (dependent variable) of the model formula
        lks = res[k]["model"].split(" ~ ")[0]

        # Selecting portion of results without intercept
        cdf = res[k]["part_res_stars"].reset_index()
        ic_ind = cdf.loc[cdf["index"] == "Intercept"].index[0]
        # Skip the Intercept row and the row after it; keep the rest (incl. N & R^2).
        # .copy() so the column assignments below do not write into a slice.
        cdf = cdf.iloc[np.r_[0:ic_ind,ic_ind+2:len(cdf)]].copy()
        cdf["Model"] = regs["model_names"][k]
        cdf["Model Num"] = k
        cdf["LKS"] = lks

        model_frames.append(cdf)

    # Single concat instead of DataFrame.append in a loop (removed in pandas 2.0)
    tab[ks, sample] = pd.concat(model_frames) if model_frames else pd.DataFrame()

full_tab = tab

In [20]:
# Display each (ks, sample) table and stack them all into one frame `r`.
stacked_frames = []
for k,v in tab.items():
    print(k)
    display(v)
    # Tag rows with their (ks, sample) key. This writes the column into the
    # frame stored in `tab` as well, as the original cell did.
    v["samp"] = str(k)
    stacked_frames.append(v.reset_index(drop=True))

# Concatenate once instead of growing `r` inside the loop.
r = pd.concat(stacked_frames, axis=0) if stacked_frames else pd.DataFrame()

INFO:root:('docvecs', 'naics_name')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0510***,0.0659***,0.0679***,0.0613***,N PC FE-Year FE,15,norm_sim_claims_docvecs
1,,(0.0050),(0.0042),(0.0034),(0.0031),N PC FE-Year FE,15,norm_sim_claims_docvecs
4,$N$,194092,282099,443884,578054,N PC FE-Year FE,15,norm_sim_claims_docvecs
5,Adjusted $R^2$,0.07,0.05,0.05,0.07,N PC FE-Year FE,15,norm_sim_claims_docvecs
0,C(common_est_inv)[T.1.0],0.0752***,0.1064***,0.1190***,0.0985***,N Inv M-Year FE,16,norm_sim_claims_docvecs
1,,(0.0154),(0.0082),(0.0054),(0.0038),N Inv M-Year FE,16,norm_sim_claims_docvecs
2,C(common_pat_inv)[T.True],1.8277***,1.9355***,1.7402***,1.4942***,N Inv M-Year FE,16,norm_sim_claims_docvecs
3,,(0.1240),(0.0879),(0.0696),(0.0668),N Inv M-Year FE,16,norm_sim_claims_docvecs
4,C(inv_msa_match)[T.True],0.0439***,0.0543***,0.0560***,0.0537***,N Inv M-Year FE,16,norm_sim_claims_docvecs
5,,(0.0050),(0.0041),(0.0034),(0.0031),N Inv M-Year FE,16,norm_sim_claims_docvecs


INFO:root:('docvecs', 'primclass')


Unnamed: 0,index,1975-85,1985-95,1995-05,2005-15,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0718***,0.0793***,0.0784***,0.0646***,N PC FE-Year FE,15,norm_sim_claims_docvecs
1,,(0.0061),(0.0049),(0.0038),(0.0032),N PC FE-Year FE,15,norm_sim_claims_docvecs
4,$N$,171859,252881,407173,537875,N PC FE-Year FE,15,norm_sim_claims_docvecs
5,Adjusted $R^2$,0.09,0.07,0.07,0.12,N PC FE-Year FE,15,norm_sim_claims_docvecs
0,C(common_est_inv)[T.1.0],0.0402***,0.0747***,0.0842***,0.0808***,N Inv M-Year FE,16,norm_sim_claims_docvecs
1,,(0.0138),(0.0076),(0.0051),(0.0036),N Inv M-Year FE,16,norm_sim_claims_docvecs
2,C(common_pat_inv)[T.True],1.5177***,1.5276***,1.4042***,1.2027***,N Inv M-Year FE,16,norm_sim_claims_docvecs
3,,(0.0617),(0.0449),(0.0383),(0.0374),N Inv M-Year FE,16,norm_sim_claims_docvecs
4,C(inv_msa_match)[T.True],0.0442***,0.0471***,0.0516***,0.0484***,N Inv M-Year FE,16,norm_sim_claims_docvecs
5,,(0.0060),(0.0048),(0.0037),(0.0032),N Inv M-Year FE,16,norm_sim_claims_docvecs


In [16]:
# Write the stacked HC1 results table to a CSV file in the run directory.
hc1_csv_path = pathdir + "reg_pairs_out_HC1_1127z.csv"
r.to_csv(hc1_csv_path)