In [1]:
# Imports and notebook-wide configuration. Import lines are interleaved with a
# few configuration statements (working directory, seed, scaler).
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import re
import json
import pickle
import fastparquet
import os
# Relative paths used by later cells ("Logs/...", "DataStore/...") resolve
# against this working directory.
# NOTE(review): hard-coded absolute path; the notebook only runs where it exists.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
# Random seed. The sampling cell passes `seed` as `random_state` and also
# re-assigns it to the same value.
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# NOTE(review): `scaler` is not referenced by any cell visible in this section.
scaler = StandardScaler()
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2
# `logging` is imported again at the top of the logging-setup cell.
import logging

  from pandas.core import datetools


In [2]:
import logging

# Send everything through the root logger at INFO level (console via basicConfig).
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)

# One log file per calendar day, opened in append mode. The mode is passed to
# FileHandler itself; previously 'a' was handed to str.format and ignored.
log_path = 'Logs/reg_{0}.log'.format(datetime.datetime.now().strftime("%Y-%m-%d"))
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Re-running this cell must not attach a second handler to the same file,
# otherwise every message is written to the log multiple times.
already_attached = any(
    isinstance(h, logging.FileHandler) and h.baseFilename == os.path.abspath(log_path)
    for h in logger.handlers
)
if not already_attached:
    logger.addHandler(logging.FileHandler(log_path, mode='a'))

# NOTE: this deliberately shadows the builtin `print` for the whole notebook so
# that every later `print(...)` call is recorded by the logger (console + file).
print = logging.info
print('good day to you madam fiona')
print('started')
print(datetime.datetime.now())

INFO:root:good day to you madam fiona
INFO:root:started
INFO:root:2018-09-10 17:51:12.361121


In [13]:
# Build the registry of regression specifications and persist it to disk.
# `regs` maps (dependent measure, "all_models") -> Series of patsy formulas and
# "model_names" -> Series of display labels aligned by position with the formulas.
regs = {}
pathdir = "DataStore/2018-08/Reg0910/"
reg_f = "reg_model_0910.pkl"

# Similarity regressions: the same set of specifications for each dependent measure.
for dm in ["sim_ldavecs", "sim_docvecs", "num_common_cites"]:
    sim_regs = [
     # Main results

     # Application-year and year-difference effects only
     #0
     "{0} ~ C(inv_msa_match) + C(tp_appyear) + C(year_diff)".format(dm),
     #1
     "{0} ~ C(inv_msa_match):C(year_diff) + C(tp_appyear)".format(dm),
     # + NAICS industry effects
     #2
     "{0} ~ C(inv_msa_match) + C(tp_appyear) + C(year_diff) + C(tp_naics_name)".format(dm),
     #3
     # NOTE(review): unlike #1/#5/#7 this interaction spec also keeps the
     # C(year_diff) main effect -- confirm this is intended.
     "{0} ~ C(inv_msa_match):C(year_diff) + C(tp_appyear) + C(year_diff) + C(tp_naics_name)".format(dm),
     # + NAICS industry effects and primary-class match
     #4
     "{0} ~ C(inv_msa_match) + C(tp_appyear) + C(year_diff) + C(tp_naics_name) + C(primclass_match)".format(dm),
     #5
     "{0} ~ C(inv_msa_match):C(year_diff) + C(tp_appyear) + C(tp_naics_name) + C(primclass_match)".format(dm),
     # + primary-class match only
     #6
     "{0} ~ C(inv_msa_match) + C(tp_appyear) + C(year_diff) + C(primclass_match)".format(dm),
     #7
     "{0} ~ C(inv_msa_match):C(year_diff) + C(tp_appyear) + C(primclass_match)".format(dm),
    ]

    # Normalised variants: same formulas with the dependent variable swapped
    # for its "norm_" counterpart (sim_* -> norm_sim_*).
    n_sim_regs = [i.replace("sim_","norm_sim_") for i in sim_regs]

    # num_common_cites has no "sim_" prefix, so it needs its own replacement.
    n_sim_regs = [i.replace("num_common_cites","norm_num_common_cites") for i in n_sim_regs]
    # Number of base (non-normalised) equations
    n_eqns = len(sim_regs)
    # Full list: base equations first, then the normalised ones
    sim_regs = sim_regs+n_sim_regs

    # Formulas are keyed by (dependent measure, model set)
    regs[dm, "all_models"] = pd.Series(sim_regs)

# Display labels, aligned by position with the base equations above.
mn = [
    "Year FE", #0
    "Year FE Int", #1
    "NAICS FE", #2
    "NAICS FE Int", #3
    "NAICS FE & PC Match", #4
    "NAICS FE & PC Match Int", #5  (was a duplicate of #4; this is the interaction spec)
    "PC Match", #6
    "PC Match Int", #7
]
# Labels for the normalised variants get an "N " prefix
n_mn = ["N "+i for i in mn]
mn = mn+n_mn

# Labels are looked up by formula index downstream, so they must line up
# one-to-one with the formulas and be distinguishable.
assert len(mn) == 2 * n_eqns, "model_names out of sync with formulas"
assert len(set(mn)) == len(mn), "duplicate model name"

regs["model_names"] = pd.Series(mn)
os.makedirs(pathdir, exist_ok=True)
# Context manager guarantees the pickle is flushed and the handle closed.
with open(pathdir+reg_f, "wb") as fh:
    pickle.dump(regs, fh)


In [11]:
def _format_nobs(result):
    """Return the number of observations of a fitted result as an integer string."""
    return "{0:d}".format(int(result.nobs))


def _format_adj_r2(result):
    """Return the adjusted R-squared of a fitted result rounded to two decimals."""
    return "{:.2f}".format(result.rsquared_adj)


# Extra rows for the regression summary tables: row label -> callable that
# takes a fitted result and returns the formatted cell text.
info_dict = {
    '$N$': _format_nobs,
    'Adjusted $R^2$': _format_adj_r2,
}

def get_fit(formula, grouped_data, group_col, cov_type, return_fit = False):
    """Fit ``formula`` by OLS separately within each group and tabulate the results.

    Parameters
    ----------
    formula : str
        Formula passed to ``smf.ols``. Every name wrapped in parentheses
        (e.g. ``C(col)``) is treated as a column that must be non-missing.
    grouped_data : pandas.DataFrame
        Data to fit on; it is not modified (a filtered copy is used).
    group_col : str or list
        Column(s) passed to ``groupby``; one regression is fitted per group.
    cov_type : str
        Covariance type forwarded to ``.fit(cov_type=...)``.
    return_fit : bool, default False
        When true, the list of fitted results is returned as an extra first element.

    Returns
    -------
    tuple
        ``(tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars)``,
        preceded by the list of fitted results when ``return_fit`` is true.
        ``tables`` maps each group key to ``fit.summary2().tables``. The ``res_*``
        frames are the first table of ``summary_col`` over all fitted groups.
        The ``part_res_*`` frames keep the rows up to and including the
        "Intercept" row and the row after it, plus the last two rows of the table
        (presumably the two ``info_dict`` rows -- confirm against summary_col's layout).

    Raises
    ------
    ValueError
        If no group could be fitted, or if the summary has no "Intercept" row.
    """
    summ = []
    tables = {}

    # Columns referenced inside parentheses in the formula. Raw string: "\(" is
    # not a valid escape in a normal string literal.
    col_used = re.findall(r'\((.*?)\)', formula)
    # Drop rows with a missing value in any of those columns
    complete = grouped_data.dropna(how="any", subset=col_used) # Should take care of gyear > 1980 issues

    # One regression per group; a failing group is logged and skipped.
    for n, g in complete.groupby(group_col):
        try:
            fit = smf.ols(formula = formula, data = g, missing="drop").fit(cov_type=cov_type)
            # Store per-group tables and the fit together so that `tables`
            # and `summ` stay aligned by position.
            tables[n] = fit.summary2().tables
            summ.append(fit)
        except Exception:
            # Keep going with the other groups, but record the traceback.
            logging.exception("Regression error for group %r", n)

    if not summ:
        # summary_col cannot build a table from nothing; fail with a clear message.
        raise ValueError("no regression could be fitted for formula: {0}".format(formula))

    model_names = ["{0}".format(n) for n in tables.keys()]

    def _summary(stars, regressor_order=None):
        # Single place for the summary_col call; only the star flag and the
        # optional regressor ordering differ between the four tables.
        kwargs = dict(stars=stars, model_names=model_names, info_dict=info_dict)
        if regressor_order is not None:
            kwargs["regressor_order"] = regressor_order
        return summary2.summary_col(summ, **kwargs).tables[0]

    # Full results
    res_no_stars = _summary(stars=False)
    res_stars = _summary(stars=True)

    # Partial results
    # 1. Regressors of interest, picked by name from the full-results index
    regressors = [v for v in res_no_stars.index.unique()
                  if ("sim_" in v) or ("match" in v) or ("common_" in v)]
    # 2. Intercept goes right after them so it marks the end of the kept rows
    regressors = regressors+["Intercept"]
    # 3. Re-render with those regressors ordered first
    part_res_no_stars = _summary(stars=False, regressor_order=regressors)
    part_res_stars = _summary(stars=True, regressor_order=regressors)

    # 4. Position just past the Intercept row and the row following it
    #    (to include standard error)
    last_ind = list(part_res_stars.index).index("Intercept")+2

    # 5. Keep the leading rows plus the last two rows of the table
    part_res_no_stars = pd.concat([part_res_no_stars.iloc[:last_ind], part_res_no_stars.iloc[-2:]])
    part_res_stars = pd.concat([part_res_stars.iloc[:last_ind], part_res_stars.iloc[-2:]])

    if return_fit:
        return summ, tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars
    return tables, res_no_stars, res_stars, part_res_no_stars, part_res_stars

In [12]:
# Regressions for random sample
# NOTE: relies on `regs` and `pathdir` from the formula-registry cell and on
# `get_fit`; run those cells first.
dms = ["sim_ldavecs", "sim_docvecs", "num_common_cites"]
file_pathdir = "DataStore/2018-08"
mod_keys = ["all_models"]
seed = 3
# Load the pair-level data and keep a reproducible 30% sample
rs = fastparquet.ParquetFile(os.path.join(file_pathdir, "newterms_lead_follow_0910.parq")).to_pandas().sample(frac=0.3, random_state = seed)
# Add year_group: a single group, so get_fit runs one regression per formula
rs["year_group"] = "all"
# Drop missing term
print(len(rs))
# .copy() so the column assignments below act on an independent frame rather
# than on a slice (avoids SettingWithCopyWarning).
rs = rs.loc[rs["term"].notnull()].copy()
print(len(rs))
print(rs.columns)

# Industry and primary class match
for c in ["naics_name", "primclass"]:
    rs["{0}_match".format(c)] = (rs["tp_{0}".format(c)] == rs["op_{0}".format(c)])

for k in mod_keys:
    for dm in dms:

        print((dm, k, "started"))
        print(datetime.datetime.now())
        # Results for this (measure, model set), keyed by formula index
        samp_out = {}
        cov = "HC1"
        for i, j in regs[dm,k].items():
            print(k)
            print(i)
            print(j)
            print(datetime.datetime.now())
            try:
                out = get_fit(j, rs, "year_group", cov, return_fit = False)
                # Only record an entry once the fit has succeeded
                samp_out[i] = {
                    "model": j,
                    "tables": out[0],
                    "res_no_stars": out[1],
                    "res_stars": out[2],
                    "part_res_no_stars": out[3],
                    "part_res_stars": out[4],
                }
            except Exception:
                # Best effort: log the traceback with enough context to find
                # the failing specification, then continue with the next one.
                logging.exception("error here (formula %s: %s)", i, j)
            print("finished")
            print(datetime.datetime.now())
        # Define outfile
        o_f = "reg_{0}_{1}_out_0910.pkl".format(k,dm)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(pathdir+o_f, "wb") as fh:
            pickle.dump(samp_out, fh)

INFO:root:1682770
INFO:root:1682770
INFO:root:Index(['sim_docvecs', 'tp', 'op', 'term', 'sim_ldavecs', 'tp_appyear',
       'tp_inv_msa', 'tp_primclass', 'tp_naics_name', 'tp_title', 'op_appyear',
       'op_inv_msa', 'op_primclass', 'op_naics_name', 'op_title',
       'inv_msa_match', 'year_diff', 'direct_cite', 'num_common_cites',
       'norm_sim_ldavecs', 'norm_sim_docvecs', 'norm_num_common_cites',
       'year_group'],
      dtype='object')
INFO:root:('sim_ldavecs', 'all_models', 'started')
INFO:root:2018-09-11 12:25:41.122748
INFO:root:all_models
INFO:root:0
INFO:root:sim_ldavecs ~ C(inv_msa_match) + C(tp_appyear) + C(year_diff)
INFO:root:2018-09-11 12:25:41.126903
INFO:root:finished
INFO:root:2018-09-11 12:26:06.489361
INFO:root:all_models
INFO:root:1
INFO:root:sim_ldavecs ~ C(inv_msa_match) + C(tp_appyear) + C(year_diff) + C(term)
INFO:root:2018-09-11 12:26:06.492538
INFO:root:finished
INFO:root:2018-09-11 12:27:08.285149
INFO:root:all_models
INFO:root:2
INFO:root:sim_ldavecs 

KeyboardInterrupt: 

In [7]:
# Collect the saved regression output into one table per (measure, model set)
dms = ["sim_ldavecs", "sim_docvecs", "num_common_cites"]
file_pathdir = "DataStore/2018-08"
mod_keys = ["all_models"]
pathdir = "DataStore/2018-08/Reg0910/"
reg_f = "reg_model_0910.pkl"
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(pathdir+reg_f, "rb") as fh:
    regs = pickle.load(fh)
tab = {}
for dm in dms:
    for g in mod_keys:

        o_f = "reg_{0}_{1}_out_0910.pkl".format(g,dm)

        with open(pathdir+o_f, "rb") as fh:
            res = pickle.load(fh)

        # Gather the per-model pieces and concatenate once at the end
        # (DataFrame.append in a loop is quadratic and removed in pandas 2.0).
        pieces = []

        for k in res.keys():
            # Dependent variable = left-hand side of the formula
            lks = res[k]["model"].split(" ~ ")[0]

            # Selecting portion of results without intercept
            cdf = res[k]["part_res_stars"].reset_index()
            ic_ind = cdf.loc[cdf["index"] == "Intercept"].index[0]
            # Skip the Intercept row and the row after it; keep everything
            # before it and everything after (Include N & R^2).
            keep = np.r_[0:ic_ind,ic_ind+2:len(cdf)]
            # .copy() so the label columns are added to an independent frame
            cdf = cdf.iloc[keep].copy()
            cdf["Model"] = regs["model_names"][k]
            cdf["Model Num"] = k
            cdf["LKS"] = lks

            pieces.append(cdf)

        # TODO: optional presentation step -- relabel "C(inv_msa_match)[T.True]"
        # as "$I(MSA \, Match)$" and blank repeated LKS labels within a model.

        # An empty result file still yields an (empty) table, as before.
        tab[dm,g] = pd.concat(pieces) if pieces else pd.DataFrame()

full_tab = tab

In [8]:
# Show each per-measure table, then stack them all into one frame `r`.
stacked = []
for k,v in tab.items():
    print(k)
    display(v)
    # Tag rows with their (measure, model set) key. This writes into the frame
    # stored in `tab`, after it has been displayed.
    v["samp"] = str(k)
    stacked.append(v.reset_index(drop=True))
# Concatenate once instead of growing `r` inside the loop; an empty `tab`
# still gives an empty frame.
r = pd.concat(stacked, axis=0) if stacked else pd.DataFrame()

INFO:root:('sim_ldavecs', 'all_models')


Unnamed: 0,index,all,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.2265***,Year FE,0,sim_ldavecs
1,,(0.0317),Year FE,0,sim_ldavecs
4,$N$,1040690,Year FE,0,sim_ldavecs
5,Adjusted $R^2$,0.02,Year FE,0,sim_ldavecs
0,C(inv_msa_match)[T.True],7041682729.5868,Term FE,1,sim_ldavecs
1,,(4878023053.8826),Term FE,1,sim_ldavecs
4,$N$,1040690,Term FE,1,sim_ldavecs
5,Adjusted $R^2$,0.03,Term FE,1,sim_ldavecs
0,C(inv_msa_match)[T.True],7782178530.3229,Term NAICS FE,2,sim_ldavecs
1,,(4775290894.0600),Term NAICS FE,2,sim_ldavecs


INFO:root:('sim_docvecs', 'all_models')


Unnamed: 0,index,all,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.1226***,Year FE,0,sim_docvecs
1,,(0.0149),Year FE,0,sim_docvecs
4,$N$,1040690,Year FE,0,sim_docvecs
5,Adjusted $R^2$,0.04,Year FE,0,sim_docvecs
0,C(inv_msa_match)[T.True],5107302478.9594**,Term FE,1,sim_docvecs
1,,(2512970605.5584),Term FE,1,sim_docvecs
4,$N$,1040690,Term FE,1,sim_docvecs
5,Adjusted $R^2$,0.04,Term FE,1,sim_docvecs
0,C(inv_msa_match)[T.True],4197787056.5718*,Term NAICS FE,2,sim_docvecs
1,,(2480185459.3837),Term NAICS FE,2,sim_docvecs


INFO:root:('num_common_cites', 'all_models')


Unnamed: 0,index,all,Model,Model Num,LKS
0,C(inv_msa_match)[T.True],0.0008**,Year FE,0,num_common_cites
1,,(0.0003),Year FE,0,num_common_cites
4,$N$,1040690,Year FE,0,num_common_cites
5,Adjusted $R^2$,0.00,Year FE,0,num_common_cites
0,C(inv_msa_match)[T.True],42842804.6402,Term FE,1,num_common_cites
1,,(78115634.3397),Term FE,1,num_common_cites
4,$N$,1040690,Term FE,1,num_common_cites
5,Adjusted $R^2$,0.18,Term FE,1,num_common_cites
0,C(inv_msa_match)[T.True],-11670644.2190,Term NAICS FE,2,num_common_cites
1,,(73383294.0815),Term NAICS FE,2,num_common_cites
