In [1]:
# Imports and session configuration for the notebook.
# NOTE(review): several statements below (os.chdir, seed, scaler) are
# interleaved with the imports; their order is left untouched on purpose.
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import re
import json
import pickle
import fastparquet
import os
# Switch the working directory so the relative output path used further down
# ("DataStore/...") is resolved against this directory.
# NOTE(review): absolute, machine-specific path -- the notebook will fail on
# any host where this directory does not exist.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
# NOTE(review): `seed` is not referenced by any cell visible in this notebook;
# presumably kept for stochastic steps elsewhere -- confirm.
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# NOTE(review): `scaler` is instantiated here but not used by any visible cell.
scaler = StandardScaler()
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2
import logging

  from pandas.core import datetools


In [2]:

# Formula templates. "{0}" is a placeholder that later cells fill in with the
# outcome column name via str.format; "mean_{0}_..." regressors are derived
# from the same name.

# Baseline specification: outcome on C(inv_msa_match) and C(tp_gyear).
fe = dict([
    ("Year FE", "{0} ~ C(inv_msa_match) + C(tp_gyear)"),
])

# One specification per alternative match indicator; each keeps the
# C(tp_gyear) and C(tp_primclass_FE) terms and swaps only the first regressor.
comp_fe = {
    label: "{0} ~ C(" + match_column + ") + C(tp_gyear) + C(tp_primclass_FE)"
    for label, match_column in [
        ("Inv M", "common_pat_inv"),
        ("Lawyer M", "lawyer_match"),
        ("CC M", "common_cited_match"),
        ("PC M", "primclass_match"),
        ("NPC M", "common_npc_match"),
        ("Examiner M", "examiner_match"),
    ]
}

# Specifications that add the mean-similarity regressors, either additively
# ("Sim ...") or interacted with C(inv_msa_match) ("Int ...").
techsim = dict([
    ("Sim PC", "{0} ~ C(inv_msa_match) + C(tp_gyear) + mean_{0}_pc"),
    ("Int PC", "{0} ~ C(inv_msa_match)*mean_{0}_pc + C(tp_gyear)"),
    ("Sim PC MSA", "{0} ~ C(inv_msa_match) + C(tp_gyear) + mean_{0}_pc_msa"),
    ("Int PC MSA", "{0} ~ C(inv_msa_match)*mean_{0}_pc_msa + C(tp_gyear)"),
    # Separate terms for the pc_msa_greater_0 / pc_msa_less_0 indicators.
    # NOTE: the four-space runs at the start of the 2nd and 3rd fragments are
    # part of the stored string; keep them if the saved formulas must stay
    # byte-identical to previously pickled ones.
    (
        "Break Int PC MSA",
        "{0} ~ C(inv_msa_match) + pc_msa_greater_0:mean_{0}_pc_msa + "
        "    pc_msa_less_0:mean_{0}_pc_msa + C(inv_msa_match):pc_msa_greater_0:mean_{0}_pc_msa + "
        "    C(inv_msa_match):pc_msa_less_0:mean_{0}_pc_msa + C(tp_gyear)",
    ),
])

In [3]:
# Baseline specifications: the entries of `fe` first, then those of `techsim`
# (a `techsim` key would win on a name clash).
base_m = dict(fe)
base_m.update(techsim)

In [4]:
# Ordered (label, formula suffix) pairs. Each suffix is text appended to a
# formula, so every one starts with " + ".
add_vars = [
    (label, " + " + terms)
    for label, terms in (
        ("PC FE", "C(tp_primclass_FE)"),
        ("Inv M", "C(common_pat_inv) + C(common_est_inv)"),
        ("Lawyer M", "C(lawyer_match)"),
        ("Exam M", "C(examiner_match)"),
        # FE
        ("MSA FE", "C(tp_inv_msa_FE)"),
        ("Lawyer FE", "C(tp_lawyer_FE)"),
        ("Exam FE", "C(tp_examiner_FE)"),
        # PC Match
        ("PC M", "C(primclass_match)"),
        # An earlier note on this entry read "Not enough True observations to
        # estimate" -- TODO confirm it is meant to stay enabled.
        ("NonP M", "C(common_npc_match)"),
    )
]

In [5]:
# Turn the per-step suffixes into cumulative ones: entry k carries the terms
# of steps 0..k concatenated in order. Each entry is
# (key prefix, label, cumulative suffix).
iter_mods = []
eqn = ""
for label, step_terms in add_vars:
    eqn = eqn + step_terms
    iter_mods.append((label + "-", label, eqn))

In [6]:
# Preview the first three entries: (key prefix, label, cumulative suffix).
iter_mods[:3]

[('PC FE-', 'PC FE', ' + C(tp_primclass_FE)'),
 ('Inv M-',
  'Inv M',
  ' + C(tp_primclass_FE) + C(common_pat_inv) + C(common_est_inv)'),
 ('Lawyer M-',
  'Lawyer M',
  ' + C(tp_primclass_FE) + C(common_pat_inv) + C(common_est_inv) + C(lawyer_match)')]

In [7]:
# Cross every cumulative control set with every baseline formula. Keys are
# "<prefix><baseline name>", values are the baseline formula with the
# cumulative suffix appended. Control sets vary in the outer loop, so all
# baselines for one control set are adjacent.
all_m = {
    prefix + base_name: base_formula + extra_terms
    for prefix, _label, extra_terms in iter_mods
    for base_name, base_formula in base_m.items()
}

# Add FE Match
all_m.update(comp_fe)

In [8]:
# Model names: the plain names first, then the same names prefixed with "N "
# for the normalized variants.
m1 = list(all_m)
m2 = ["N " + name for name in m1]
mn = m1 + m2

# Models, in the same order as the names above.
s1 = [all_m[name] for name in m1]

# Normalized variants. The first replace targets the placeholder followed by a
# space, which in the templates defined in this notebook only occurs on the
# left-hand side ("{0} ~"); the second prefixes every "mean_" regressor.
s2 = []
for formula in s1:
    normalized = formula.replace("{0} ", "norm_{0} ")
    s2.append(normalized.replace("mean_", "norm_mean_"))

sim_regs = s1 + s2

# Number of un-normalized equations (half the length of sim_regs).
n_eqns = len(s1)

In [10]:
# Create regression dictionary
# Collects the model names and the formula strings for each outcome measure,
# then pickles the whole dictionary to pathdir + reg_f.
regs = {}
pathdir = "DataStore/2018-10/Reg1016/"  # relative to the working directory set in the first cell
reg_f = "reg_model_1016.pkl"

regs["model_names"] = pd.Series(mn)
regs["n_eqns"] = n_eqns
# Fill the "{0}" placeholder with each similarity measure's column name.
regs["ldavecs"] = pd.Series([i.format("sim_ldavecs") for i in sim_regs])
regs["docvecs"] = pd.Series([i.format("sim_docvecs") for i in sim_regs])

# Additional outcomes re-use the docvecs formulas. The search text is
# "sim_docvecs " WITH a trailing space, so only the left-hand side
# ("sim_docvecs ~" / "norm_sim_docvecs ~") is swapped; regressors such as
# mean_sim_docvecs_pc are followed by "_" and therefore stay in the model.
for c in ["tp_pct_common_cited"]:
    regs[c] = pd.Series(
        [formula.replace("sim_docvecs ", "{0} ".format(c)) for formula in regs["docvecs"]]
    )

# Use a context manager so the file is flushed and closed deterministically
# (the previous bare open() left the handle to the garbage collector).
with open(pathdir + reg_f, "wb") as reg_file:
    pickle.dump(regs, reg_file)

In [18]:
# Specifications for the "perc_match_10" outcome. Each one adds a further
# C(..._FE) term to the previous one.
# NOTE: the four-space runs (and the double space in the "Lawyer FE" entry)
# are part of the stored strings and are kept verbatim.
pc10 = dict([
    ("PC FE-Year FE",
     "{0} ~ C(inv_msa_match) + C(tp_gyear) + C(tp_primclass_FE)"),
    ("MSA FE-Year FE",
     "{0} ~ C(inv_msa_match) + C(tp_gyear) + C(tp_primclass_FE) + C(tp_inv_msa_FE)"),
    ("Lawyer FE-Year FE",
     "{0} ~ C(inv_msa_match) + C(tp_gyear) + C(tp_primclass_FE)  + C(tp_inv_msa_FE)"
     "    + C(tp_lawyer_FE)"),
    ("Examiner FE-Year FE",
     "{0} ~ C(inv_msa_match) + C(tp_gyear) + C(tp_primclass_FE) + C(tp_inv_msa_FE)"
     "    + C(tp_lawyer_FE) + C(tp_examiner_FE)"),
])

# Normalized variants: "N " prefixes the name, "norm_" prefixes the formula
# (and therefore the outcome placeholder at its start).
pc10_N = dict(("N " + label, "norm_" + template) for label, template in pc10.items())

# Plain specifications first, then the normalized ones, with the placeholder
# filled in.
pc10 = {
    label: template.format("perc_match_10")
    for label, template in list(pc10.items()) + list(pc10_N.items())
}

# Names and formulas are stored as two parallel Series in the same order.
regs["JTH_model_names"] = pd.Series(list(pc10))
regs["JTH_cite"] = pd.Series([pc10[label] for label in pc10])

In [19]:
# Re-save the dictionary so the pickle also contains the JTH_* entries added
# in the previous cell. The context manager closes the file deterministically
# instead of leaving the handle to the garbage collector.
with open(pathdir + reg_f, "wb") as reg_file:
    pickle.dump(regs, reg_file)