In [1]:
# Imports and session-wide setup for the regression-results notebook.
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import re
import json
import pickle
import fastparquet
import os
# NOTE(review): hard-coded absolute working directory; every relative
# "DataStore/..." path used in later cells is resolved against it.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
# Seed value; not referenced by any of the cells visible in this notebook section.
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Shared scaler instance; not used by the table-building cells visible here.
scaler = StandardScaler()
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2

  from pandas.core import datetools


Based on: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/4a-CombinedResults-1016.ipynb, https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/4b-RegCoefficients-1016.ipynb

In [2]:
# Location of the pickled regression-model metadata (relative to the working directory).
pathdir = "DataStore/2018-10/Reg1016/"
reg_f = "reg_model_1016.pkl"

# Read the pickle through a context manager so the file handle is closed
# immediately instead of being left open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(pathdir + reg_f, "rb") as reg_file:
    regs = pickle.load(reg_file)

# Model names to equations
# Inverts regs["model_names"]: its values become the keys and its index labels
# become the values (presumably model name -> model number; used below to fill "Model Num").
regs["model_names_eqn"] = dict(zip(regs["model_names"].tolist(), regs["model_names"].index.tolist()))

In [82]:
# JTH Rep
# Load the JTH replication output and relabel it so its row/model names match
# the ones used in the main results file loaded below.
jr = pd.read_csv("DataStore/2018-10/Reg1016/JTH_cite_reg_1026.csv", index_col=0)
# "\\," spells the literal backslash-comma explicitly; a bare "\," is an invalid
# escape sequence that newer Python versions warn about.
jr = jr.replace({"$I(MSA \\, Match)$": "C(inv_msa_match)[T.True]",
                 "Examiner FE-Year FE": "Exam FE-Year FE",
                 "N Examiner FE-Year FE": "N Exam FE-Year FE"})
jr["samp"] = "JTH Rep"

# FE results (0930 - uses full dataset) and Sim results (1003 - uses only those with sim pc, sim pc msa values)
rr = pd.read_csv("DataStore/2018-10/Reg1016/reg_pairs_out_HC1_1016.csv", index_col=0)

# Use the mapping in regs["model_names_eqn"] to assign model numbers
rr["Model Num"] = rr["Model"].map(regs["model_names_eqn"])

# Get rid of models without a number (squared ones excluded from analysis)
print(len(rr))
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the frame read from disk.
rr = rr.loc[rr["Model Num"].notnull()].copy()
print(len(rr))
# NOTE(review): eval() executes whatever expression text is stored in the "samp"
# column of the CSV. If the values are plain Python literals, ast.literal_eval
# would be the safe equivalent - confirm before switching.
rr["samp"] = rr["samp"].apply(eval)
rr = pd.concat([jr, rr], axis=0)
# Reset index (the row-above lookup further down relies on consecutive integer labels)
rr = rr.reset_index(drop=True)
# Integer model number
rr["Model Num"] = rr["Model Num"].astype(int)
# Equation id: "(samp, LKS, Model)" as a plain string, used later to select table rows
rr["id"] = ["({0}, {1}, {2})".format(i, j, k) for i, j, k in zip(rr["samp"], rr["LKS"], rr["Model"])]

# Add S.E.
# Rows whose "index" label is missing are labelled "SE <label of the row directly above>".
se_ind = rr.loc[rr["index"].isnull()].index.tolist()

for ind in se_ind:
    rr.loc[ind, "index"] = "SE {0}".format(rr.loc[ind-1, "index"])

4512
4512


In [65]:
# Display the distinct values of the "Model" column in rr.
rr["Model"].unique()

array(['PC FE-Year FE', 'MSA FE-Year FE', 'Lawyer FE-Year FE',
       'Examiner FE-Year FE', 'N PC FE-Year FE', 'N MSA FE-Year FE',
       'N Lawyer FE-Year FE', 'N Examiner FE-Year FE', 'N PC FE-Sim PC',
       'N PC FE-Int PC', 'N PC FE-Sim PC MSA', 'N PC FE-Int PC MSA',
       'N PC FE-Break Int PC MSA', 'N Inv M-Year FE', 'N Inv M-Sim PC',
       'N Inv M-Int PC', 'N Inv M-Sim PC MSA', 'N Inv M-Int PC MSA',
       'N Inv M-Break Int PC MSA', 'N Lawyer M-Year FE',
       'N Lawyer M-Sim PC', 'N Lawyer M-Int PC', 'N Lawyer M-Sim PC MSA',
       'N Lawyer M-Int PC MSA', 'N Lawyer M-Break Int PC MSA',
       'N Exam M-Year FE', 'N Exam M-Sim PC', 'N Exam M-Int PC',
       'N Exam M-Sim PC MSA', 'N Exam M-Int PC MSA',
       'N Exam M-Break Int PC MSA', 'N MSA FE-Sim PC', 'N MSA FE-Int PC',
       'N MSA FE-Sim PC MSA', 'N MSA FE-Int PC MSA',
       'N MSA FE-Break Int PC MSA', 'N Lawyer FE-Sim PC',
       'N Lawyer FE-Int PC', 'N Lawyer FE-Sim PC MSA',
       'N Lawyer FE-Int PC MSA', 

### 1. Year and PC Controls

In [100]:
# Model selection for the "Year and PC Controls" tables.
# Every entry is an equation id string of the form "(samp, LKS, Model)"; the
# id lists and the label list are paired up by position.

# Row labels shown in the tables, in the same order as the id lists
rm_pc_n = [
    "Citations Benchmark",
    "Pct Common Cited Pats",
    "Sim DocVecs, PC FE",
]
# The NAICS table uses the very same label list object
rm_n_n = rm_pc_n

# Primclass sample, all with the "N PC FE-Year FE" model
rm_pc = [
    "(JTH Rep, norm_perc_match_10, N PC FE-Year FE)",  # JTH replication
    "(('tp_pct_common_cited', 'primclass'), norm_tp_pct_common_cited, N PC FE-Year FE)",  # pct common cited
    "(('docvecs', 'primclass'), norm_sim_docvecs, N PC FE-Year FE)",  # docvecs similarity
]

# NAICS sample, all with the "N PC FE-Year FE" model
rm_n = [
    "(JTH Rep, norm_perc_match_10, N PC FE-Year FE)",  # JTH replication
    "(('tp_pct_common_cited', 'naics_name'), norm_tp_pct_common_cited, N PC FE-Year FE)",  # pct common cited
    "(('docvecs', 'naics_name'), norm_sim_docvecs, N PC FE-Year FE)",  # docvecs similarity
]

# Equation id -> row label, primclass entries first, then the NAICS ones
rm_d = {model_id: label for model_id, label in zip(rm_pc, rm_pc_n)}
rm_d.update(zip(rm_n, rm_n_n))

### NAICS Table

In [101]:
# Localization estimates
# For each sample, select the rows of rr belonging to the chosen equation ids,
# keep the MSA-match coefficient, its standard error, N and adjusted R^2, and
# shape them into a table with one column per period.
coef_row = "C(inv_msa_match)[T.True]"
se_row = "SE C(inv_msa_match)[T.True]"
stat_rows = ["$N$", "Adjusted $R^2$"]
# Doubled backslash keeps the literal text "\&" (LaTeX ampersand) without relying
# on an invalid Python escape sequence, which newer Python versions warn about.
fe_col = "Year \\& PC FE"

t = {}
for samp_name, model_ids in [("naics_name", rm_n), ("primclass", rm_pc)]:
    keep = rr["id"].isin(model_ids) & rr["index"].isin([coef_row, se_row] + stat_rows)
    rc = rr.loc[keep].copy()
    # Replace equation ids by their display labels
    rc["id"] = rc["id"].map(rm_d)
    # N and adjusted R^2 rows are labelled with their own row name instead
    is_stat = rc["index"].isin(stat_rows)
    rc.loc[is_stat, "id"] = rc.loc[is_stat, "index"]
    # Fixed effects columns (built here, but not part of `cols` below, so they
    # do not appear in the final table)
    rc[fe_col] = ""
    rc["Other Controls"] = ""
    # Standard-error rows get an empty label so they sit under their coefficient
    rc.loc[rc["index"] == se_row, "id"] = ""
    rc.loc[rc["index"] == coef_row, fe_col] = True
    rc.loc[rc["index"] == coef_row, "Other Controls"] = False
    cols = ['id', '1975-85', '1985-95', '1995-05', '2005-15']
    rc = rc[cols]
    # Blank header for the label column
    rc = rc.rename(columns={"id": ""})
    t[samp_name] = rc
t1 = t

In [102]:
# Emit each section-1 table as LaTeX, preceded by the name of its sample.
for samp_name, table in t1.items():
    print(samp_name)
    latex_src = table.to_latex(index=False, escape=False, column_format="lcccc")
    print(latex_src)

naics_name
\begin{tabular}{lcccc}
\toprule
                       &    1975-85 &    1985-95 &    1995-05 &    2005-15 \\
\midrule
   Citations Benchmark &  0.2422*** &  0.2849*** &  0.2968*** &  0.3365*** \\
                       &   (0.0074) &   (0.0051) &   (0.0041) &   (0.0054) \\
                   $N$ &      58647 &     107358 &     185154 &     154619 \\
        Adjusted $R^2$ &       0.03 &       0.05 &       0.05 &       0.05 \\
 Pct Common Cited Pats &  0.0321*** &  0.0737*** &  0.0802*** &  0.0689*** \\
                       &   (0.0056) &   (0.0075) &   (0.0064) &   (0.0050) \\
                   $N$ &     188668 &     273463 &     384536 &     548061 \\
        Adjusted $R^2$ &       0.00 &       0.00 &       0.00 &       0.00 \\
    Sim DocVecs, PC FE &  0.0309*** &  0.0585*** &  0.0596*** &  0.0555*** \\
                       &   (0.0053) &   (0.0043) &   (0.0034) &   (0.0030) \\
                   $N$ &     192773 &     280962 &     437405 &     563881 \\
        Adju

### 2. All further controls

In [103]:
# Model selection for the "All further controls" tables.
# Every entry is an equation id string of the form "(samp, LKS, Model)"; the
# id lists and the label list are paired up by position.

# Row labels shown in the tables, in the same order as the id lists
rm_pc_n = [
    "Citations Benchmark",
    "Pct Common Cited Pats",
    "Sim DocVecs, PC FE",
]
# The NAICS table uses the very same label list object
rm_n_n = rm_pc_n

# Primclass sample, all with the "N Exam FE-Year FE" model
rm_pc = [
    "(JTH Rep, norm_perc_match_10, N Exam FE-Year FE)",  # JTH replication
    "(('tp_pct_common_cited', 'primclass'), norm_tp_pct_common_cited, N Exam FE-Year FE)",  # pct common cited
    "(('docvecs', 'primclass'), norm_sim_docvecs, N Exam FE-Year FE)",  # docvecs similarity
]

# NAICS sample
# NOTE(review): the two NAICS similarity rows use "N PC M-Year FE" while the
# benchmark and the primclass rows use "N Exam FE-Year FE" - confirm this is intended.
rm_n = [
    "(JTH Rep, norm_perc_match_10, N Exam FE-Year FE)",  # JTH replication
    "(('tp_pct_common_cited', 'naics_name'), norm_tp_pct_common_cited, N PC M-Year FE)",  # pct common cited
    "(('docvecs', 'naics_name'), norm_sim_docvecs, N PC M-Year FE)",  # docvecs similarity
]

# Equation id -> row label, primclass entries first, then the NAICS ones
rm_d = {model_id: label for model_id, label in zip(rm_pc, rm_pc_n)}
rm_d.update(zip(rm_n, rm_n_n))

In [104]:
# Localization estimates
# For each sample, select the rows of rr belonging to the chosen equation ids,
# keep the MSA-match coefficient, its standard error, N and adjusted R^2, and
# shape them into a table with one column per period.
coef_row = "C(inv_msa_match)[T.True]"
se_row = "SE C(inv_msa_match)[T.True]"
stat_rows = ["$N$", "Adjusted $R^2$"]
# Doubled backslash keeps the literal text "\&" (LaTeX ampersand) without relying
# on an invalid Python escape sequence, which newer Python versions warn about.
fe_col = "Year \\& PC FE"

t = {}
for samp_name, model_ids in [("naics_name", rm_n), ("primclass", rm_pc)]:
    keep = rr["id"].isin(model_ids) & rr["index"].isin([coef_row, se_row] + stat_rows)
    rc = rr.loc[keep].copy()
    # Replace equation ids by their display labels
    rc["id"] = rc["id"].map(rm_d)
    # N and adjusted R^2 rows are labelled with their own row name instead
    is_stat = rc["index"].isin(stat_rows)
    rc.loc[is_stat, "id"] = rc.loc[is_stat, "index"]
    # Fixed effects columns (built here, but not part of `cols` below, so they
    # do not appear in the final table)
    rc[fe_col] = ""
    rc["Other Controls"] = ""
    # Standard-error rows get an empty label so they sit under their coefficient
    rc.loc[rc["index"] == se_row, "id"] = ""
    rc.loc[rc["index"] == coef_row, fe_col] = True
    rc.loc[rc["index"] == coef_row, "Other Controls"] = False
    cols = ['id', '1975-85', '1985-95', '1995-05', '2005-15']
    rc = rc[cols]
    # Blank header for the label column
    rc = rc.rename(columns={"id": ""})
    t[samp_name] = rc
t2 = t

In [105]:
# Emit each section-2 table as LaTeX, preceded by the name of its sample.
for samp_name, table in t2.items():
    print(samp_name)
    latex_src = table.to_latex(index=False, escape=False, column_format="lcccc")
    print(latex_src)

naics_name
\begin{tabular}{lcccc}
\toprule
                       &    1975-85 &    1985-95 &    1995-05 &    2005-15 \\
\midrule
   Citations Benchmark &  0.2340*** &  0.2783*** &  0.2864*** &  0.3215*** \\
                       &   (0.0073) &   (0.0050) &   (0.0040) &   (0.0053) \\
                   $N$ &      58647 &     107358 &     185154 &     154619 \\
        Adjusted $R^2$ &       0.04 &       0.06 &       0.08 &       0.07 \\
 Pct Common Cited Pats &     0.0051 &    0.0075* &  0.0134*** &  0.0125*** \\
                       &   (0.0038) &   (0.0045) &   (0.0036) &   (0.0028) \\
                   $N$ &     188668 &     273463 &     384536 &     548061 \\
        Adjusted $R^2$ &       0.09 &       0.15 &       0.14 &       0.16 \\
    Sim DocVecs, PC FE &  0.0171*** &  0.0354*** &  0.0344*** &  0.0333*** \\
                       &   (0.0053) &   (0.0043) &   (0.0034) &   (0.0030) \\
                   $N$ &     192773 &     280962 &     437405 &     563881 \\
        Adju