In [40]:
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import re
import json
import pickle
import fastparquet
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2

In [41]:
pathdir = "DataStore/2018-10/Reg0930/"
reg_f = "reg_model_1002.pkl"

regs = pickle.load(open(pathdir+reg_f, "rb"))

# Model names to equations
regs["model_names_eqn"] = dict(zip(regs["model_names"].tolist(), regs["model_names"].index.tolist()))

In [48]:
# JTH Rep
jr = pd.read_csv("DataStore/2018-07-P3/JTHReg0727/JTH_res_out_0726.csv",index_col=0)

# FE results (0930 - uses full dataset) and Sim results (1003 - uses only those with sim pc, sim pc msa values)
rr = pd.read_csv("DataStore/2018-10/Reg0930/reg_pairs_out_0930.csv",index_col=0)
rr2 = pd.read_csv("DataStore/2018-10/Reg0930/reg_pairs_out_1003.csv",index_col=0)
rr = rr.loc[~rr["Model"].isin(rr2["Model"])].append(rr2, ignore_index=True)

# Use reg_model_1002 to assign model numbers
rr["Model Num"] = rr["Model"].map(regs["model_names_eqn"])

# Get rid of models without a number (squared ones excluded from analysis)
print(len(rr))
rr = rr.loc[rr["Model Num"].notnull()]
print(len(rr))
rr["samp"] = rr["samp"].apply(eval)
rr = pd.concat([jr,rr],axis=0)
# Reset index
rr = rr.reset_index(drop=True)
# Integer model number
rr["Model Num"] = rr["Model Num"].astype(int)

5122
1492


In [49]:
rr["id"] = list(zip(rr["samp"], rr["Model Num"]))
rr["id"] = rr["id"].astype(str)

# KS dict
ksd = rr[["id", "LKS"]].drop_duplicates()
ksd = dict(zip(ksd["id"], ksd["LKS"]))

# Sample dict
sd = rr[["id", "samp"]].drop_duplicates()
sd = dict(zip(sd["id"], sd["samp"]))

In [50]:
rr.head()

Unnamed: 0,1975-85,1985-95,1995-05,2005-15,LKS,Model,Model Num,index,samp,id
0,0.0530***,0.0624***,0.0646***,0.0740***,perc_match_10,"Perc Match Targ MSA, Year FE",0,"$I(MSA \, Match)$",JTH Rep,"('JTH Rep', 0)"
1,(0.0016),(0.0011),(0.0009),(0.0012),perc_match_10,"Perc Match Targ MSA, Year FE",0,,JTH Rep,"('JTH Rep', 0)"
2,58647,107358,185154,154619,perc_match_10,"Perc Match Targ MSA, Year FE",0,$N$,JTH Rep,"('JTH Rep', 0)"
3,0.02,0.03,0.03,0.02,perc_match_10,"Perc Match Targ MSA, Year FE",0,Adjusted $R^2$,JTH Rep,"('JTH Rep', 0)"
4,0.0523***,0.0616***,0.0642***,0.0729***,perc_match_10,"Perc Match Targ MSA, PC FE",1,"$I(MSA \, Match)$",JTH Rep,"('JTH Rep', 1)"


### 1. Results with Year FE and PC Controls

In [None]:
repl1 = pickle.load(open("DataStore/2018-07-P3/reg_names_dict.pkl", "rb"))
# repl2 = {k: "\multicolumn{2}{c}{"+v+"}" if len(v)>1 else v for k,v in repl1.items() }

In [74]:
# Index by each table: Norm/Raw, KS, Sample
tab_ind = {}
tab_ind[("raw", "pct_cite", "JTH", "No FE")] = ("JTH Rep", 0)
tab_ind[("norm", "pct_cite", "JTH", "No FE")] = ("JTH Rep", 2)
tab_ind[("raw", "sim_cite", "JTH", "No FE")] = ("JTH Targ Sim", 4)
tab_ind[("norm", "sim_cite", "JTH", "No FE")] = ("JTH Targ Sim", 6)

tab_ind[("raw", "pct_cite", "JTH", "PC FE")] = ("JTH Rep", 1)
tab_ind[("norm", "pct_cite", "JTH", "PC FE")] = ("JTH Rep", 3)
tab_ind[("raw", "sim_cite", "JTH", "PC FE")] = ("JTH Targ Sim", 5)
tab_ind[("norm", "sim_cite", "JTH", "PC FE")] = ("JTH Targ Sim", 7)

for samp in ["naics_name", "primclass"]:
    if samp == "naics_name":
#         mods = ["No FE", "PC FE", "All FE", "Inv M-PC FE", "Lawyer M-PC FE", "PC M-PC FE", "CC M-PC FE"]
        mods = ["No FE", "PC FE", "All FE"]
    else:
        mods = ["No FE", "PC FE", "Inv FE"]
#     for dm in ["docvecs", "ldavecs", "tp_pct_common_cited"]:
    for dm in ["docvecs", "tp_pct_common_cited"]:
        for mod in mods:
            tab_ind[("raw", dm, samp, mod)] = ((dm, samp), regs["model_names_eqn"][mod])
            tab_ind[("norm", dm, samp, mod)] = ((dm, samp), regs["model_names_eqn"]["N "+mod])

#### 1.1 Results by year

In [77]:
# Results by year
ygs = ["index", "1975-85", "1985-95", "1995-05", "2005-15"]
tabp = {}
for k in tab_ind.keys():
    
    i = str(tab_ind[k])
    mod = rr.loc[(rr["id"] == i), ["Model", "Model Num", "samp"]]\
    .drop_duplicates().values[0]
    
    if "JTH" in k:
#         ygs = ["index", "1975-85", "1985-95", "1995-05",]
        ygs = ["index", "1975-85", "1985-95", "1995-05", "2005-15"]
    else:
        ygs = ["index", "1975-85", "1985-95", "1995-05", "2005-15"]
        
    # Select appropriate years
    tab = rr.loc[(rr["id"] == i), ygs].fillna("").set_index("index")

    # Add other columns
    if "No FE" in k:
        tab.loc["Year FE"] = [True]*len(tab.columns)
        tab.loc["PC FE"] = [False]*len(tab.columns)
    else:
        tab.loc["Year FE"] = [True]*len(tab.columns)
        tab.loc["PC FE"] = [True]*len(tab.columns)

#     # Pandas tables
    tab = tab.reset_index()
    # Replace "index"
    tab = tab.rename(columns={"index":""})
    # Replace everything else
    tab = tab.replace(repl1).copy()

    tabp[k] = tab
tab_f = tabp      

In [78]:
for k,v in tab_f.items():
    if (k[0] == "norm"):
        print("\n"+str(k)+"\n"+"\n")
        ncols = len(v.columns)
        with pd.option_context("max_colwidth", 1000):
            print(v.to_latex(index=False,escape=False, column_format="lccc"))


('norm', 'pct_cite', 'JTH', 'No FE')


\begin{tabular}{lccc}
\toprule
                   &    1975-85 &    1985-95 &    1995-05 &    2005-15 \\
\midrule
 $I(MSA \, Match)$ &  0.2448*** &  0.2881*** &  0.2983*** &  0.3417*** \\
                   &   (0.0074) &   (0.0052) &   (0.0041) &   (0.0054) \\
               $N$ &      58647 &     107358 &     185154 &     154619 \\
    Adjusted $R^2$ &       0.02 &       0.03 &       0.03 &       0.02 \\
           Year FE &       True &       True &       True &       True \\
             PC FE &      False &      False &      False &      False \\
\bottomrule
\end{tabular}


('norm', 'sim_cite', 'JTH', 'No FE')


\begin{tabular}{lccc}
\toprule
                   &    1975-85 &   1985-95 &   1995-05 &   2005-15 \\
\midrule
 $I(MSA \, Match)$ &  0.0469*** &    0.0154 &   -0.0047 &    0.0157 \\
                   &   (0.0145) &  (0.0097) &  (0.0067) &  (0.0108) \\
               $N$ &      38541 &     69612 &    122217 &     52710 \\
    Adjuste

In [19]:
for k,v in tab_f.items():
    if (k[0] == "raw"):
        print("\n"+str(k)+"\n"+"\n")
        ncols = len(v.columns)
        with pd.option_context("max_colwidth", 1000):
            print(v.to_latex(index=False,escape=False, column_format="lccc"))


('raw', 'pct_cite', 'JTH', 'No FE')


\begin{tabular}{lccc}
\toprule
                   &    1975-85 &    1985-95 &    1995-05 \\
\midrule
 $I(MSA \, Match)$ &  0.0530*** &  0.0624*** &  0.0646*** \\
                   &   (0.0016) &   (0.0011) &   (0.0009) \\
               $N$ &      58647 &     107358 &     185154 \\
    Adjusted $R^2$ &       0.02 &       0.03 &       0.03 \\
           Year FE &       True &       True &       True \\
             PC FE &      False &      False &      False \\
\bottomrule
\end{tabular}


('raw', 'sim_cite', 'JTH', 'No FE')


\begin{tabular}{lccc}
\toprule
                   &    1975-85 &    1985-95 &    1995-05 \\
\midrule
 $I(MSA \, Match)$ &  0.0543*** &  0.0468*** &  0.0408*** \\
                   &   (0.0021) &   (0.0014) &   (0.0010) \\
               $N$ &      38541 &      69612 &     122217 \\
    Adjusted $R^2$ &       0.02 &       0.02 &       0.02 \\
           Year FE &       True &       True &       True \\
             PC FE &  

### 2. Results with Field Sim

In [63]:
repl1 = pickle.load(open("DataStore/2018-07-P3/reg_names_dict.pkl", "rb"))
dms = ["docvecs", "ldavecs", "tp_pct_common_cited"]
dms = ["docvecs"]
# repl2 = {k: "\multicolumn{2}{c}{"+v+"}" if len(v)>1 else v for k,v in repl1.items() }
            
# Add names for standard errors
ygs = ["index", "1975-85", "1985-95", "1995-05", "2005-15"]
rr2 = rr.copy()
se_index = rr2[ygs+["index"]].loc[rr2["index"].isnull()].index
above_var = rr2.loc[se_index-1, "index"].tolist()
rr2.loc[se_index, "index"] = ["(s.e.) "+v for v in above_var]

#### 2.1 Results by year

In [64]:
# Index by each table: Norm/Raw, KS, Sample
tab_ind = {}
for samp in ["naics_name", "primclass"]:
    mods_all = ["All FE-Sim PC", "All FE-Sim PC MSA", "All FE-Int PC", "All FE-Int PC MSA",
               "All FE-Sim PC MSA-Cb", "All FE-Int PC MSA-Add Cb", "All FE-Int PC MSA-Int Cb",
                "Inv FE-Sim PC MSA", "Inv FE-Int PC MSA",
               "Inv FE-Sim PC MSA-Cb", "Inv FE-Int PC MSA-Add Cb", "Inv FE-Int PC MSA-Int Cb",
               "Inv FE-Break Int PC MSA"]
    if samp == "naics_name":
        mods = mods_all[:7]
    else:
        mods = mods_all[7:]
    for dm in dms:
        for mod in mods:
            tab_ind[("raw", dm, samp, mod)] = ((dm, samp), regs["model_names_eqn"][mod])
            tab_ind[("norm", dm, samp, mod)] = ((dm, samp), regs["model_names_eqn"]["N "+mod])

tab_ind_yr = tab_ind

In [65]:
tab_no_c = {}
tab_c = {}
no_c_ind = ["C(common_est_inv)[T.1.0]", "C(common_pat_inv)[T.True]", "C(lawyer_match)[T.True]"]
no_c_ind = no_c_ind+["(s.e.) "+c for c in no_c_ind]

for k in tab_ind_yr.keys():
    
    i = str(tab_ind_yr[k])
    
    mod = rr2.loc[(rr2["id"] == i), ["Model", "Model Num", "samp"]]\
.drop_duplicates().values[0]

    tab = rr2.loc[(rr2["id"] == i), ygs].fillna("").set_index("index")

    # Add other columns
    tab.loc["Year FE"] = [True]*len(tab.columns)
    tab.loc["PC FE"] = [True]*len(tab.columns)

    if ("All FE" in k[3]) or ("Inv FE" in k[3]):
        tab.loc["Inv & Lawyer Match"] = [True]*len(tab.columns)
    else:
        tab.loc["Inv & Lawyer Match"] = [False]*len(tab.columns)

    # Create gaps
    tab = tab.fillna("").reset_index()

    #1. Create table without controls
    tab2 = tab.copy()
    tab2 = tab2.loc[~(tab2["index"].isin(no_c_ind))].reset_index(drop=True)
    # Get rid of se labels
    tab2.loc[tab2["index"].apply(lambda x: "(s.e.)" in x), "index"] = ""
    tab2 = tab2.rename(columns={"index":""})
    tab2 = tab2.replace(repl1).copy()

    tab_no_c[k] = tab2

    #2. Original tables with controls
    # Get rid of se labels
    tab.loc[tab["index"].apply(lambda x: "(s.e.)" in x), "index"] = ""
    tab = tab.rename(columns={"index":""})

    # Pandas tables
    tab = tab.replace(repl1).copy()
    tab_c[k] = tab
tab_c_yr = tab_c
tab_no_c_yr = tab_no_c

In [66]:
for k,v in tab_no_c_yr.items():
    if (k[0]=='norm') & (k[1]=='docvecs'):
        print(k)
        display(v)

('norm', 'docvecs', 'naics_name', 'All FE-Sim PC')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0170,0.0390***,0.0300***,0.0274***
1,,(0.0120),(0.0070),(0.0048),(0.0038)
2,"$I(Primclass\,Match)$",0.0133,0.0274**,0.0236***,0.0140**
3,,(0.0221),(0.0128),(0.0088),(0.0069)
4,"$sim_{DV}(pc_{i}, pc_{j})$",0.2805***,0.2913***,0.2816***,0.2932***
5,,(0.0090),(0.0055),(0.0040),(0.0036)
6,$N$,40323,110982,215861,344313
7,Adjusted $R^2$,0.12,0.13,0.13,0.08
8,Year FE,True,True,True,True
9,PC FE,True,True,True,True


('norm', 'docvecs', 'naics_name', 'All FE-Sim PC MSA')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0109,0.0353***,0.0187***,0.0170***
1,,(0.0121),(0.0071),(0.0048),(0.0038)
2,"$I(Primclass\,Match)$",0.2881***,0.3074***,0.2630***,0.2373***
3,,(0.0191),(0.0110),(0.0074),(0.0058)
4,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.1259***,0.1259***,0.1643***,0.1830***
5,,(0.0070),(0.0041),(0.0035),(0.0031)
6,$N$,40323,110982,215861,344313
7,Adjusted $R^2$,0.11,0.11,0.12,0.08
8,Year FE,True,True,True,True
9,PC FE,True,True,True,True


('norm', 'docvecs', 'naics_name', 'All FE-Int PC')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0178,0.0415***,0.0277***,0.0222***
1,,(0.0120),(0.0069),(0.0048),(0.0043)
2,"$I_{MSA} * sim_{DV}(pc_{i}, pc_{j})$",-0.0012,-0.0045,0.0050,0.0085*
3,,(0.0116),(0.0067),(0.0047),(0.0045)
4,"$I(Primclass\,Match)$",0.0133,0.0277**,0.0232***,0.0135*
5,,(0.0221),(0.0128),(0.0088),(0.0069)
6,"$sim_{DV}(pc_{i}, pc_{j})$",0.2808***,0.2925***,0.2804***,0.2908***
7,,(0.0095),(0.0057),(0.0042),(0.0039)
8,$N$,40323,110982,215861,344313
9,Adjusted $R^2$,0.12,0.13,0.13,0.08


('norm', 'docvecs', 'naics_name', 'All FE-Int PC MSA')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0114,0.0357***,0.0202***,0.0194***
1,,(0.0124),(0.0073),(0.0051),(0.0039)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0064,0.0040,0.0103*,0.0261***
3,,(0.0136),(0.0079),(0.0061),(0.0057)
4,"$I(Primclass\,Match)$",0.2871***,0.3068***,0.2618***,0.2345***
5,,(0.0191),(0.0110),(0.0075),(0.0058)
6,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.1245***,0.1250***,0.1618***,0.1764***
7,,(0.0076),(0.0045),(0.0038),(0.0034)
8,$N$,40323,110982,215861,344313
9,Adjusted $R^2$,0.11,0.11,0.12,0.08


('norm', 'docvecs', 'naics_name', 'All FE-Sim PC MSA-Cb')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0091,0.0324***,0.0153***,0.0141***
1,,(0.0121),(0.0070),(0.0048),(0.0038)
2,"$I(Primclass\,Match)$",0.2670***,0.2899***,0.2416***,0.2152***
3,,(0.0192),(0.0111),(0.0076),(0.0059)
4,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.1831***,0.1743***,0.2189***,0.2338***
5,,(0.0095),(0.0054),(0.0045),(0.0039)
6,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})^3$",-0.0111***,-0.0094***,-0.0123***,-0.0120***
7,,(0.0015),(0.0008),(0.0007),(0.0007)
8,$N$,40323,110982,215861,344313
9,Adjusted $R^2$,0.11,0.11,0.12,0.08


('norm', 'docvecs', 'naics_name', 'All FE-Int PC MSA-Add Cb')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0094,0.0328***,0.0168***,0.0167***
1,,(0.0124),(0.0072),(0.0051),(0.0039)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0031,0.0042,0.0104*,0.0275***
3,,(0.0136),(0.0078),(0.0061),(0.0057)
4,"$I(Primclass\,Match)$",0.2666***,0.2893***,0.2403***,0.2122***
5,,(0.0193),(0.0111),(0.0076),(0.0059)
6,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.1825***,0.1733***,0.2164***,0.2271***
7,,(0.0100),(0.0058),(0.0047),(0.0042)
8,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})^3$",-0.0111***,-0.0094***,-0.0123***,-0.0120***
9,,(0.0015),(0.0008),(0.0007),(0.0007)


('norm', 'docvecs', 'naics_name', 'All FE-Int PC MSA-Int Cb')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0100,0.0343***,0.0179***,0.0155***
1,,(0.0126),(0.0074),(0.0052),(0.0040)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0077,0.0145,0.0164**,0.0196***
3,,(0.0188),(0.0103),(0.0077),(0.0070)
4,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j}...",-0.0011,-0.0024,-0.0017,0.0023
5,,(0.0037),(0.0017),(0.0015),(0.0015)
6,"$I(Primclass\,Match)$",0.2667***,0.2895***,0.2403***,0.2122***
7,,(0.0193),(0.0111),(0.0076),(0.0059)
8,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.1815***,0.1705***,0.2148***,0.2292***
9,,(0.0103),(0.0060),(0.0049),(0.0044)


('norm', 'docvecs', 'primclass', 'Inv FE-Sim PC MSA')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0100,0.0248***,0.0316***,0.0157***
1,,(0.0143),(0.0081),(0.0054),(0.0041)
2,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0654***,0.0652***,0.0826***,0.0935***
3,,(0.0075),(0.0045),(0.0038),(0.0035)
4,$N$,38324,106152,205248,324142
5,Adjusted $R^2$,0.08,0.08,0.09,0.06
6,Year FE,True,True,True,True
7,PC FE,True,True,True,True
8,Inv & Lawyer Match,True,True,True,True


('norm', 'docvecs', 'primclass', 'Inv FE-Int PC MSA')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",-0.0047,0.0353***,0.0413***,0.0216***
1,,(0.0213),(0.0112),(0.0075),(0.0058)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0145,-0.0116,-0.0123,-0.0087
3,,(0.0175),(0.0099),(0.0077),(0.0068)
4,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0632***,0.0671***,0.0849***,0.0953***
5,,(0.0079),(0.0048),(0.0041),(0.0038)
6,$N$,38324,106152,205248,324142
7,Adjusted $R^2$,0.08,0.08,0.09,0.06
8,Year FE,True,True,True,True
9,PC FE,True,True,True,True


('norm', 'docvecs', 'primclass', 'Inv FE-Sim PC MSA-Cb')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0065,0.0214***,0.0278***,0.0126***
1,,(0.0144),(0.0082),(0.0054),(0.0041)
2,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0863***,0.0828***,0.1037***,0.1135***
3,,(0.0103),(0.0062),(0.0050),(0.0046)
4,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})^3$",-0.0031**,-0.0026***,-0.0034***,-0.0033***
5,,(0.0012),(0.0007),(0.0006),(0.0005)
6,$N$,38324,106152,205248,324142
7,Adjusted $R^2$,0.08,0.08,0.09,0.06
8,Year FE,True,True,True,True
9,PC FE,True,True,True,True


('norm', 'docvecs', 'primclass', 'Inv FE-Int PC MSA-Add Cb')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",-0.0210,0.0248**,0.0298***,0.0128**
1,,(0.0218),(0.0115),(0.0078),(0.0060)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0267,-0.0037,-0.0024,-0.0003
3,,(0.0179),(0.0101),(0.0080),(0.0070)
4,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0850***,0.0831***,0.1039***,0.1135***
5,,(0.0104),(0.0062),(0.0051),(0.0046)
6,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})^3$",-0.0035***,-0.0026***,-0.0034***,-0.0033***
7,,(0.0012),(0.0007),(0.0006),(0.0006)
8,$N$,38324,106152,205248,324142
9,Adjusted $R^2$,0.08,0.08,0.09,0.06


('norm', 'docvecs', 'primclass', 'Inv FE-Int PC MSA-Int Cb')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",-0.0296,0.0305**,0.0276***,0.0097
1,,(0.0241),(0.0123),(0.0083),(0.0065)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0412,-0.0146,0.0021,0.0064
3,,(0.0266),(0.0144),(0.0106),(0.0092)
4,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j}...",-0.0018,0.0015,-0.0007,-0.0010
5,,(0.0028),(0.0016),(0.0012),(0.0011)
6,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0826***,0.0851***,0.1030***,0.1121***
7,,(0.0108),(0.0065),(0.0053),(0.0049)
8,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})^3$",-0.0030**,-0.0030***,-0.0032***,-0.0029***
9,,(0.0014),(0.0009),(0.0008),(0.0007)


('norm', 'docvecs', 'primclass', 'Inv FE-Break Int PC MSA')


Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",-0.0085,0.0390***,0.0524***,0.0295***
1,,(0.0235),(0.0128),(0.0085),(0.0063)
2,"$I_{MSA} * I(sim_{DV,pc,MSA}>0)*sim_{DV}(pc_{i...",0.0142,-0.0167,-0.0267***,-0.0212***
3,,(0.0196),(0.0114),(0.0089),(0.0075)
4,"$I_{MSA} * I(sim_{DV,pc,MSA} \leq 0)*sim_{DV}(...",-0.1366*,-0.0299,0.0202,0.0118
5,,(0.0791),(0.0470),(0.0422),(0.0406)
6,"$I(sim_{DV,pc,MSA}>0)*sim_{DV}(pc_{i,MSA_i}, p...",0.0764***,0.0769***,0.1042***,0.1148***
7,,(0.0109),(0.0067),(0.0057),(0.0051)
8,"$I(sim_{DV,pc,MSA} \leq 0)*sim_{DV}(pc_{i,MSA_...",0.0301*,0.0446***,0.0402***,0.0456***
9,,(0.0183),(0.0105),(0.0088),(0.0084)


In [67]:
for k,v in tab_no_c_yr.items():
    if (k[0]=='norm') & (k[1]=='docvecs'):
        print("\n"+str(k)+"\n"+"\n")
        ncols = len(v.columns)
        with pd.option_context("max_colwidth", 1000):
            print(v.to_latex(index=False,escape=False, column_format="lcccc"))


('norm', 'docvecs', 'naics_name', 'All FE-Sim PC')


\begin{tabular}{lcccc}
\toprule
                            &    1975-85 &    1985-95 &    1995-05 &    2005-15 \\
\midrule
          $I(MSA \, Match)$ &     0.0170 &  0.0390*** &  0.0300*** &  0.0274*** \\
                            &   (0.0120) &   (0.0070) &   (0.0048) &   (0.0038) \\
      $I(Primclass\,Match)$ &     0.0133 &   0.0274** &  0.0236*** &   0.0140** \\
                            &   (0.0221) &   (0.0128) &   (0.0088) &   (0.0069) \\
 $sim_{DV}(pc_{i}, pc_{j})$ &  0.2805*** &  0.2913*** &  0.2816*** &  0.2932*** \\
                            &   (0.0090) &   (0.0055) &   (0.0040) &   (0.0036) \\
                        $N$ &      40323 &     110982 &     215861 &     344313 \\
             Adjusted $R^2$ &       0.12 &       0.13 &       0.13 &       0.08 \\
                    Year FE &       True &       True &       True &       True \\
                      PC FE &       True &       True &       True &       

#### 2.2 Collected results

In [29]:
# Index by each table: Norm/Raw, KS, Sample
tab_ind = {}
for samp in ["naics_name", "primclass"]:
    mods_all = ["All FE-Sim PC", "All FE-Int PC", "All FE-Sim PC MSA", "All FE-Int PC MSA",
               "Inv FE-Sim PC MSA", "Inv FE-Int PC MSA"]
    if samp == "naics_name":
        mods = mods_all[:4]
    else:
        mods = mods_all[4:]
    mods_n = dict(zip(mods_all, range(1,len(mods_all)+1)))
#     for dm in ["docvecs", "ldavecs", "tp_pct_common_cited"]:
    for dm in dms:
        for mod in mods:
            tab_ind[("raw", dm, samp, mod)] = ((dm, samp), regs["model_names_eqn"][mod])
            tab_ind[("norm", dm, samp, mod)] = ((dm, samp), regs["model_names_eqn"]["N "+mod])

tab_ind_collected = tab_ind

In [35]:
tab_no_c = {}
no_c_ind = ["C(common_est_inv)[T.1.0]", "C(common_pat_inv)[T.True]", "C(lawyer_match)[T.True]",
            "C(primclass_match)[T.True]"]
no_c_ind = no_c_ind+["(s.e.) "+c for c in no_c_ind]

for k in tab_ind_collected.keys():
    
    i = str(tab_ind_collected[k])
    
    mod = rr2.loc[(rr2["id"] == i), ["Model", "Model Num", "samp"]]\
.drop_duplicates().values[0]

    tab = rr2.loc[(rr2["id"] == i), ygs].fillna("").set_index("index")

    # Add other columns
    tab.loc["Year FE"] = [True]*len(tab.columns)
    tab.loc["PC FE"] = [True]*len(tab.columns)

    if ("All FE" in k[3]) or ("Inv FE" in k[3]):
        tab.loc["Inv \& Lawyer Match"] = [True]*len(tab.columns)
    else:
        tab.loc["Inv \& Lawyer Match"] = [False]*len(tab.columns)

    # Create gaps
    tab = tab.fillna("").reset_index()

    #1. Create table without controls
    tab2 = tab.copy()
    tab2 = tab2.loc[~(tab2["index"].isin(no_c_ind))]
    # Set back index
    tab2 = tab2.set_index("index")
    
#     #
#     # Get rid of se labels
#     tab2.loc[tab2["index"].apply(lambda x: "(s.e.)" in x), "index"] = ""
#     tab2 = tab2.rename(columns={"index":""})
#     tab2 = tab2.replace(repl1).copy()

    tab_no_c[k] = tab2


tab_collected = tab_no_c

In [36]:
tab_col = {}
tab_col["norm"] = {}
tab_col["raw"] = {}

In [37]:
keys = {}
keys["norm"] = [i for i in list(tab_collected.keys()) if "norm" in i]
keys["raw"] = [i for i in list(tab_collected.keys()) if "raw" in i]
for m in keys.keys():
    for i,k in enumerate(keys[m]):
        # If group matches m, and is even
        if (m in k) & (i % 2) == 0:
            # Existing index order
            ind1 = tab_collected[keys[m][i+1]].index

            # First table
            a = tab_collected[keys[m][i]].copy()
            a.columns = pd.MultiIndex.from_product([["({0})".format(mods_n[keys[m][i][3]])], a.columns.tolist()])
            b = tab_collected[keys[m][i+1]].copy()
            b.columns = pd.MultiIndex.from_product([["({0})".format(mods_n[keys[m][i+1][3]])], b.columns.tolist()])
            tab2 = pd.concat([a, b], axis=1)
            tab2 = tab2.reindex(ind1)

            # Fill nan
            tab2 = tab2.fillna("").copy()

            # Get rid of se labels
            tab2.index = ["" if "(s.e.)" in x else x for x in tab2.index ]
            # Rename index
            tab2.index.name = "$KS="+repl1[k[1]][1::]
            # Get rid of se labels
            tab2 = tab2.reset_index(col_level=1)

            tab2 = tab2.replace(repl1).copy()

            tab_col[m][k] = tab2

In [38]:
m = "norm"
for k,v in tab_col[m].items():
    print(k)
    display(v)

('norm', 'docvecs', 'naics_name', 'All FE-Sim PC')


Unnamed: 0_level_0,Unnamed: 1_level_0,(1),(1),(1),(1),(2),(2),(2),(2)
Unnamed: 0_level_1,"$KS=sim_{DV}(i,j)$",1975-85,1985-95,1995-05,2005-15,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0170,0.0390***,0.0300***,0.0274***,0.0178,0.0415***,0.0277***,0.0222***
1,,(0.0120),(0.0070),(0.0048),(0.0038),(0.0120),(0.0069),(0.0048),(0.0043)
2,"$I_{MSA} * sim_{DV}(pc_{i}, pc_{j})$",,,,,-0.0012,-0.0045,0.0050,0.0085*
3,,,,,,(0.0116),(0.0067),(0.0047),(0.0045)
4,"$sim_{DV}(pc_{i}, pc_{j})$",0.2805***,0.2913***,0.2816***,0.2932***,0.2808***,0.2925***,0.2804***,0.2908***
5,,(0.0090),(0.0055),(0.0040),(0.0036),(0.0095),(0.0057),(0.0042),(0.0039)
6,$N$,40323,110982,215861,344313,40323,110982,215861,344313
7,Adjusted $R^2$,0.12,0.13,0.13,0.08,0.12,0.13,0.13,0.08
8,Year FE,True,True,True,True,True,True,True,True
9,PC FE,True,True,True,True,True,True,True,True


('norm', 'docvecs', 'naics_name', 'All FE-Sim PC MSA')


Unnamed: 0_level_0,Unnamed: 1_level_0,(3),(3),(3),(3),(4),(4),(4),(4)
Unnamed: 0_level_1,"$KS=sim_{DV}(i,j)$",1975-85,1985-95,1995-05,2005-15,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0109,0.0353***,0.0187***,0.0170***,0.0114,0.0357***,0.0202***,0.0194***
1,,(0.0121),(0.0071),(0.0048),(0.0038),(0.0124),(0.0073),(0.0051),(0.0039)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",,,,,0.0064,0.0040,0.0103*,0.0261***
3,,,,,,(0.0136),(0.0079),(0.0061),(0.0057)
4,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.1259***,0.1259***,0.1643***,0.1830***,0.1245***,0.1250***,0.1618***,0.1764***
5,,(0.0070),(0.0041),(0.0035),(0.0031),(0.0076),(0.0045),(0.0038),(0.0034)
6,$N$,40323,110982,215861,344313,40323,110982,215861,344313
7,Adjusted $R^2$,0.11,0.11,0.12,0.08,0.11,0.11,0.12,0.08
8,Year FE,True,True,True,True,True,True,True,True
9,PC FE,True,True,True,True,True,True,True,True


('norm', 'docvecs', 'primclass', 'Inv FE-Sim PC MSA')


Unnamed: 0_level_0,Unnamed: 1_level_0,(5),(5),(5),(5),(6),(6),(6),(6)
Unnamed: 0_level_1,"$KS=sim_{DV}(i,j)$",1975-85,1985-95,1995-05,2005-15,1975-85,1985-95,1995-05,2005-15
0,"$I(MSA \, Match)$",0.0100,0.0248***,0.0316***,0.0157***,-0.0047,0.0353***,0.0413***,0.0216***
1,,(0.0143),(0.0081),(0.0054),(0.0041),(0.0213),(0.0112),(0.0075),(0.0058)
2,"$I_{MSA} * sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",,,,,0.0145,-0.0116,-0.0123,-0.0087
3,,,,,,(0.0175),(0.0099),(0.0077),(0.0068)
4,"$sim_{DV}(pc_{i,MSA_i}, pc_{j,MSA_j})$",0.0654***,0.0652***,0.0826***,0.0935***,0.0632***,0.0671***,0.0849***,0.0953***
5,,(0.0075),(0.0045),(0.0038),(0.0035),(0.0079),(0.0048),(0.0041),(0.0038)
6,$N$,38324,106152,205248,324142,38324,106152,205248,324142
7,Adjusted $R^2$,0.08,0.08,0.09,0.06,0.08,0.08,0.09,0.06
8,Year FE,True,True,True,True,True,True,True,True
9,PC FE,True,True,True,True,True,True,True,True


In [39]:
m = "norm"
for k,v in tab_col[m].items():
    print("\n"+str(k)+"\n"+"\n")
    with pd.option_context("max_colwidth", 1000):
        print(v.to_latex(index=False,escape=False, column_format="l|cccc|cccc", multicolumn_format="c"))


('norm', 'docvecs', 'naics_name', 'All FE-Sim PC')


\begin{tabular}{l|cccc|cccc}
\toprule
                                      & \multicolumn{4}{c}{(1)} & \multicolumn{4}{c}{(2)} \\
                   $KS=sim_{DV}(i,j)$ &    1975-85 &    1985-95 &    1995-05 &    2005-15 &    1975-85 &    1985-95 &    1995-05 &    2005-15 \\
\midrule
                    $I(MSA \, Match)$ &     0.0170 &  0.0390*** &  0.0300*** &  0.0274*** &     0.0178 &  0.0415*** &  0.0277*** &  0.0222*** \\
                                      &   (0.0120) &   (0.0070) &   (0.0048) &   (0.0038) &   (0.0120) &   (0.0069) &   (0.0048) &   (0.0043) \\
 $I_{MSA} * sim_{DV}(pc_{i}, pc_{j})$ &            &            &            &            &    -0.0012 &    -0.0045 &     0.0050 &    0.0085* \\
                                      &            &            &            &            &   (0.0116) &   (0.0067) &   (0.0047) &   (0.0045) \\
           $sim_{DV}(pc_{i}, pc_{j})$ &  0.2805*** &  0.2913*** &  0.2816*** &  0

In [34]:
m = "raw"
for k,v in tab_col[m].items():
    print("\n"+str(k)+"\n"+"\n")
    with pd.option_context("max_colwidth", 1000):
        print(v.to_latex(index=False,escape=False, column_format="l|cccc|cccc", multicolumn_format="c"))


('raw', 'docvecs', 'naics_name', 'All FE-Sim PC')


\begin{tabular}{l|cccc|cccc}
\toprule
                                      & \multicolumn{4}{c}{(1)} & \multicolumn{4}{c}{(2)} \\
                   $KS=sim_{DV}(i,j)$ &    1975-85 &    1985-95 &    1995-05 &    2005-15 &    1975-85 &    1985-95 &    1995-05 &    2005-15 \\
\midrule
                    $I(MSA \, Match)$ &     0.0023 &  0.0053*** &  0.0041*** &  0.0037*** &     0.0028 &  0.0072*** &     0.0020 &    -0.0000 \\
                                      &   (0.0016) &   (0.0009) &   (0.0006) &   (0.0005) &   (0.0049) &   (0.0027) &   (0.0019) &   (0.0020) \\
 $I_{MSA} * sim_{DV}(pc_{i}, pc_{j})$ &            &            &            &            &    -0.0038 &    -0.0141 &     0.0158 &    0.0270* \\
                                      &            &            &            &            &   (0.0367) &   (0.0211) &   (0.0149) &   (0.0144) \\
                $I(Primclass\,Match)$ &     0.0018 &   0.0037** &  0.0032*** &   0

#### 2.3 Localization estimate by model for norm, docvecs

In [18]:
local_est = pd.DataFrame()
for k,v in tab_no_c_yr.items():
    if (k[0] == "norm") & (k[1] == "docvecs"):
        v2 = v.loc[[0,1]]
        v2[""] = mods_n[k[3]]
        v2["var"] = ["est", "se"]
        v2["Model Name"] = k[3]
        local_est = local_est.append(v2, ignore_index=True)
        
local_est = local_est.sort_values(["", "var"]).reset_index(drop=True).drop("var",1)
local_est[""] = local_est[""].apply(lambda x: "({0})".format(x))
local_est.loc[list(range(1,len(local_est),2)), ""] = ""

KeyError: 'All FE-Sim PC MSA-Cb'

In [353]:
local_est

Unnamed: 0,Unnamed: 1,1975-85,1985-95,1995-05,2005-15,Model Name
0,(1),0.0209*,0.0407***,0.0334***,0.0287***,All FE-Sim PC
1,,(0.0112),(0.0066),(0.0046),(0.0037),All FE-Sim PC
2,(2),0.0207*,0.0404***,0.0333***,0.0287***,All FE-Sim PC Qd
3,,(0.0112),(0.0066),(0.0046),(0.0037),All FE-Sim PC Qd
4,(3),-0.0480***,-0.0183**,-0.0328***,-0.0224***,All FE-Sim PC MSA
5,,(0.0124),(0.0072),(0.0049),(0.0039),All FE-Sim PC MSA
6,(4),-0.0599***,-0.0344***,-0.0491***,-0.0381***,All FE-Sim PC MSA Qd
7,,(0.0125),(0.0073),(0.0050),(0.0039),All FE-Sim PC MSA Qd
8,(5),0.0208*,0.0401***,0.0319***,0.0219***,All FE-Int PC
9,,(0.0111),(0.0065),(0.0046),(0.0038),All FE-Int PC
