In [1]:
import ast
import datetime
import json
import logging
import os
import pickle
import re
import timeit

import fastparquet
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.iolib.summary2 as summary2
from sklearn.preprocessing import StandardScaler

os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
seed = 3
scaler = StandardScaler()

  from pandas.core import datetools


### 1. JTH summary tables

In [46]:
# Load the pickled JTH replication samples. The summary loop below reads the
# keys "PC", "Sim" and "Lawyer" and treats each value as a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted,
# project-generated files.
# The context manager closes the file handle, which the bare open() call leaked.
with open("DataStore/2018-11/jth_rep_control_long_dict_1112.pkl", "rb") as fh:
    tsl = pickle.load(fh)

In [50]:
# Build the JTH summary table `rt`: for every control-selection type, compare
# the mean of "perc_match_10" between rows with inv_msa_match == True
# (labelled "Target") and rows with inv_msa_match == False (labelled
# "Control"), by year group, with a Welch t-test p-value and the row count.
outcome = "perc_match_10"
target_lbl = "Target, Pct Cite in Target MSA"
control_lbl = "Control, Pct Cite in Target MSA"
ind = ["Type", target_lbl, control_lbl, "Ratio", "$p$-value", "$N$"]

# Collect one block per type and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copies the frame each time.
blocks = []
for k in ["PC", "Sim", "Lawyer"]:
    v = tsl[k]
    v2 = v[[outcome, "year_group", "inv_msa_match"]].groupby(["year_group", "inv_msa_match"]).mean().reset_index()

    # Raw outcome values per year group, split by MSA match.
    # `== True` / `== False` is kept on purpose so missing flags fall in neither group.
    is_target = v["inv_msa_match"] == True
    is_control = v["inv_msa_match"] == False
    t_pct = {g: s.values for g, s in v.loc[is_target].groupby("year_group")[outcome]}
    c_pct = {g: s.values for g, s in v.loc[is_control].groupby("year_group")[outcome]}

    tc = v2.loc[v2["inv_msa_match"] == True, outcome].values
    cc = v2.loc[v2["inv_msa_match"] == False, outcome].values
    yg = v2.loc[v2["inv_msa_match"] == True, "year_group"].values

    # Key the p-values and counts on `yg` so every column of the table is
    # guaranteed to describe the same year group (the original relied on dict
    # and groupby iteration order happening to agree).
    p = [sp.stats.ttest_ind(t_pct[g], c_pct[g], equal_var=False, nan_policy="omit")[1] for g in yg]
    n = v.groupby("year_group").size().reindex(yg).values

    r = pd.DataFrame({"Year Group": yg,
                      target_lbl: tc,
                      control_lbl: cc,
                      "$p$-value": p, "$N$": n})

    # The 2005-15 group is excluded from the JTH table.
    r = r.loc[r["Year Group"] != "2005-15"].set_index("Year Group")

    r["Ratio"] = r[target_lbl] / r[control_lbl]
    r = r.round(3)
    r["Type"] = k

    # Rows become the statistics, columns the year groups.
    r = r[ind].T
    r.loc["$N$"] = r.loc["$N$"].astype(int).astype(str)
    blocks.append(r)

rt = pd.concat(blocks)

In [51]:
# One left-aligned column for the row labels plus one centred column per year
# group. Deriving the spec from the table keeps it in step with the data: the
# hard-coded "lcccc" declared five columns although only three year groups
# remain once 2005-15 is dropped.
print(rt.to_latex(index=True, escape=False, column_format="l" + "c" * rt.shape[1]))

\begin{tabular}{lcccc}
\toprule
Year Group & 1975-85 & 1985-95 & 1995-05 \\
\midrule
Type                            &      PC &      PC &      PC \\
Target, Pct Cite in Target MSA  &   0.091 &   0.097 &    0.11 \\
Control, Pct Cite in Target MSA &   0.038 &   0.035 &   0.045 \\
Ratio                           &   2.411 &   2.792 &   2.429 \\
$p$-value                       &       0 &       0 &       0 \\
Obs.                            &   58647 &  107358 &  185154 \\
Type                            &     Sim &     Sim &     Sim \\
Target, Pct Cite in Target MSA  &   0.092 &   0.097 &    0.11 \\
Control, Pct Cite in Target MSA &   0.048 &   0.041 &   0.054 \\
Ratio                           &   1.923 &   2.397 &   2.047 \\
$p$-value                       &       0 &       0 &       0 \\
Obs.                            &   36917 &   67332 &  117137 \\
Type                            &  Lawyer &  Lawyer &  Lawyer \\
Target, Pct Cite in Target MSA  &   0.094 &     0.1 &   0.115 \\
Contr

### 2. JTH Regression tables

Code based on: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/4a-CombinedResults-1026.ipynb

In [89]:
# Pickled regression objects (per the file name; not used further in this cell).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted,
# project-generated files. The context manager closes the handle afterwards.
with open("DataStore/2018-10/Reg1016/reg_model_1016.pkl", "rb") as fh:
    regs = pickle.load(fh)

# JTH replication regression output
jr = pd.read_csv("DataStore/2018-11/JTHReg1112/jth_rep_reg_1112.csv", index_col=0)

# Other regressions
rr = pd.read_csv("DataStore/2018-10/Reg1016/reg_pairs_out_HC1_1016.csv", index_col=0)


def _samp_label(raw):
    """Turn the stored repr of a (first, second) pair into the label "second-first".

    ast.literal_eval parses the literal without executing code, unlike eval,
    and the string is parsed once instead of twice.
    """
    pair = ast.literal_eval(raw)
    return pair[1] + "-" + pair[0]


# Change samp
rr["samp"] = rr["samp"].apply(_samp_label)

# Stack the JTH rows on top of the other regressions and renumber the rows
# 0..n-1; the S.E. labelling below depends on that positional index.
rr = pd.concat([jr, rr], axis=0)
rr = rr.reset_index(drop=True)

# Equation id (str.format already yields strings, so no cast is needed)
rr["id"] = ["({0}, {1}, {2})".format(i, j, k) for i, j, k in zip(rr["samp"], rr["LKS"], rr["Model"])]

# Add S.E.
# Rows whose "index" label is missing are labelled "SE <label of the row above>".
# NOTE(review): assumes a missing label never occurs in row 0 -- confirm.
se_ind = rr.loc[rr["index"].isnull()].index.tolist()

for row in se_ind:
    rr.loc[row, "index"] = "SE {0}".format(rr.loc[row - 1, "index"])

In [90]:
# Regression ids (as built in the "id" column) for the three JTH replication
# samples under the primary-class fixed-effects model ...
j_ind_pc = [
    "(JTH Rep-PC, norm_perc_match_10, N PC FE-Year FE)",
    "(JTH Rep-Sim, norm_perc_match_10, N PC FE-Year FE)",
    "(JTH Rep-Lawyer, norm_perc_match_10, N PC FE-Year FE)",
]
# ... and under the examiner fixed-effects model.
j_ind_ex = [
    "(JTH Rep-PC, norm_perc_match_10, N Examiner FE-Year FE)",
    "(JTH Rep-Sim, norm_perc_match_10, N Examiner FE-Year FE)",
    "(JTH Rep-Lawyer, norm_perc_match_10, N Examiner FE-Year FE)",
]
# Display labels, in the same PC / Sim / Lawyer order as the id lists.
mod_n = [
    "Control Selection: Standard JTH",
    "Control Selection: Similarity",
    "Control Selection: Lawyer",
]
# Map every regression id to its display label; both specifications share the labels.
mod_dict = {
    reg_id: label
    for id_list in (j_ind_pc, j_ind_ex)
    for reg_id, label in zip(id_list, mod_n)
}

In [91]:
# Localization estimates
# For each fixed-effects specification, keep the MSA-match coefficient, its
# standard error, N and adjusted R^2 for the three JTH samples, and lay them
# out as one printable table per specification.
coef_row = "C(inv_msa_match)[T.True]"
se_row = "SE C(inv_msa_match)[T.True]"
stat_rows = ["$N$", "Adjusted $R^2$"]

# Raw strings: "\&" is a LaTeX-escaped ampersand, not a Python escape
# sequence (the non-raw form raises an invalid-escape warning).
fe_col = r"Year \& PC FE"
ctrl_col = "Other Controls"

t = {}
for fe_name, reg_ids in [("PC", j_ind_pc), ("Examiner", j_ind_ex)]:
    rc = rr.loc[rr["id"].isin(reg_ids) & rr["index"].isin([coef_row, se_row] + stat_rows)].copy()
    # Replace the regression id with its display label ...
    rc["id"] = rc["id"].map(mod_dict)
    # ... except on the N / R^2 rows, which are labelled by the statistic itself,
    is_stat = rc["index"].isin(stat_rows)
    rc.loc[is_stat, "id"] = rc.loc[is_stat, "index"]
    # Fixed effects columns (only shown when the wider `cols` list below is used)
    rc[fe_col] = ""
    rc[ctrl_col] = ""
    # and on the standard-error rows, which get an empty label.
    rc.loc[(rc["index"] == se_row), ['id']] = ""
    rc.loc[(rc["index"] == coef_row), [fe_col]] = True
    rc.loc[(rc["index"] == coef_row), [ctrl_col]] = False
    # cols = ['id', '1975-85', '1985-95', '1995-05', '2005-15', fe_col, ctrl_col]
    cols = ['id', '1975-85', '1985-95', '1995-05']
    rc = rc[cols]
    rc = rc.rename(columns={"id": ""})
    t[fe_name] = rc
t1 = t

In [92]:
# Emit one LaTeX table per fixed-effects specification, preceded by its name.
for spec_name, spec_table in t1.items():
    print(spec_name)
    latex_src = spec_table.to_latex(index=False, escape=False, column_format="lccc")
    print(latex_src)

PC
\begin{tabular}{lccc}
\toprule
                                 &    1975-85 &    1985-95 &    1995-05 \\
\midrule
 Control Selection: Standard JTH &  0.2422*** &  0.2849*** &  0.2968*** \\
                                 &   (0.0074) &   (0.0051) &   (0.0041) \\
                             $N$ &      58647 &     107358 &     185154 \\
                  Adjusted $R^2$ &       0.03 &       0.05 &       0.05 \\
   Control Selection: Similarity &  0.2029*** &  0.2681*** &  0.2621*** \\
                                 &   (0.0100) &   (0.0067) &   (0.0054) \\
                             $N$ &      36917 &      67332 &     117137 \\
                  Adjusted $R^2$ &       0.02 &       0.03 &       0.04 \\
       Control Selection: Lawyer &  0.0806*** &  0.0935*** &  0.1150*** \\
                                 &   (0.0136) &   (0.0086) &   (0.0069) \\
                             $N$ &      22914 &      51837 &      85855 \\
                  Adjusted $R^2$ &       0.02 &       0.0

## NAICS and PC Samples

### 1. Summary of average similarity

In [98]:
# Build the similarity summary table `rt`: for the NAICS and primary-class
# samples, compare mean "sim_docvecs" between rows with inv_msa_match == True
# and rows with inv_msa_match == False, by year group, with a Welch t-test
# p-value and the row count.
y = "sim_docvecs"
# Raw strings: "\," is LaTeX thin-space markup, not a Python escape sequence
# (the non-raw form raises an invalid-escape warning).
within_lbl = r"Within Cluster, $I(MSA\,Match)=1$"
across_lbl = r"Across Cluster, $I(MSA\,Match)=0$"
ind = ["Type", within_lbl, across_lbl, "Ratio", "$p$-value", "$N$"]

# Collect one block per sample and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copies the frame each time.
blocks = []
for k in ["naics_name", "primclass"]:
    v = fastparquet.ParquetFile("DataStore/2018-10/Reg1016/{0}_all_1016.parq".format(k)).to_pandas()
    v2 = v[[y, "year_group", "inv_msa_match"]].groupby(["year_group", "inv_msa_match"]).mean().reset_index()

    # Raw similarity values per year group, split by MSA match.
    # `== True` / `== False` is kept on purpose so missing flags fall in neither group.
    is_within = v["inv_msa_match"] == True
    is_across = v["inv_msa_match"] == False
    t_pct = {g: s.values for g, s in v.loc[is_within].groupby("year_group")[y]}
    c_pct = {g: s.values for g, s in v.loc[is_across].groupby("year_group")[y]}

    tc = v2.loc[v2["inv_msa_match"] == True, y].values
    cc = v2.loc[v2["inv_msa_match"] == False, y].values
    yg = v2.loc[v2["inv_msa_match"] == True, "year_group"].values

    # Key the p-values and counts on `yg` so every column of the table is
    # guaranteed to describe the same year group.
    p = [sp.stats.ttest_ind(t_pct[g], c_pct[g], equal_var=False, nan_policy="omit")[1] for g in yg]
    n = v.groupby("year_group").size().reindex(yg).values

    r = pd.DataFrame({"Year Group": yg,
                      within_lbl: tc,
                      across_lbl: cc,
                      "$p$-value": p, "$N$": n})

    # All year groups (including 2005-15) are kept for these samples.
    r = r.set_index("Year Group")

    r["Ratio"] = r[within_lbl] / r[across_lbl]
    r = r.round(3)
    r["Type"] = k

    # Rows become the statistics, columns the year groups.
    r = r[ind].T
    r.loc["$N$"] = r.loc["$N$"].astype(int).astype(str)
    blocks.append(r)

rt = pd.concat(blocks)

In [100]:
# One left-aligned column for the row labels plus one centred column per year
# group; derived from the table so the spec always matches its width.
print(rt.to_latex(index=True, escape=False, column_format="l" + "c" * rt.shape[1]))

\begin{tabular}{lcccc}
\toprule
Year Group &     1975-85 &     1985-95 &     1995-05 &     2005-15 \\
\midrule
Type                              &  naics_name &  naics_name &  naics_name &  naics_name \\
Within Cluster, $I(MSA\,Match)=1$ &       0.127 &       0.127 &       0.133 &       0.145 \\
Across Cluster, $I(MSA\,Match)=0$ &       0.124 &        0.12 &       0.125 &       0.137 \\
Ratio                             &        1.02 &       1.059 &       1.062 &       1.056 \\
$p$-value                         &       0.001 &           0 &           0 &           0 \\
$N$                               &      194131 &      282112 &      443885 &      578056 \\
Type                              &   primclass &   primclass &   primclass &   primclass \\
Within Cluster, $I(MSA\,Match)=1$ &       0.197 &       0.193 &       0.195 &       0.197 \\
Across Cluster, $I(MSA\,Match)=0$ &        0.19 &       0.182 &       0.184 &       0.188 \\
Ratio                             &       1.036 &   

### 2. Summary including variance
- Reference for testing whether two variances differ significantly (F-test): https://stackoverflow.com/questions/21494141/how-do-i-do-a-f-test-in-python

In [115]:
# Build the extended similarity summary `rt`: per sample and year group,
# report mean, standard deviation and count of "sim_docvecs" for rows with
# inv_msa_match == True vs. == False, plus p-values for equal means (Welch
# t-test) and equal variances (F-test and Levene's test).
y = "sim_docvecs"
# Raw strings: "\," is LaTeX thin-space markup, not a Python escape sequence
# (the non-raw form raises an invalid-escape warning).
within_mean = r"Within Cluster, $I(MSA\,Match)=1$"
within_sd = "Within Cluster, S.D."
within_n = "Within Cluster, $N$"
across_mean = r"Across Clusters, $I(MSA\,Match)=0$"
across_sd = "Across Clusters, S.D."
across_n = "Across Clusters, $N$"
p_means = "$p$-value, Equal Means"
p_var_f = "$p$-value, Equal Var (F-test)"
p_var_levene = "$p$-value, Equal Var (Levene)"

ind = ["Type", within_mean, within_sd, within_n,
       across_mean, across_sd, across_n, "Ratio",
       p_means,
       p_var_f, p_var_levene]

# Collect one block per sample and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copies the frame each time.
blocks = []
for k in ["naics_name", "primclass"]:
    v = fastparquet.ParquetFile("DataStore/2018-10/Reg1016/{0}_all_1016.parq".format(k)).to_pandas()
    # Drop rows without a similarity score so the variance tests see no NaN.
    v = v.dropna(subset=[y])
    by_group = v[[y, "year_group", "inv_msa_match"]].groupby(["year_group", "inv_msa_match"])
    v2 = by_group.mean().reset_index()
    v3 = by_group.std().reset_index()

    # Raw similarity values per year group, split by MSA match.
    # `== True` / `== False` is kept on purpose so missing flags fall in neither group.
    is_within = v["inv_msa_match"] == True
    is_across = v["inv_msa_match"] == False
    t_pct = {g: s.values for g, s in v.loc[is_within].groupby("year_group")[y]}
    c_pct = {g: s.values for g, s in v.loc[is_across].groupby("year_group")[y]}

    tc = v2.loc[v2["inv_msa_match"] == True, y].values
    cc = v2.loc[v2["inv_msa_match"] == False, y].values

    tsd = v3.loc[v3["inv_msa_match"] == True, y].values
    csd = v3.loc[v3["inv_msa_match"] == False, y].values

    yg = v2.loc[v2["inv_msa_match"] == True, "year_group"].values

    # Every per-group statistic below is keyed on `yg`, so all columns of the
    # table are guaranteed to describe the same year group.
    # Equal means
    p = [sp.stats.ttest_ind(t_pct[g], c_pct[g], equal_var=False, nan_policy="omit")[1] for g in yg]
    # Equal variance: F
    # Sample variances (ddof=1) match the n-1 degrees of freedom passed to the
    # F distribution and the S.D. rows, which pandas also computes with ddof=1.
    # NOTE(review): sf() is the upper tail only, i.e. a one-sided test of
    # "within variance > across variance" -- confirm that is the intended test.
    F = [sp.stats.f.sf(np.var(t_pct[g], ddof=1) / np.var(c_pct[g], ddof=1),
                       len(t_pct[g]) - 1, len(c_pct[g]) - 1) for g in yg]
    # Equal variance: Levene
    L = [sp.stats.levene(t_pct[g], c_pct[g])[1] for g in yg]

    # Population size
    t_n = [len(t_pct[g]) for g in yg]
    c_n = [len(c_pct[g]) for g in yg]

    r = pd.DataFrame({"Year Group": yg,
                      within_mean: tc,
                      within_sd: tsd,
                      within_n: t_n,
                      across_mean: cc,
                      across_sd: csd,
                      across_n: c_n,
                      p_means: p,
                      p_var_f: F,
                      p_var_levene: L,
                      })

    # All year groups (including 2005-15) are kept for these samples.
    r = r.set_index("Year Group")

    r["Ratio"] = r[within_mean] / r[across_mean]
    r = r.round(3)
    r["Type"] = k

    # Rows become the statistics, columns the year groups.
    r = r[ind].T
    r.loc[across_n] = r.loc[across_n].astype(int).astype(str)
    r.loc[within_n] = r.loc[within_n].astype(int).astype(str)
    blocks.append(r)

rt = pd.concat(blocks)

In [116]:
# One left-aligned column for the row labels plus one centred column per year
# group; derived from the table so the spec always matches its width.
print(rt.to_latex(index=True, escape=False, column_format="l" + "c" * rt.shape[1]))

\begin{tabular}{lcccc}
\toprule
Year Group &     1975-85 &     1985-95 &     1995-05 &     2005-15 \\
\midrule
Type                               &  naics_name &  naics_name &  naics_name &  naics_name \\
Within Cluster, $I(MSA\,Match)=1$  &       0.127 &       0.127 &       0.133 &       0.145 \\
Within Cluster, S.D.               &       0.137 &       0.135 &       0.135 &       0.137 \\
Within Cluster, $N$                &       45804 &       66749 &      106800 &      143309 \\
Across Clusters, $I(MSA\,Match)=0$ &       0.124 &        0.12 &       0.125 &       0.137 \\
Across Clusters, S.D.              &       0.136 &       0.133 &       0.133 &       0.135 \\
Across Clusters, $N$               &      147037 &      214473 &      330885 &      425943 \\
Ratio                              &        1.02 &       1.059 &       1.062 &       1.056 \\
$p$-value, Equal Means             &       0.001 &           0 &           0 &           0 \\
$p$-value, Equal Var (F-test)      &       

In [117]:
# Read the full NAICS sample back into `v` for ad-hoc inspection.
naics_parquet = fastparquet.ParquetFile("DataStore/2018-10/Reg1016/naics_name_all_1016.parq")
v = naics_parquet.to_pandas()

In [118]:
# Display the column names of `v`.
v.columns

Index(['tp', 'op', 'sim_docvecs', 'sim_ldavecs', 'tp_gyear', 'tp_naics_name',
       'op_naics_name', 'op_primclass', 'op_inv_msa', 'inv_msa_match',
       'primclass_match', 'norm_sim_ldavecs', 'norm_sim_docvecs', 'year_group',
       'common_est_inv', 'common_pat_inv', 'lawyer_match', 'num_common_cited',
       'norm_num_common_cited', 'tp_pct_common_cited',
       'norm_tp_pct_common_cited', 'common_cited_match', 'common_npc_match',
       'mean_sim_docvecs_pc', 'mean_sim_ldavecs_pc', 'mean_sim_docvecs_pc_msa',
       'mean_sim_ldavecs_pc_msa', 'norm_mean_sim_docvecs_pc',
       'norm_mean_sim_ldavecs_pc', 'norm_mean_sim_docvecs_pc_msa',
       'norm_mean_sim_ldavecs_pc_msa', 'sd_sim_docvecs_pc',
       'sd_sim_ldavecs_pc', 'sd_sim_docvecs_pc_msa', 'sd_sim_ldavecs_pc_msa',
       'pc_msa_greater_0', 'pc_msa_less_0', 'num_common_npc',
       'norm_num_common_npc', 'tp_primclass_FE', 'tp_inv_msa_FE',
       'tp_examiner_FE', 'tp_lawyer_FE', 'examiner_match'],
      dtype='object')