In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import dask
import dask.dataframe as dd
import dask.array as da
import os
# Later cells use relative paths (e.g. "DataStore/...", "RawData/..."); they are
# resolved against this project directory.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
# NOTE(review): presumably the random seed for stochastic steps — confirm where it is consumed
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
from sklearn.preprocessing import StandardScaler
# One StandardScaler instance shared by the cells below; every use calls
# fit_transform, which refits it on the column being standardised.
scaler = StandardScaler()

  from ._conv import register_converters as _register_converters


### File summaries
- `naics_name_sim_tr_0726.parq`, `primclass_sim_tr_0726.parq`: complete transformed similarities
- `reg_naics_name_sim_tr_0726.parq`, `reg_primclass_sim_tr_0726.parq`: sampled transformed similarities, merged with field-MSA similarities

### 1. Merging similarity with *invpat* data
- Getting data and cleaning out outliers

In [12]:
pathdir = "DataStore/2018-07-P2/Reg0726/"

cols = ["patent", "gyear", "inv_msa", "naics_name", "primclass"]
tp_cols = ["tp", "tp_gyear", "tp_inv_msa", "tp_naics_name", "tp_primclass"]
op_cols = ["op", "op_gyear", "op_inv_msa", "op_naics_name", "op_primclass"]

for k in ["naics_name", "primclass"]:
    # 1. Load similarities
    simf = "{0}_sim_0726.parq".format(k)
    %time sim = dd.read_parquet(pathdir+simf).compute()
    if "sim_ldavecs_x" in sim.columns:
        sim = sim.rename(columns={"sim_ldavecs_x":"sim_ldavecs"}).drop("sim_ldavecs_y",1)

    # MSA Match
    ip_tp = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
    .to_pandas(["patent", "gyear", "naics_name", "primclass", "inv_msa"]).rename(columns=dict(zip(cols, tp_cols)))
    ip_op = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
    .to_pandas(["patent", "gyear", "naics_name", "primclass", "inv_msa"]).rename(columns=dict(zip(cols, op_cols)))
    
    sim = sim.merge(ip_tp, how = 'left', on = 'tp')
    sim = sim.merge(ip_op, how = 'left', on = 'op')
    
    for c in ["inv_msa", "primclass"]:
        sim["{0}_match".format(c)] = (sim["tp_{0}".format(c)] == sim["op_{0}".format(c)])
        
    print(sim.columns)

    # Scale values
    # Pre scaled averages
    print(k)
    display(np.round(sim.describe(),3))

#     def scale_docvecs(x, dv_min, eps):
#         scaled = ((x-dv_min)/(1-dv_min))*(1-eps)+eps
#         return scaled
#     def scale_ldavecs(x, eps):
#         scaled = x*(1-eps)+eps
#         return scaled

    # 1. Normed values

    # Define similarity columns
    sim_cols = [c for c in sim.columns if "sim_" in c]

    for c in sim_cols:
        # Get normed values for all similarities
        sim["norm_{0}".format(c)] = np.nan
        sim.loc[sim[c].notnull(), "norm_{0}".format(c)] = \
        scaler.fit_transform(sim.loc[sim[c].notnull(), c].values.reshape(-1,1))

    # Prune values
    if "sim_docvecs" in sim_cols:
        dv_cols = [c for c in sim.columns if "docvecs" in c]
        
    for c in sim_cols:
        if "docvecs" in c:
            # Prune values below -4 sd
            sim.loc[(sim["norm_{0}".format(c)]< -4) | (sim["norm_{0}".format(c)] > 4), dv_cols] = np.nan 

    print("transformed")
    print(k)
    display(np.round(sim.describe(),3))
    o_f = "{0}_sim_tr_0726.parq".format(k)

    %time fastparquet.write(pathdir+o_f, sim, compression="GZIP")

CPU times: user 1.54 s, sys: 223 ms, total: 1.76 s
Wall time: 1.94 s
Index(['tp', 'op', 'sim_ldavecs', 'sim_docvecs', 'tp_gyear', 'tp_naics_name',
       'tp_primclass', 'tp_inv_msa', 'op_gyear', 'op_naics_name',
       'op_primclass', 'op_inv_msa', 'inv_msa_match', 'primclass_match'],
      dtype='object')
naics_name


Unnamed: 0,tp,sim_ldavecs,sim_docvecs,tp_gyear,tp_primclass,op_gyear,op_primclass
count,4993946.0,4943559.0,4943559.0,4993946.0,4993946.0,4993946.0,4993946.0
mean,6184892.518,0.241,0.131,1998.522,369.155,2000.086,370.918
std,1387740.639,0.222,0.137,10.865,195.396,10.892,196.605
min,3930283.0,0.001,-0.801,1976.0,1.0,1976.0,1.0
25%,4990923.0,0.061,0.039,1991.0,222.0,1993.0,224.0
50%,6124207.0,0.174,0.123,2000.0,370.0,2002.0,370.0
75%,7328211.0,0.368,0.214,2008.0,493.0,2009.0,502.0
max,8924916.0,1.0,0.939,2014.0,850.0,2014.0,902.0


transformed
naics_name


Unnamed: 0,tp,sim_ldavecs,sim_docvecs,tp_gyear,tp_primclass,op_gyear,op_primclass,norm_sim_ldavecs,norm_sim_docvecs
count,4993946.0,4943559.0,4936169.0,4993946.0,4993946.0,4993946.0,4993946.0,4943559.0,4936169.0
mean,6184892.518,0.241,0.131,1998.522,369.155,2000.086,370.918,-0.0,-0.006
std,1387740.639,0.222,0.135,10.865,195.396,10.892,196.605,1.0,0.986
min,3930283.0,0.001,-0.416,1976.0,1.0,1976.0,1.0,-1.085,-3.999
25%,4990923.0,0.061,0.039,1991.0,222.0,1993.0,224.0,-0.814,-0.675
50%,6124207.0,0.174,0.123,2000.0,370.0,2002.0,370.0,-0.305,-0.063
75%,7328211.0,0.368,0.213,2008.0,493.0,2009.0,502.0,0.569,0.597
max,8924916.0,1.0,0.679,2014.0,850.0,2014.0,902.0,3.419,4.0


CPU times: user 2min 44s, sys: 1.07 s, total: 2min 45s
Wall time: 2min 47s
CPU times: user 1.39 s, sys: 113 ms, total: 1.51 s
Wall time: 1.66 s
Index(['tp', 'op', 'sim_ldavecs', 'sim_docvecs', 'tp_gyear', 'tp_naics_name',
       'tp_primclass', 'tp_inv_msa', 'op_gyear', 'op_naics_name',
       'op_primclass', 'op_inv_msa', 'inv_msa_match', 'primclass_match'],
      dtype='object')
primclass


Unnamed: 0,tp,sim_ldavecs,sim_docvecs,tp_gyear,tp_primclass,op_gyear,op_primclass
count,4566111.0,4519543.0,4519543.0,4566111.0,4566111.0,4566111.0,4566111.0
mean,6211176.465,0.377,0.189,1998.733,373.565,2000.288,373.565
std,1386162.92,0.245,0.14,10.809,195.709,10.834,195.709
min,3930283.0,0.001,-0.824,1976.0,1.0,1976.0,1.0
25%,5011515.0,0.172,0.094,1991.0,235.0,1993.0,235.0
50%,6145287.0,0.347,0.183,2000.0,370.0,2002.0,370.0
75%,7370836.0,0.559,0.276,2008.0,506.0,2009.0,506.0
max,8925098.0,1.0,0.955,2014.0,850.0,2014.0,850.0


transformed
primclass


Unnamed: 0,tp,sim_ldavecs,sim_docvecs,tp_gyear,tp_primclass,op_gyear,op_primclass,norm_sim_ldavecs,norm_sim_docvecs
count,4566111.0,4519543.0,4514776.0,4566111.0,4566111.0,4566111.0,4566111.0,4519543.0,4514776.0
mean,6211176.465,0.377,0.188,1998.733,373.565,2000.288,373.565,-0.0,-0.003
std,1386162.92,0.245,0.139,10.809,195.709,10.834,195.709,1.0,0.99
min,3930283.0,0.001,-0.371,1976.0,1.0,1976.0,1.0,-1.537,-4.0
25%,5011515.0,0.172,0.094,1991.0,235.0,1993.0,235.0,-0.838,-0.674
50%,6145287.0,0.347,0.183,2000.0,370.0,2002.0,370.0,-0.125,-0.041
75%,7370836.0,0.559,0.276,2008.0,506.0,2009.0,506.0,0.744,0.622
max,8925098.0,1.0,0.748,2014.0,850.0,2014.0,850.0,2.542,4.0


CPU times: user 1min 48s, sys: 912 ms, total: 1min 49s
Wall time: 1min 51s


### 2. Using patent pair data $(i,j)$, merge with field-MSA similarities 

In [3]:
def get_year_group(x):
    """Return the five-year bucket label for year ``x``.

    Buckets start at 1975, 1980, ..., 2010 and each covers five consecutive
    years (e.g. 1975 through 1979 map to "1975-80"). Any value that does not
    equal a year in 1975..2014 yields ``np.nan``.
    """
    five_year_buckets = (
        (range(1975, 1980), "1975-80"),
        (range(1980, 1985), "1980-85"),
        (range(1985, 1990), "1985-90"),
        (range(1990, 1995), "1990-95"),
        (range(1995, 2000), "1995-00"),
        (range(2000, 2005), "2000-05"),
        (range(2005, 2010), "2005-10"),
        (range(2010, 2015), "2010-15"),
    )
    for years, label in five_year_buckets:
        if x in years:
            return label
    # No bucket matched
    return np.nan

def get_year_group_10(x):
    """Return the ten-year bucket label for year ``x``.

    Buckets start at 1975, 1985, 1995 and 2005 and each covers ten consecutive
    years (e.g. 1975 through 1984 map to "1975-85"). Any value that does not
    equal a year in 1975..2014 yields ``np.nan``.
    """
    ten_year_buckets = (
        (range(1975, 1985), "1975-85"),
        (range(1985, 1995), "1985-95"),
        (range(1995, 2005), "1995-05"),
        (range(2005, 2015), "2005-15"),
    )
    for years, label in ten_year_buckets:
        if x in years:
            return label
    # No bucket matched
    return np.nan

### 3. Groupby target patent

- *group_cols* are what each observation is grouped by
- *merge_cols* are the key columns used to merge in the field-MSA data. The field-MSA data was built from patent pairs, so it contains duplicate rows with the same key values; I drop those duplicates and then merge the result onto the original similarity data
- The similarity data is then grouped by the *group_cols* to get averages for each (target patent, OP MSA) pair

In [5]:
pathdir = "DataStore/2018-07-P2/Reg0726/"

files = ["naics_name_naics_msa_0726.pkl", "naics_name_pc_0726.pkl",
     "naics_name_pc_msa_0726.pkl","primclass_pc_msa_0726.pkl"]

sim_f = ["naics_name_sim_tr_0726.parq", "naics_name_sim_tr_0726.parq",
        "naics_name_sim_tr_0726.parq", "primclass_sim_tr_0726.parq"]

group_cols = [["tp", "op_inv_msa", "tp_gyear"], 
              ["tp", "op_inv_msa", "tp_primclass", "tp_gyear"],
             ["tp", "op_inv_msa", "tp_primclass", "tp_gyear"],
             ["tp", "op_inv_msa", "tp_primclass", "tp_gyear"]]

# Remember: this is all columns needed to index the similarities in Sim; so must merge on all
merge_cols = [["tp_inv_msa", "tp_naics_name", "op_inv_msa", "op_naics_name", "tp_gyear"],
              ["tp_primclass", "op_primclass", "tp_gyear"],
              ["tp_inv_msa", "tp_primclass", "op_inv_msa", "op_primclass", "tp_gyear"],
              ["tp_inv_msa", "tp_primclass", "op_inv_msa", "op_primclass", "tp_gyear"]]

sim_f = dict(zip(files, sim_f))
group_cols = dict(zip(files, group_cols))
merge_cols = dict(zip(files, merge_cols))

# for k in ["naics_name", "primclass"]:
for k in ["naics_name"]:
    sim_f = "{0}_sim_tr_0726.parq".format(k)
    sim = fastparquet.ParquetFile(pathdir+sim_f).to_pandas().drop("op_gyear",1)
    sim = sim.sample(frac=0.3)
    print(len(sim))
    
    if k == "naics_name":
        merge_files = ["naics_name_naics_msa_0726.pkl", "naics_name_pc_0726.pkl",
     "naics_name_pc_msa_0726.pkl"]
    else:
        merge_files = ["primclass_pc_msa_0726.pkl"]
    
    for f in merge_files:
        mf = pd.read_pickle(pathdir+f)
        # Drop extra keys
        mf = mf.drop_duplicates(merge_cols[f])
        # Only grab keys and similarity measures
        sim_cols = [c for c in mf.columns if "sim_" in c]
        mf = mf[sim_cols+merge_cols[f]]
        print(len(mf))
        sim = sim.merge(mf, how="left", on=merge_cols[f])
        print(len(sim))
    
    # Add year group
    sim["year_group"] = sim["tp_gyear"].apply(get_year_group_10)
#     sim["year_diff"] = sim["op_gyear"] - sim["tp_gyear"]
    
    
    # Normalize all similarities
    sim_cols = [c for c in sim.columns if "sim_" in c]
    for c in sim_cols:
        sim["norm_{0}".format(c)] = np.nan
        sim.loc[sim[c].notnull(), "norm_{0}".format(c)] = \
        scaler.fit_transform(sim.loc[sim[c].notnull(), c].values.reshape(-1,1))
    
    print(sim["sim_med_docvecs_pc"].describe())
    # Rename
    print(sim.columns)
    reg_f = "reg_"+sim_f
    %time fastparquet.write(pathdir+reg_f, sim, compression="GZIP")
     

1498184
1665937
1498184
382252
1498184
3950342
1498184
count    1.406769e+06
mean     6.849845e-01
std      1.912306e-01
min     -4.809314e-02
25%      5.368318e-01
50%      6.594408e-01
75%      8.307145e-01
max      1.000000e+00
Name: sim_med_docvecs_pc, dtype: float64
Index(['tp', 'op', 'sim_ldavecs', 'sim_docvecs', 'tp_gyear', 'tp_naics_name',
       'tp_primclass', 'tp_inv_msa', 'op_naics_name', 'op_primclass',
       'op_inv_msa', 'inv_msa_match', 'primclass_match', 'norm_sim_ldavecs',
       'norm_sim_docvecs', 'sim_med_ldavecs_naics_msa',
       'sim_mean_ldavecs_naics_msa', 'sim_med_docvecs_naics_msa',
       'sim_mean_docvecs_naics_msa', 'sim_med_ldavecs_pc',
       'sim_mean_ldavecs_pc', 'sim_med_docvecs_pc', 'sim_mean_docvecs_pc',
       'sim_med_ldavecs_pc_msa', 'sim_mean_ldavecs_pc_msa',
       'sim_med_docvecs_pc_msa', 'sim_mean_docvecs_pc_msa', 'year_group',
       'norm_norm_sim_ldavecs', 'norm_norm_sim_docvecs',
       'norm_sim_med_ldavecs_naics_msa', 'norm_sim_mean_

In [7]:
# Summary statistics of the merged column, rounded to two decimals
sim["sim_med_docvecs_pc"].describe().round(2)

count    1406769.00
mean           0.68
std            0.19
min           -0.05
25%            0.54
50%            0.66
75%            0.83
max            1.00
Name: sim_med_docvecs_pc, dtype: float64