In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import collections
import dask
import dask.dataframe as dd

  from ._conv import register_converters as _register_converters


Inventor mobility citations taken from https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/StrategicNonCitations/1c-InventorMobilityCitationsAnalysis-0904.ipynb

In [2]:
# Column-name lookup written by an earlier pipeline step. Opened via a context
# manager so the file handle is closed as soon as the pickle has been read.
# NOTE(review): pickle can execute arbitrary code on load -- only point this
# at files produced by this project.
with open("DataStore/2018-06/col_names_dict.pkl", "rb") as col_names_file:
    c_dict = pickle.load(col_names_file)

# patent -> inv_asg_rank lookup; `ip` ends up as a plain dict.
ip = pd.read_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")
ip = dict(zip(ip["patent"], ip["inv_asg_rank"]))

# Patent-pair table (tp / op columns hold patent ids), tagged with the
# assignee rank of each side. Patents absent from `ip` get NaN.
mdc = fastparquet.ParquetFile("DataStore/2018-08/inv_mob_local_cite_sim_0904.parq").to_pandas()
mdc["tp_asg_rank"] = mdc["tp"].map(ip)
mdc["op_asg_rank"] = mdc["op"].map(ip)

# TP pct of common cites: shared citations as a fraction of tp's citations.
# NOTE(review): rows with tp_num_cites == 0 yield NaN/inf here -- confirm
# that is acceptable downstream.
mdc["tp_pct_common_cites"] = mdc["num_common_cites"]/mdc["tp_num_cites"]

# Primary class match. The patent -> primclass lookup is built once and
# reused for both sides of the pair.
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(["patent", "primclass"])
primclass_by_patent = dict(zip(pdf["patent"], pdf["primclass"]))
mdc["tp_primclass"] = mdc["tp"].map(primclass_by_patent)
mdc["op_primclass"] = mdc["op"].map(primclass_by_patent)
# NaN never compares equal, so pairs with a missing class are marked False.
mdc["primclass_match"] = (mdc["tp_primclass"] == mdc["op_primclass"])

# Bins
import math

def sim_bin(x, a):
    """Snap ``x`` to the nearest multiple of the bin width ``a``.

    The result is rounded to the number of decimal places implied by ``a``
    (0.1 -> 1 place, 0.01 -> 2 places, 100 -> -2) so that floating-point
    noise such as 0.7000000000000001 does not create spurious bins.

    Parameters
    ----------
    x : number
        Value to bin.
    a : number
        Bin width; must be positive.

    Returns
    -------
    number
        The binned value, or ``np.nan`` when ``x`` or ``a`` cannot be binned
        (missing/NaN/infinite/non-numeric ``x``, zero or negative ``a``).
        Any other exception is propagated instead of being hidden.
    """
    try:
        # Decimal places needed to represent a multiple of `a` exactly.
        n_decimals = -int(math.floor(math.log10(a)))
        return round(round(x / a) * a, n_decimals)
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        # TypeError: non-numeric input (e.g. None); ValueError: NaN input or
        # log10 of a non-positive width; ZeroDivisionError: a == 0;
        # OverflowError: infinite input.
        return np.nan
    
# Bucket both similarity measures into bins of width 0.1; each result is
# stored next to its source column under a "bin_" prefix.
for sim_col in ("sim_ldavecs", "sim_docvecs"):
    mdc["bin_" + sim_col] = mdc[sim_col].apply(lambda v: sim_bin(v, 0.1))

# Year group
def get_year_group_10(x):
    """Return the ten-year bucket label for application year ``x``.

    Buckets start at 1975, 1985, 1995 and 2005 and each covers ten years, so
    the second year in a label is exclusive ("1975-85" covers 1975..1984).
    Years outside 1975..2014, non-integer values and NaN yield ``np.nan``.
    """
    decade_labels = (
        (range(1975, 1985), "1975-85"),
        (range(1985, 1995), "1985-95"),
        (range(1995, 2005), "1995-05"),
        (range(2005, 2015), "2005-15"),
    )
    for years, label in decade_labels:
        if x in years:
            return label
    return np.nan

# Label every pair with the decade bucket of the target patent's application year.
mdc["year_group"] = mdc["tp_appyear"].map(get_year_group_10)

In [3]:
# Keep only pairs where tp has assignee rank 0 and op has rank 0 or 1,
# i.e. first-to-first and first-to-second firm comparisons (same as 1b).
print(len(mdc))
op_rank = mdc["op_asg_rank"]
keep_pair = (mdc["tp_asg_rank"] == 0) & ((op_rank == 0) | (op_rank == 1))
i2 = mdc.loc[keep_pair].copy()
print(len(i2))

# Firm change: op at rank 0 is labelled "Before", anything else "After".
i2["firm_change"] = (i2["op_asg_rank"] == 0).map({True: "Before", False: "After"})

571068
329575


In [4]:
# Preview the first rows of the filtered pair table as a sanity check.
i2.head()

Unnamed: 0_level_0,op,tp,inventor_id,tp_asg,op_asg,asg_match,op_cites_tp,sim_ldavecs,sim_docvecs,tp_inv_msa,...,bin_sim_ldavecs,bin_sim_docvecs,tp_appyear,op_appyear,tp_pct_common_cites,tp_primclass,op_primclass,primclass_match,year_group,firm_change
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3930283,3930282,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.546727,0.519405,"Lancaster, PA",...,0.5,0.5,1974.0,1974.0,0.375,452.0,452.0,True,,Before
1,4041572,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.378731,0.250033,"Lancaster, PA",...,0.4,0.3,1974.0,1976.0,0.0,452.0,452.0,True,,Before
2,4016624,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.410078,0.497906,"Lancaster, PA",...,0.4,0.5,1974.0,1975.0,0.0,452.0,452.0,True,,Before
3,4102014,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,True,0.678831,0.264625,"Lancaster, PA",...,0.7,0.3,1974.0,1977.0,0.0,452.0,452.0,True,,Before
4,4477942,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.566891,0.438904,"Lancaster, PA",...,0.6,0.4,1974.0,1983.0,0.0,452.0,452.0,True,,Before


In [5]:
# List every column available on the filtered pair table.
i2.columns

Index(['op', 'tp', 'inventor_id', 'tp_asg', 'op_asg', 'asg_match',
       'op_cites_tp', 'sim_ldavecs', 'sim_docvecs', 'tp_inv_msa', 'op_inv_msa',
       'msa_change', 'tp_num_cites', 'op_num_cites', 'num_common_cites',
       'tp_num_inv_msa_cites', 'op_num_inv_msa_cites', 'ncc_tp_inv_msa',
       'ncc_other_msa', 'tp_pct_inv_msa_cites', 'op_pct_inv_msa_cites',
       'tp_asg_rank', 'op_asg_rank', 'bin_sim_ldavecs', 'bin_sim_docvecs',
       'tp_appyear', 'op_appyear', 'tp_pct_common_cites', 'tp_primclass',
       'op_primclass', 'primclass_match', 'year_group', 'firm_change'],
      dtype='object')

In [6]:
# Pairwise correlation between docvec similarity and the share of common citations.
i2[["sim_docvecs", "tp_pct_common_cites"]].corr()

Unnamed: 0,sim_docvecs,tp_pct_common_cites
sim_docvecs,1.0,0.382942
tp_pct_common_cites,0.382942,1.0


In [10]:
# Distribution of the common-citation share for pairs above the similarity
# cutoff, shown first for "After" and then for "Before" the firm change.
sl = 0.3
above_cutoff = i2["sim_docvecs"] > sl
for phase in ("After", "Before"):
    in_phase = i2["firm_change"] == phase
    display(i2.loc[in_phase & above_cutoff, "tp_pct_common_cites"].describe())

count    34260.000000
mean         0.099856
std          0.263909
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: tp_pct_common_cites, dtype: float64

count    91191.000000
mean         0.236587
std          0.377381
min          0.000000
25%          0.000000
50%          0.000000
75%          0.375000
max          1.000000
Name: tp_pct_common_cites, dtype: float64

________
### Example of an inventor moving and citing fewer of the same patents

In [11]:
# Similarity cutoff for the worked example. The masks must read `s1` (digit
# one), the name assigned here; reading `sl` (letter l) would silently reuse
# the 0.3 cutoff left over from the previous cell.
s1 = 0.5
# Pairs above the cutoff, split by whether op comes after or before the firm change.
ia = i2.loc[(i2["firm_change"] == "After") & (i2["sim_docvecs"] > s1)]
ib = i2.loc[(i2["firm_change"] == "Before") & (i2["sim_docvecs"] > s1)]