In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask.dataframe as dd
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Distances
import scipy.spatial.distance as distance
# KL
from scipy.stats import entropy
# Normalize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances
import h5py

  from ._conv import register_converters as _register_converters


Code taken from:
- [`201801KnowledgeSpilloversRep/Replication-1-JTH.ipynb`](https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201801KnowledgeSpilloversRep/Replication-1-JTH.ipynb)
- [`201801KnowledgeSpilloversRep/Replication-2-ControlSelection.ipynb`](https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201801KnowledgeSpilloversRep/Replication-2-ControlSelection.ipynb)
- [`201801KnowledgeSpilloversRep/Replication-3-CitationSimilarity.ipynb`](https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201801KnowledgeSpilloversRep/Replication-3-CitationSimilarity.ipynb)
- [`201801KnowledgeSpilloversRep/Replication-4-SummaryTables.ipynb`](https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201801KnowledgeSpilloversRep/Replication-4-SummaryTables.ipynb)

In [2]:
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas()
print(len(pdf))
pat_dup = pd.read_pickle("RawData/Cleaned/duplicate_pattext_0712.pkl")
pdf = pdf.loc[~(pdf["patent"].isin(pat_dup))]
print(len(pdf))
# Check I have abstracts
pa = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"]
pdf = pdf.loc[pdf["patent"].isin(pa)]
print(len(pdf))

# Only use patents that have been cited
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")
cit = cit["cited"]

pdf = pdf.loc[pdf["patent"].isin(cit.compute())]
print(len(pdf))
del(cit)

2523739
2220706
2002788
1726997


### Sampling procedure

1. Group patents by primary class; get index
2. Get slice of patents' indices
3. Get slide of patents' indices+1

Removal rules
- Remove pairs from different grant years
- Remove pairs from same location
- Remove pairs from same assignee

In [3]:
# No missing
pdf = pdf.loc[pdf["primclass"].notnull()]
# Exclude primary classes with less than 100 patents
vc = pdf["primclass"].value_counts()
vc = vc[vc > 100].index.tolist()
pdf = pdf.loc[pdf["primclass"].isin(vc)]
print(len(pdf))

1707316


In [4]:
%%time
pg = {n: np.array(g["patent"].tolist()) for n,g in pdf[["patent", "primclass"]].groupby("primclass")}
pgi = {n: np.array(range(len(g))) for n,g in pg.items()}

CPU times: user 826 ms, sys: 21.5 ms, total: 848 ms
Wall time: 841 ms


In [5]:
%%time

def samp_ind(g):
    
    if len(g)>10000:
        size = int(np.round(0.60*len(g)))
        t = np.random.choice(g, size = size, replace = False)
    else:
        size = int(np.round(0.8*len(g)))
        t = np.random.choice(g, size = size, replace = False)
    
    # Remove any indices greater than the length of the entire array
    t = t[np.where(t<len(g)-2)]
    
    c = t+1
    return t,c

# Sample
ts = pd.DataFrame()
for n,g in pgi.items():
    s = pd.DataFrame()
    tpi, cpi = samp_ind(g)
    s["tp"] = pg[n][tpi]
    s["cp"] = pg[n][cpi]
    s["tp_primclass"] = n
    ts = ts.append(s, ignore_index = True)

CPU times: user 1.97 s, sys: 0 ns, total: 1.97 s
Wall time: 1.96 s


In [6]:
# Grant year and MSA
ts = ts.merge(pdf[["gyear", "inv_msa", "patent"]].add_prefix("tp_"), how = "left", left_on = "tp",
             right_on = "tp_patent").drop("tp_patent",1)
ts = ts.merge(pdf[["gyear", "inv_msa", "patent"]].add_prefix("cp_"), how = "left", left_on = "cp",
             right_on = "cp_patent").drop("cp_patent",1)

# Remove non-matching grant year
print(len(ts))
ts = ts.loc[ts["tp_gyear"] == ts["cp_gyear"]]
print(len(ts))

# Remove matching MSA
ts = ts.loc[ts["tp_inv_msa"] != ts["cp_inv_msa"]]
print(len(ts))

1220680
1208642
1133831


In [7]:
# %%time
# asgs = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_0628.parq").to_pandas(["patent", "assignee_id"])
# asgs = {n: g["assignee_id"].tolist() for n,g in asgs.groupby("patent")}

# # Save assignee dictionary
# pickle.dump(asgs, open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "wb"))

In [8]:
%%time
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))

# Check that target and control are in assignee list
%time asg_match = (set(asgs.get(tp, [])).intersection(asgs.get(cp, [])) for tp, cp in zip(ts["tp"], ts["cp"]))
%time asg_match = [len(i) for i in asg_match]
del(asgs)

ts["asg_match"] = asg_match
ts = ts.loc[ts["asg_match"] == 0]
ts = ts.drop("asg_match",1)
print(len(ts))

CPU times: user 33.3 ms, sys: 29.4 ms, total: 62.8 ms
Wall time: 62.5 ms
CPU times: user 2.49 s, sys: 2.34 ms, total: 2.49 s
Wall time: 2.47 s
1110720
CPU times: user 16.2 s, sys: 2.06 s, total: 18.2 s
Wall time: 18.1 s


In [9]:
ts.head()

Unnamed: 0,tp,cp,tp_primclass,tp_gyear,tp_inv_msa,cp_gyear,cp_inv_msa
0,6253206,6253207,1.0,2001,"Santa Barbara-Santa Maria-Goleta, CA",2001,"New York-Northern New Jersey-Long Island, NY-N..."
1,7580950,7580951,1.0,2009,"San Jose-Sunnyvale-Santa Clara, CA",2009,"Seattle-Tacoma-Bellevue, WA"
2,6944615,6944616,1.0,2005,"San Jose-Sunnyvale-Santa Clara, CA",2005,"Austin-Round Rock-San Marcos, TX"
3,5241648,5241671,1.0,1993,"San Jose-Sunnyvale-Santa Clara, CA",1993,"San Diego-Carlsbad-San Marcos, CA"
4,7254590,7254597,1.0,2007,"San Francisco-Oakland-Fremont, CA",2007,"Boston-Cambridge-Quincy, MA-NH"


In [10]:
ts.to_pickle("DataStore/2018-07-P2/targ_samp_0717.pkl")

______
## Citations from Sample

1. Get citations by each target and control patent
2. Remove self-citations
3. Group by target and control
4. Get percentage match to *tp_inv_msa*
5. Get list of *tp*'s citations that match *tp_inv_msa*

In [11]:
ts = pd.read_pickle("DataStore/2018-07-P2/targ_samp_0717.pkl")
tpats = list(ts["tp"])+list(ts["cp"])

In [12]:
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")
cit = cit[cit["cited"].isin(tpats)]

# Convert back to pandas
cit = cit.compute()
print(len(cit))

30416671


In [13]:
%%time
# Remove self citations
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))
%time asg_match = (set(asgs.get(cited, [])).intersection(asgs.get(citing, [])) for cited, citing \
                   in zip(cit["cited"], cit["citing"]))
%time asg_match = [len(i) for i in asg_match]
del(asgs)

cit["asg_match"] = asg_match
cit = cit.loc[cit["asg_match"] == 0]
cit = cit[["citing", "cited"]]
print(len(cit))

CPU times: user 788 ms, sys: 784 ms, total: 1.57 s
Wall time: 1.56 s
CPU times: user 1min 16s, sys: 0 ns, total: 1min 16s
Wall time: 1min 16s
27856524
CPU times: user 1min 32s, sys: 3.34 s, total: 1min 35s
Wall time: 1min 35s


In [14]:
# Add tp to each citation
tdict = dict(zip(ts["tp"], ts["tp"]))
tdict.update(dict(zip(ts["cp"], ts["tp"])))
cit["tp"] = cit["cited"].map(tdict)
del(tdict)

In [15]:
# Merge gyear & location data
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(["patent", "inv_msa", "gyear"])
cit = cit.merge(pdf.add_prefix("citing_"), left_on = "citing", right_on = "citing_patent").drop("citing_patent",1)
# Cited patent's grant year
cit = cit.merge(pdf[["patent", "gyear"]].add_prefix("cited_"), left_on = "cited", right_on = "cited_patent").drop("cited_patent",1)
# Target patent's location
cit = cit.merge(pdf[["patent", "inv_msa"]].add_prefix("tp_"), left_on = "tp", right_on = "tp_patent").drop("tp_patent",1)
del(pdf)

# Citation occured within 10 years of cited patent's grant date
cit["year_diff"] = cit["citing_gyear"]-cit["cited_gyear"]
print(len(cit))

# Citing inv msa matches tp inv msa
cit["inv_msa_match"] = (cit["tp_inv_msa"] == cit["citing_inv_msa"])

# Is cited patent target
cit["is_tp"] = (cit["tp"] == cit["cited"])

18302287


In [16]:
cit.head()

Unnamed: 0,citing,cited,tp,citing_inv_msa,citing_gyear,cited_gyear,tp_inv_msa,year_diff,inv_msa_match,is_tp
0,9643605,5471515,5471515,"Detroit-Warren-Livonia, MI",2017,1995,"Los Angeles-Long Beach-Santa Ana, CA",22,False,True
1,8977439,5471515,5471515,"Grand Rapids-Wyoming, MI",2015,1995,"Los Angeles-Long Beach-Santa Ana, CA",20,False,True
2,9509957,5471515,5471515,"Detroit-Warren-Livonia, MI",2016,1995,"Los Angeles-Long Beach-Santa Ana, CA",21,False,True
3,8593521,5471515,5471515,"Holland-Grand Haven, MI",2013,1995,"Los Angeles-Long Beach-Santa Ana, CA",18,False,True
4,7719580,5471515,5471515,"Syracuse, NY",2010,1995,"Los Angeles-Long Beach-Santa Ana, CA",15,False,True


In [17]:
fastparquet.write("DataStore/2018-07-P2/tp_cp_cites_0717.parq", cit, compression="GZIP")

### Summarize info by each target patent

- If tp_cite_match is missing then target patent did not receive any citations; remove from sample

In [18]:
# cit = fastparquet.ParquetFile("DataStore/2018-07-P2/tp_cp_cites_0717.parq").to_pandas()
ts = pd.read_pickle("DataStore/2018-07-P2/targ_samp_0717.pkl")

In [19]:
# Remove non-cited patents from sample: some may have vanished from self-citation cleaning
print(len(ts))

tp_match_5 = cit.loc[(cit["is_tp"] == True) & (cit["year_diff"] <= 5), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()
cp_match_5 = cit.loc[(cit["is_tp"] == False) & (cit["year_diff"] <= 5), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()

tp_match_10 = cit.loc[(cit["is_tp"] == True) & (cit["year_diff"] <= 10), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()
cp_match_10 = cit.loc[(cit["is_tp"] == False) & (cit["year_diff"] <= 10), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()

ts["tp_match_5"] = ts["tp"].map(pd.Series(tp_match_5["inv_msa_match"]))
ts["cp_match_5"] = ts["tp"].map(pd.Series(cp_match_5["inv_msa_match"]))

ts["tp_match_10"] = ts["tp"].map(pd.Series(tp_match_10["inv_msa_match"]))
ts["cp_match_10"] = ts["tp"].map(pd.Series(cp_match_10["inv_msa_match"]))

# Drop all values that have all 4 columns missing; some self-citation removal might have made this empty
ts = ts.dropna(subset=["tp_match_5", "cp_match_5", "tp_match_10", "cp_match_10"], how="all")
print(len(ts))

1110720
915808


In [20]:
ts.head()

Unnamed: 0,tp,cp,tp_primclass,tp_gyear,tp_inv_msa,cp_gyear,cp_inv_msa,tp_match_5,cp_match_5,tp_match_10,cp_match_10
0,6253206,6253207,1.0,2001,"Santa Barbara-Santa Maria-Goleta, CA",2001,"New York-Northern New Jersey-Long Island, NY-N...",,0.0,,0.0
1,7580950,7580951,1.0,2009,"San Jose-Sunnyvale-Santa Clara, CA",2009,"Seattle-Tacoma-Bellevue, WA",,,,1.0
2,6944615,6944616,1.0,2005,"San Jose-Sunnyvale-Santa Clara, CA",2005,"Austin-Round Rock-San Marcos, TX",,0.133333,,0.097561
3,5241648,5241671,1.0,1993,"San Jose-Sunnyvale-Santa Clara, CA",1993,"San Diego-Carlsbad-San Marcos, CA",0.2,0.064516,0.285714,0.151261
4,7254590,7254597,1.0,2007,"San Francisco-Oakland-Fremont, CA",2007,"Boston-Cambridge-Quincy, MA-NH",,0.0,,0.0


In [21]:
ts[["tp_gyear", "tp_match_5", "cp_match_5", "tp_match_10", "cp_match_10"]].groupby("tp_gyear").mean()

Unnamed: 0_level_0,tp_match_5,cp_match_5,tp_match_10,cp_match_10
tp_gyear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1976,0.107614,0.035056,0.098364,0.032596
1977,0.108696,0.035398,0.099756,0.034538
1978,0.094943,0.033199,0.082043,0.032428
1979,0.098548,0.030804,0.093542,0.030261
1980,0.107919,0.030878,0.095661,0.030389
1981,0.096908,0.031666,0.083445,0.030548
1982,0.095219,0.029673,0.08878,0.029239
1983,0.097666,0.031011,0.085513,0.029794
1984,0.106089,0.030467,0.090663,0.029108
1985,0.109765,0.030212,0.096732,0.029283


In [22]:
ts.to_pickle("DataStore/2018-07-P2/targ_samp_0717.pkl")

### Add target's citing patents

- Remove rows where there are no target citations

In [23]:
cit = fastparquet.ParquetFile("DataStore/2018-07-P2/tp_cp_cites_0717.parq").to_pandas()

In [24]:
cit = cit.loc[cit["is_tp"] == True]

for yd in [5,10]:
    c2 = cit.loc[cit["year_diff"] <= yd]

    # In same MSA
    sm = {n: g["citing"].tolist() for n,g in c2.loc[c2["inv_msa_match"] == True, ["tp", "citing"]].groupby(["tp"])}
    dm = {n: g["citing"].tolist() for n,g in c2.loc[c2["inv_msa_match"] == False, ["tp", "citing"]].groupby(["tp"])}

    ts["cite_msa_match_{0}".format(yd)] = ts["tp"].map(sm)
    ts["cite_msa_diff_{0}".format(yd)] = ts["tp"].map(dm)
    
print(len(ts))

# Remove rows where there are no target citations
ts = ts.dropna(subset=['cite_msa_match_5', 'cite_msa_diff_5', 'cite_msa_match_10',
       'cite_msa_diff_10'], how="all")
print(len(ts))

915808
282079


In [25]:
ts.head()

Unnamed: 0,tp,cp,tp_primclass,tp_gyear,tp_inv_msa,cp_gyear,cp_inv_msa,tp_match_5,cp_match_5,tp_match_10,cp_match_10,cite_msa_match_5,cite_msa_diff_5,cite_msa_match_10,cite_msa_diff_10
3,5241648,5241671,1.0,1993,"San Jose-Sunnyvale-Santa Clara, CA",1993,"San Diego-Carlsbad-San Marcos, CA",0.2,0.064516,0.285714,0.151261,[5448727],"[5666526, 5537589, 5664172, 5594898]","[6505189, 6374232, 5448727, 6167399]","[6643636, 6640221, 5983215, 5666526, 6064999, ..."
5,6665662,6665663,1.0,2003,"San Francisco-Oakland-Fremont, CA",2003,"San Jose-Sunnyvale-Santa Clara, CA",0.0,0.5,0.145833,0.6,,"[7131069, 7243095, 7376641, 7284235, 7441007, ...","[8560534, 7752159, 7756810, 8271794, 8495002, ...","[8205242, 7831559, 7131069, 8307007, 7243095, ..."
9,5797117,5797128,1.0,1998,"Los Angeles-Long Beach-Santa Ana, CA",1998,"Colorado Springs, CO",0.0,0.0,0.0,0.025641,,"[6092073, 5897633, 6654879, 5903895, 5987253]",,"[6092073, 5897633, 6654879, 5903895, 5987253]"
11,7502777,7502779,1.0,2009,"San Jose-Sunnyvale-Santa Clara, CA",2009,"Austin-Round Rock-San Marcos, TX",0.0,0.0,0.0,0.0,,"[7784033, 7921084]",,"[7784033, 7921084]"
15,7599939,7599941,1.0,2009,"San Jose-Sunnyvale-Santa Clara, CA",2009,"Los Angeles-Long Beach-Santa Ana, CA",0.166667,0.15,0.25,0.103448,[8903836],"[8578393, 8079081, 8156553, 8407335, 8719232]","[8903836, 9298691]","[8578393, 8079081, 9657567, 8156553, 8407335, ..."


In [26]:
ts.to_pickle("DataStore/2018-07-P2/targ_samp_0717.pkl")