In [3]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import dask.dataframe as dd
import dask.array as da
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Distances
import scipy.spatial.distance as distance
# KL
from scipy.stats import entropy
# Normalize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances
import h5py

Code taken from: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/Results/ExogSpillovers/2a-UniversityPatents-v2.ipynb

In [4]:
# Read the cleaned patent table from parquet into a pandas DataFrame
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas()
print(len(pdf))
# Keep only the first row seen for each patent number
pdf = pdf.drop_duplicates(subset=["patent"])
# Discard every patent listed in the duplicate-text pickle
dup_pats = pd.read_pickle("RawData/Cleaned/duplicate_pattext_0712.pkl")
pdf = pdf[~pdf["patent"].isin(dup_pats)]
print(len(pdf))

def get_year_group(x):
    """Map a grant year to the label of its five-year window.

    Each window covers its start year and the four following years, e.g.
    1975 through 1979 all map to "1975-80". Values that fall in none of
    the windows (earlier than 1975, 2015 or later, non-integral values,
    NaN) yield ``np.nan``.

    Parameters
    ----------
    x : number
        Grant year; compared against the whole years of each window.

    Returns
    -------
    str or float
        The window label, or ``np.nan`` when no window contains ``x``.
    """
    windows = (
        (1975, "1975-80"),
        (1980, "1980-85"),
        (1985, "1985-90"),
        (1990, "1990-95"),
        (1995, "1995-00"),
        (2000, "2000-05"),
        (2005, "2005-10"),
        (2010, "2010-15"),
    )
    for first_year, label in windows:
        # Membership in a range keeps the upper bound exclusive
        if x in range(first_year, first_year + 5):
            return label
    return np.nan

# Tag every patent with the label of its five-year grant window
pdf["year_group"] = pdf["gyear"].apply(get_year_group)
# Drop rows that are missing any of the fields listed below
pdf = pdf.dropna(subset=["gyear", "naics_name", "primclass", "year_group"], how="any")
print(len(pdf))

# Keep only locations (inv_msa) with more than 200 patents
vc = pdf["inv_msa"].value_counts()
print(len(vc))
vc = vc.loc[vc.gt(200)].index.tolist()
print(len(vc))
pdf = pdf[pdf["inv_msa"].isin(vc)]
print(len(pdf))

# Only use patents granted 1976 through 2014 (the range's upper bound is exclusive)
pdf = pdf[pdf["gyear"].isin(range(1976, 2015))]

2523739
2220706
1918565
18719
369
1756166


In [5]:
# Target patents: a 30% random sample of the filtered patent table, seeded for reproducibility
targ = pdf.sample(frac=0.3, random_state = seed)

### In this sample: draw fewer comparison patents from the target's own MSA (5 per target) and more from the same field and year group across all MSAs (15 per target)

In [7]:
def get_sample(key, d, num, rng=None):
    """Draw ``num`` items, with replacement, from the pool stored at ``d[key]``.

    A group that is absent from ``d`` (or whose pool is empty) is an expected
    case and yields ``num`` NaN placeholders, which the caller can drop later.
    Any other failure (for example ``d`` not being a mapping, or an unhashable
    key) is allowed to propagate instead of being silently turned into an
    all-NaN sample.

    Parameters
    ----------
    key : hashable
        Group identifier to look up in ``d``.
    d : dict
        Maps group identifiers to the sequence of candidates for that group.
    num : int
        Number of draws to make.
    rng : optional
        Object exposing ``choice(a, size=..., replace=...)``, e.g. a
        ``numpy.random.RandomState``. Defaults to NumPy's global generator.

    Returns
    -------
    numpy.ndarray or list
        Array of ``num`` sampled items, or a list of ``num`` NaN values when
        there is nothing to sample from.
    """
    pool = d.get(key)
    if pool is None or len(pool) == 0:
        # No candidates for this group: return placeholders of matching length
        return [np.nan] * num
    chooser = np.random if rng is None else rng
    return chooser.choice(pool, size=num, replace=True)

In [8]:
k = "year_group"
m = "inv_msa"

for c in ["naics_name", "primclass"]:
    print(c)
    print(datetime.datetime.now())
    # Relevant columns
    cols = [c,k,m,"gyear", "patent"]
    # Exclude fields with less than 100 patents
    vc = pdf[c].value_counts()
    vc = vc[vc > 100].index.tolist()
    p_rel = pdf.loc[pdf[c].isin(vc), cols]
    
    num_in_msa = 5
    num_out_msa = 15
    # Each sample group
    targ_g = {n: g["patent"].tolist()*num_out_msa for n,g in targ[cols].groupby([c,k,m])} # Each group is from same field, MSA, year group
    targ_g_m = {n: g["patent"].tolist()*num_in_msa for n,g in targ[cols].groupby([c,k,m])}
    p_c_m = {n: g["patent"].tolist() for n,g in p_rel[cols].groupby([c,k,m])} # Each group is from same field, MSA, year group
    p_c = {n: g["patent"].tolist() for n,g in p_rel[cols].groupby([c,k])} # Each group from the same field, year group
    ts = pd.DataFrame()
    
    print("getting sample")
    print(datetime.datetime.now())
    # In MSA Sample
    op_in_msa = (get_sample(n, p_c_m, len(g)) for n,g in targ_g_m.items())
    op_in_msa = [item for sublist in op_in_msa for item in sublist]
    # Field Sample: Group names NOT the same as target!
    op_in_field = (get_sample((n[0],n[1]), p_c, len(g)) for n,g in targ_g.items())
    op_in_field = [item for sublist in op_in_field for item in sublist]
    # Target patents
    tp = [item for sublist in targ_g.values() for item in sublist]
    tp_m = [item for sublist in targ_g_m.values() for item in sublist]
    print(datetime.datetime.now())
    
    s1 = pd.DataFrame({"tp": tp_m, "op": op_in_msa})
    s1["samp"] = "In MSA"
    s2 = pd.DataFrame({"tp": tp, "op": op_in_field})
    s2["samp"] = "In Field"
    
    ts = s1.append(s2, ignore_index = True)
    print(len(ts))
    
    # Drop missing
    ts = ts.dropna(how="any")
    print(len(ts))
    
    # tp first
    ts = ts.loc[ts["tp"]<ts["op"]]
    print(len(ts))
    
    print("cleaning assignees")
    print(datetime.datetime.now())
    asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))
    # Check that target and other do not have same assignee
    %time asg_match = (set(asgs.get(tp, [])).intersection(asgs.get(op, [])) for tp, op in zip(ts["tp"], ts["op"]))
    %time asg_match = [len(i) for i in asg_match]

    ts["asg_match"] = asg_match
    ts = ts.loc[ts["asg_match"] == 0]
    ts = ts.drop("asg_match",1)
    print(len(ts))
    del(asgs)
    
    #Drop duplicates
    ts = ts.drop_duplicates()
    print(len(ts))
    
    # Add index otherwise it doesn't know how to sort
    ts = ts.reset_index()
    
    ts["index2"] = ts["index"]
    
    # Convert to dask dataframe
    ts = dd.from_pandas(ts, npartitions=1000)
    # Save to parquet
    dd.to_parquet(ts, path="DataStore/2018-07-P2/Reg0726/local_{0}_samp_0723.parq".format(c), compression="gzip")
    
    print("finished")
    print(datetime.datetime.now())
    

naics_name
2018-07-26 17:45:40.766773
getting sample
2018-07-26 17:46:03.251552
2018-07-26 17:46:28.497517
10537000
10536820
5230254
cleaning assignees
2018-07-26 17:46:34.063364
CPU times: user 140 ms, sys: 104 ms, total: 244 ms
Wall time: 242 ms
CPU times: user 9.22 s, sys: 21 ms, total: 9.24 s
Wall time: 9.16 s
5030388
4995523
finished
2018-07-26 17:47:57.941952
primclass
2018-07-26 17:47:57.942161
getting sample
2018-07-26 17:49:34.773994
2018-07-26 17:49:48.163366
10537000
10532740
5074573
cleaning assignees
2018-07-26 17:49:53.979071
CPU times: user 153 ms, sys: 103 ms, total: 256 ms
Wall time: 282 ms
CPU times: user 8.02 s, sys: 10.9 ms, total: 8.04 s
Wall time: 8.22 s
4718622
4580533
finished
2018-07-26 17:50:56.599692


In [None]:
# (Scratch cell cleared: it evaluated the bare name `d`, which is never defined
# at notebook scope and would raise NameError on Restart & Run All.)