In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import dask.dataframe as dd
import dask.array as da
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Distances
import scipy.spatial.distance as distance
# KL
from scipy.stats import entropy
# Normalize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances
import h5py

  from ._conv import register_converters as _register_converters


Code from:
- https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/LocalRegressions/2c-AddingControls-0808.ipynb
- https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/Reg1016/1-NewDataReg.ipynb

In [25]:
# Grouping key used to build the input and output file names in this notebook
k = "naics_name"
# Read the parquet dataset with dask and materialise it as an in-memory pandas frame
mdc = dd.read_parquet("DataStore/2018-07-P2/Reg0726/local_{0}_samp_0723.parq".format(k)).compute()
print(mdc.columns)
print(len(mdc))
# Name the axis explicitly: the positional form drop(labels, 1) was deprecated
# and later removed from pandas, while axis=1 works on every version.
mdc = mdc.drop(["index", "index2", "samp"], axis=1)

In [26]:
# Num common inventors
inv = fastparquet.ParquetFile("RawData/Cleaned/patent_inventors_0628.parq").to_pandas(["patent", "inventor_id"])
%time inv = {n: g["inventor_id"].tolist() for n, g in inv.groupby("patent")}

CPU times: user 19min 12s, sys: 12.9 s, total: 19min 24s
Wall time: 19min 16s


In [27]:
print(len(mdc))
%time num_common_inv = [len(set(inv[tp]).intersection(inv[op])) if (tp in inv.keys()) & (op in inv.keys())\
                  else np.nan for tp, op in zip(mdc["tp"], mdc["op"])]
mdc["num_common_pat_inv"] = num_common_inv
del(inv, num_common_inv)
mdc["common_pat_inv"] = np.nan
mdc.loc[mdc["num_common_pat_inv"] >= 1, "common_pat_inv"] = True
mdc.loc[mdc["num_common_pat_inv"] == 0, "common_pat_inv"] = False
print(mdc["num_common_pat_inv"].value_counts())
print(len(mdc))

4580533
CPU times: user 16.3 s, sys: 330 ms, total: 16.6 s
Wall time: 16.6 s
0     4565373
1       13071
2        1627
3         358
4          69
5          16
8           6
6           6
7           3
11          2
9           2
Name: num_common_pat_inv, dtype: int64
4580533


In [28]:
# Lawyers
ldf = pd.read_csv("RawData/Cleaned/patent_lawyer.csv")
%time ldf = {n:g["lawyer_id"].tolist() for n,g in ldf[["patent", "lawyer_id"]].groupby("patent")}

%time l_match = (set(ldf.get(tp, [])).intersection(set(ldf.get(op, []))) for tp, op in zip(mdc["tp"], mdc["op"]))
%time mdc["lawyer_match"] = [len(i) for i in l_match]
%time mdc["lawyer_match"] = mdc["lawyer_match"].apply(lambda x: True if x >= 1 else False)
print(mdc["lawyer_match"].value_counts())
print(len(mdc))
del(ldf)

CPU times: user 40min 15s, sys: 25.5 s, total: 40min 41s
Wall time: 40min 22s
CPU times: user 305 ms, sys: 236 ms, total: 541 ms
Wall time: 537 ms
CPU times: user 14.9 s, sys: 88.1 ms, total: 15 s
Wall time: 14.9 s
CPU times: user 1.37 s, sys: 1.31 ms, total: 1.37 s
Wall time: 1.36 s
False    4555458
True       25075
Name: lawyer_match, dtype: int64
4580533


In [31]:
# Examiners: flag (tp, op) rows whose two patents map to the same examiner_id
examiner_rows = fastparquet.ParquetFile("RawData/Cleaned/patexaminer1016.parq").to_pandas()
# `re` becomes a patent_id -> examiner_id lookup (a later duplicate patent_id overwrites an earlier one)
re = {patent: examiner for patent, examiner in zip(examiner_rows["patent_id"], examiner_rows["examiner_id"])}
del examiner_rows
# Patents missing from the lookup map to NaN, and NaN never compares equal,
# so rows with an unknown examiner on either side come out False.
mdc["examiner_match"] = mdc["tp"].map(re).eq(mdc["op"].map(re))

In [48]:
# Display the column index of `mdc` to check which derived columns are present
mdc.columns

Index(['op', 'tp', 'num_common_pat_inv', 'common_pat_inv', 'lawyer_match',
       'examiner_match', 'direct_cite', 'num_common_cites', 'num_common_npc',
       'common_npc_match', 'common_cited_match', 'tp_inv_msa', 'op_inv_msa',
       'inv_msa_match'],
      dtype='object')

In [47]:
# Location match: attach the inv_msa value of each patent and compare the two sides
msa_table = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(columns=["patent", "inv_msa"])
# patent -> inv_msa lookup; the frame itself is no longer needed afterwards
msa_by_patent = {patent: msa for patent, msa in zip(msa_table["patent"], msa_table["inv_msa"])}
del msa_table
mdc["tp_inv_msa"] = mdc["tp"].map(msa_by_patent)
mdc["op_inv_msa"] = mdc["op"].map(msa_by_patent)
# Patents without a lookup entry map to NaN; NaN never compares equal, so those rows are False
mdc["inv_msa_match"] = mdc["tp_inv_msa"].eq(mdc["op_inv_msa"])
del msa_by_patent

In [49]:
# Persist `mdc` as a GZIP-compressed parquet file whose name embeds the grouping key `k`
pr_cite_path = "DataStore/2019-01/{0}_pr_cite_0110.parq".format(k)
fastparquet.write(pr_cite_path, mdc, compression="GZIP")

  head = data[:10] if isinstance(data, pd.Index) else data.valid()[:10]


In [34]:
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")

# Direct citations
# Create all False
mdc["direct_cite"] = False
# Create zipped pairs
mdc["tp_op"] = list(zip(mdc["tp"], mdc["op"]))
# Citations
c2 = cit[cit["cited"].isin(mdc["tp"])].compute()
mdc.loc[mdc["tp_op"].isin(list(zip(c2["cited"], c2["citing"]))), "direct_cite"] = True
del(c2)
print(mdc["direct_cite"].value_counts())
# Drop pairs
mdc = mdc.drop("tp_op",1)

# Number of common citations
# Citations
%time c2 = cit[cit["citing"].isin(mdc["tp"]) | cit["citing"].isin(mdc["op"])].compute()
# Dictionary of citing: cited patents
%time c2 = {n:g["cited"].tolist() for n,g in c2.groupby("citing")}
# Get number of overlapping
%time num_common_cites = (set(c2.get(tp, [])).intersection(set(c2.get(op, []))) for tp, op in zip(mdc["tp"], mdc["op"]))
%time mdc["num_common_cites"] = [len(i) for i in num_common_cites]
del(c2, cit)
# At least one number of common cites
mdc["common_cited_match"] = False
mdc.loc[(mdc["num_common_cites"] >= 1), "common_cited_match"] = True
print(mdc["num_common_cites"].value_counts()[:20])
print(mdc["common_cited_match"].value_counts())
print(len(mdc))
print(mdc.columns)
    
    

False    4575670
True        4863
Name: direct_cite, dtype: int64
CPU times: user 4min 4s, sys: 36.9 s, total: 4min 41s
Wall time: 1min 39s
CPU times: user 9min 52s, sys: 18.9 s, total: 10min 11s
Wall time: 9min 58s
CPU times: user 275 ms, sys: 145 ms, total: 420 ms
Wall time: 417 ms
CPU times: user 26.7 s, sys: 91.7 ms, total: 26.7 s
Wall time: 26.7 s
0     4520243
1       37282
2        9129
3        3986
4        2175
5        1363
6         999
7         702
8         523
9         413
10        319
11        314
12        228
13        186
14        163
15        146
17        133
16        126
18        120
19        115
Name: num_common_cites, dtype: int64
False    4520243
True       60290
Name: common_cited_match, dtype: int64
4580533
Index(['op', 'tp', 'num_common_pat_inv', 'common_pat_inv', 'lawyer_match',
       'examiner_match', 'tp_inv_msa', 'op_inv_msa', 'direct_cite',
       'num_common_cites', 'common_cited_match'],
      dtype='object')


In [35]:
# Non patent citations
%time oc = fastparquet.ParquetFile("RawData/Cleaned/otherreference1016.parq").to_pandas()
%time oc = {n:g["ref_id"].tolist() for n,g in oc.groupby("patent_id")}

num_common_cites = (set(oc.get(tp, [])).intersection(set(oc.get(op, []))) for tp, op in zip(mdc["tp"], mdc["op"]))
%time mdc["num_common_npc"] = [len(i) for i in num_common_cites]
del(num_common_cites, oc)

# Match
mdc["common_npc_match"] = False
mdc.loc[mdc["num_common_npc"] >= 1, "common_npc_match"] = True

mdc["common_cited_match"] = False
mdc.loc[mdc["num_common_cites"] >= 1, "common_cited_match"] = True

CPU times: user 1min 19s, sys: 20.2 s, total: 1min 40s
Wall time: 1min 38s
CPU times: user 21min 31s, sys: 14.2 s, total: 21min 45s
Wall time: 21min 35s
CPU times: user 34.8 s, sys: 102 ms, total: 34.9 s
Wall time: 34.9 s


In [36]:
%time fastparquet.write("DataStore/2019-01/{0}_pr_cite_0110.parq".format(k), mdc, compression="GZIP")

  head = data[:10] if isinstance(data, pd.Index) else data.valid()[:10]


In [41]:
# Display the column index of `mdc` to check which derived columns are present
mdc.columns

Index(['op', 'tp', 'num_common_pat_inv', 'common_pat_inv', 'lawyer_match',
       'examiner_match', 'tp_inv_msa', 'op_inv_msa', 'direct_cite',
       'num_common_cites', 'common_cited_match', 'num_common_npc',
       'common_npc_match', 'inv_msa_match'],
      dtype='object')