In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import collections
import dask.dataframe as dd

  from ._conv import register_converters as _register_converters


### Similarity across patents using the same terms over time
- Code from https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201807Results/MutualCitations/3b-NewTerms-Sim.ipynb

In [13]:
py = pd.read_pickle("DataStore/2018-06/newterm_pat_year_0614.pkl")

# Groupby term, yr_from_first and get set of leading & following patents
lead_pats = {n: g["patent"].tolist() for n,g in py.loc[py["yr_from_first"] == 0,\
                            ["yr_from_first", "patent", "term"]].groupby("term")}
follow_pats = {n: g["patent"].tolist() for n,g in py.loc[py["yr_from_first"] >= 1,\
                            ["yr_from_first", "patent", "term"]].groupby("term") \
              if n in lead_pats.keys()}
# Filter lead_pats by terms in follow_pats
lead_pats = {k:v for k,v in lead_pats.items() if k in follow_pats.keys()}

# Dataframe
pairlists = pd.DataFrame({"term": list(lead_pats.keys()), 
                          "lead_pats": [v for k,v in lead_pats.items()],
                         "follow_pats": [v for k,v in follow_pats.items()]})



In [14]:
import scipy.spatial.distance as distance
import dask.array as da
import math
dms = ["ldavecs", "docvecs"]
res = {}

for dm in dms:
    print("Loading matrix and dict")
    print(datetime.datetime.now())
    pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
    pat_dict = dict(zip(pat_dict, range(len(pat_dict))))
    pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm))\
        .to_pandas().values
    
    l3 = pairlists
    # Only use patents in dictionary
    %time l3["lead_pats"] = [[i for i in l if i in pat_dict.keys()] for l in l3["lead_pats"].tolist()]
    
    # Get pats and then sample from follow_pats
    %time fp = ([i for i in l if i in pat_dict.keys()] for l in l3["follow_pats"].tolist())    
    %time l3["follow_pats"] = [np.random.choice(l, int(np.round(len(l)/3)), replace=False) for l in fp]
    
    r = pd.DataFrame()
    print("getting sim")
    for n in list(range(len(l3))):
        l4 = pd.DataFrame()
        # Patent lists
        tps = l3.loc[n, "lead_pats"]
        ops = l3.loc[n, "follow_pats"]
        # Patent vectors
        i = pm[[pat_dict[p] for p in tps]]
        j = pm[[pat_dict[p] for p in ops]]
        
        # Cosine distance
        cos_dis = distance.cdist(i,j, metric = "cosine")
        cos_dis = 1-cos_dis
        l4["sim_{0}".format(dm)] = cos_dis.flatten()
        # index: hope this is correct?
        pairs = list(itertools.product(tps, ops))
        
        l4["lead_pat"] = [i[0] for i in pairs]
        l4["follow_pat"] = [i[1] for i in pairs]
        l4["term"] = l3.loc[n, "term"]
        
        r = r.append(l4, ignore_index = True) 
    # Save results
    res[dm] = r
    
    print("finished")
    print(datetime.datetime.now())
    del(l3, r)         
    

Loading matrix and dict
2018-09-10 14:13:16.275875
CPU times: user 5.14 ms, sys: 0 ns, total: 5.14 ms
Wall time: 5.21 ms
CPU times: user 200 µs, sys: 0 ns, total: 200 µs
Wall time: 210 µs
CPU times: user 82 ms, sys: 244 µs, total: 82.3 ms
Wall time: 81.1 ms
getting sim
finished
2018-09-10 14:16:20.898967
Loading matrix and dict
2018-09-10 14:16:20.899898
CPU times: user 3.23 ms, sys: 159 µs, total: 3.39 ms
Wall time: 3.43 ms
CPU times: user 197 µs, sys: 0 ns, total: 197 µs
Wall time: 208 µs
CPU times: user 52.9 ms, sys: 2.12 ms, total: 55 ms
Wall time: 50.9 ms
getting sim
finished
2018-09-10 14:17:40.696567


In [15]:
r = res["docvecs"].merge(res["ldavecs"][["lead_pat", "follow_pat", "sim_ldavecs"]], how = "left",\
                         on = ["lead_pat", "follow_pat"])
print(len(r))

5609605


In [16]:
fastparquet.write("DataStore/2018-08/newterms_lead_follow_0910.parq", r, compression="GZIP")
del(r)

### Add relevant data

In [2]:
nt = fastparquet.ParquetFile("DataStore/2018-08/newterms_lead_follow_0910.parq").to_pandas().drop_duplicates()\
.rename(columns={"lead_pat":"tp", "follow_pat":"op"})
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(\
                            ["patent", "appyear", "inv_msa", "primclass", "naics_name", "title"])
print(len(nt))
nt = nt.merge(pdf.add_prefix("tp_"), how="left", left_on = "tp", right_on="tp_patent").drop("tp_patent",1)
nt = nt.merge(pdf.add_prefix("op_"), how="left", left_on = "op", right_on="op_patent").drop("op_patent",1)  
print(len(nt))

# Add other variables
nt["inv_msa_match"] = (nt["tp_inv_msa"] == nt["op_inv_msa"])
nt["year_diff"] = nt["op_appyear"] - nt["tp_appyear"]

5609234
5609234


In [3]:
# Add list of cited patents for each
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")

%time c2 = cit[cit["citing"].isin(nt["tp"]) | cit["citing"].isin(nt["op"])].compute()
del(cit)

# See if there's a direct citation
%time c3 = {n:g["cited"].tolist() for n,g in c2.groupby("citing")}
%time nt["direct_cite"] = [True if tp in c3.get(op, []) else False for tp, op in zip(nt["tp"], nt["op"])]
del(c3)


# Add assignees for each patent
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))

# Remove self-citations
%time asg_match = (set(asgs.get(cited, [])).intersection(asgs.get(citing, [])) for cited, citing \
                   in zip(c2["cited"], c2["citing"]))
%time asg_match = [len(i) for i in asg_match]
del(asgs)

c2["asg_match"] = asg_match
c2 = c2.loc[c2["asg_match"] == 0]
c2 = c2.drop("asg_match", 1)

# Number of common citations
%time c3 = {n:g["cited"].tolist() for n,g in c2.groupby("citing")}

%time num_common_cites = (set(c3.get(tp, [])).intersection(set(c3.get(op, []))) for tp, op in zip(nt["tp"], nt["op"]))
%time nt["num_common_cites"] = [len(i) for i in num_common_cites]

CPU times: user 3min 57s, sys: 53.4 s, total: 4min 50s
Wall time: 2min
CPU times: user 2.83 s, sys: 121 ms, total: 2.95 s
Wall time: 2.89 s
CPU times: user 6.94 s, sys: 0 ns, total: 6.94 s
Wall time: 6.99 s
CPU times: user 13 ms, sys: 18.9 ms, total: 31.9 ms
Wall time: 31.6 ms
CPU times: user 1.29 s, sys: 0 ns, total: 1.29 s
Wall time: 1.26 s
CPU times: user 3.85 s, sys: 82.6 ms, total: 3.93 s
Wall time: 3.74 s
CPU times: user 91.7 ms, sys: 8.59 ms, total: 100 ms
Wall time: 97.6 ms
CPU times: user 20.8 s, sys: 0 ns, total: 20.8 s
Wall time: 22 s


In [8]:
# Normalize spillover measures
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for c in ["sim_ldavecs", "sim_docvecs", "num_common_cites"]:
    nt["norm_{0}".format(c)] = np.nan
    nt.loc[nt[c].notnull(), "norm_{0}".format(c)] = scaler.fit_transform(nt.loc[nt[c].notnull(), c].values\
                                                                         .reshape(-1, 1))

  


In [10]:
np.round(nt[["sim_docvecs", "num_common_cites",\
    "norm_sim_docvecs", "norm_num_common_cites",
    "inv_msa_match", "year_diff"]].groupby(["inv_msa_match", "year_diff"]).mean(),3)

Unnamed: 0_level_0,Unnamed: 1_level_0,sim_docvecs,num_common_cites,norm_sim_docvecs,norm_num_common_cites
inv_msa_match,year_diff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,1.0,0.13,0.002,0.023,0.002
False,2.0,0.13,0.002,0.023,0.004
False,3.0,0.127,0.001,-0.001,0.001
False,4.0,0.134,0.0,0.054,-0.005
False,5.0,0.124,0.001,-0.019,-0.004
False,6.0,0.129,0.001,0.017,-0.003
False,7.0,0.132,0.001,0.037,-0.004
False,8.0,0.122,0.001,-0.032,-0.004
False,9.0,0.123,0.001,-0.028,-0.003
True,1.0,0.164,0.037,0.282,0.216


In [11]:
nt[["sim_docvecs", "num_common_cites",\
    "norm_sim_docvecs", "norm_num_common_cites",]].corr()

Unnamed: 0,sim_docvecs,num_common_cites,norm_sim_docvecs,norm_num_common_cites
sim_docvecs,1.0,0.016308,1.0,0.016308
num_common_cites,0.016308,1.0,0.016308,1.0
norm_sim_docvecs,1.0,0.016308,1.0,0.016308
norm_num_common_cites,0.016308,1.0,0.016308,1.0


### Add common inventor

In [3]:
nt = fastparquet.ParquetFile("DataStore/2018-08/newterms_lead_follow_0910.parq").to_pandas().drop_duplicates()

inv = fastparquet.ParquetFile("RawData/Cleaned/patent_inventors_0628.parq").to_pandas(["patent", "inventor_id"])
%time inv = {n: g["inventor_id"].tolist() for n, g in inv.groupby("patent")}

CPU times: user 12min 6s, sys: 443 ms, total: 12min 6s
Wall time: 12min 6s


In [4]:
%time num_common_inv = [len(set(inv[tp]).intersection(inv[op])) if (tp in inv.keys()) & (op in inv.keys())\
                      else np.nan for tp, op in zip(nt["tp"], nt["op"])]
nt["num_common_pat_inv"] = num_common_inv
del(num_common_inv)
nt["common_pat_inv"] = np.nan
nt.loc[nt["num_common_pat_inv"] >= 1, "common_pat_inv"] = True
nt.loc[nt["num_common_pat_inv"] == 0, "common_pat_inv"] = False

display(nt["common_pat_inv"].value_counts())
nt = nt.drop("num_common_pat_inv",1)

CPU times: user 12.5 s, sys: 199 ms, total: 12.7 s
Wall time: 12 s


False    4415275
True         575
Name: common_pat_inv, dtype: int64

In [6]:
nt = nt.drop("num_common_pat_inv",1)
nt.columns

Index(['sim_docvecs', 'tp', 'op', 'term', 'sim_ldavecs', 'tp_appyear',
       'tp_inv_msa', 'tp_primclass', 'tp_naics_name', 'tp_title', 'op_appyear',
       'op_inv_msa', 'op_primclass', 'op_naics_name', 'op_title',
       'inv_msa_match', 'year_diff', 'direct_cite', 'num_common_cites',
       'norm_sim_ldavecs', 'norm_sim_docvecs', 'norm_num_common_cites',
       'common_pat_inv'],
      dtype='object')

### Add common non-patent reference

In [4]:
nt = fastparquet.ParquetFile("DataStore/2018-08/newterms_lead_follow_0910.parq").to_pandas().drop_duplicates()

# %time oc = fastparquet.ParquetFile("RawData/Cleaned/otherreference1016.parq").to_pandas()
# %time oc = {n:g["ref_id"].tolist() for n,g in oc.groupby("patent_id")}

In [5]:
%time num_common_cites = (set(oc.get(tp, [])).intersection(set(oc.get(op, []))) for tp, op in zip(nt["tp"], nt["op"]))
%time nt["num_common_npc"] = [len(i) for i in num_common_cites]
nt["common_npc_match"] = (nt["num_common_npc"] >= 1)
display(nt["common_npc_match"].value_counts())

CPU times: user 502 ms, sys: 228 ms, total: 730 ms
Wall time: 722 ms
CPU times: user 19.5 s, sys: 123 ms, total: 19.6 s
Wall time: 19.7 s


False    5609194
True          40
Name: common_npc_match, dtype: int64

In [6]:
fastparquet.write("DataStore/2018-08/newterms_lead_follow_0910.parq", nt, compression="GZIP")

In [13]:
nt.columns

Index(['sim_docvecs', 'tp', 'op', 'term', 'sim_ldavecs', 'tp_appyear',
       'tp_inv_msa', 'tp_primclass', 'tp_naics_name', 'tp_title', 'op_appyear',
       'op_inv_msa', 'op_primclass', 'op_naics_name', 'op_title',
       'inv_msa_match', 'year_diff', 'direct_cite', 'num_common_cites',
       'norm_sim_ldavecs', 'norm_sim_docvecs', 'norm_num_common_cites'],
      dtype='object')