In [2]:
# Imports and notebook-wide setup (working directory, sampling seed).
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
# All data paths in this notebook are relative (RawData/, DataStore/, Logs/),
# so they resolve against this directory.
# NOTE(review): machine-specific absolute path; must be edited to run elsewhere.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
# Passed as random_state when sampling focal patents
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Distances
import scipy.spatial.distance as distance
# KL
from scipy.stats import entropy
# Normalize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
# NOTE(review): `scaler` is not referenced in the cells visible here
scaler = StandardScaler()
# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances
import h5py
# NOTE(review): duplicate of the `import dask` above
import dask
import dask.dataframe as dd
import dask.array as da

  from ._conv import register_converters as _register_converters


*17.05.18*
## Mutual citing patents
1. Get random patents (50,000)
2. Get list of all patents *cited by* 1
3. Get patents also citing 2, but granted *after* 1, so that 3 could potentially have also cited 1
4. **Key**: Sample from 3. Repeat. Otherwise my results may be too driven by initial sample.

Sampling taken from `201709KnowledgeSpilloversRep/1A-MutualCitingSample.ipynb`.

See `1a-MutualCitesDataset-v1` for full version of this code.

In [2]:
# Read the patent table and the citation-pair sample fully into pandas.
# Per the original note, the citation sample uses only citations not from the
# same assignee, granted within 10 years.
patent_df, citation_df = (
    fastparquet.ParquetFile(parq_path).to_pandas()
    for parq_path in (
        "RawData/Cleaned/patent_loc_unique_us_0628.parq",
        "DataStore/2018-07-P2/citation_pairs_samp_0716.parq",
    )
)

- Updated script from `201804KnowledgeSpilloversRep/2018-03-P1/get_citations.py`

In [3]:
def get_citations(pats_col, targ_pats_df, max_year_diff = 100, targ_pats_cited = True,
                  citation_cols_to_merge = ['patent', 'gyear'], 
                 targ_cols_to_merge = ['patent', 'gyear'],
                 citations = None, patents = None):
    """Return the citation links of a set of target patents, with patent info attached.

    Parameters
    ----------
    pats_col : str
        Column of ``targ_pats_df`` holding the target patent ids. The target
        side of each returned link is renamed to this name.
    targ_pats_df : DataFrame
        Frame containing the target patents in column ``pats_col``.
    max_year_diff : number
        Links whose absolute grant-year gap exceeds this value are dropped.
        Links with a missing ``gyear`` on either side are dropped as well,
        because a NaN gap fails the ``<=`` comparison.
    targ_pats_cited : bool
        ``True`` (identity check): the targets are the *cited* side, so the
        result lists patents citing them. Anything else: the targets are the
        *citing* side, so the result lists the patents they cite.
    citation_cols_to_merge : list of str
        Patent-table columns attached for the non-target side, prefixed with
        ``citing_`` or ``cited_``. Must include ``'patent'`` and ``'gyear'``.
    targ_cols_to_merge : list of str
        Patent-table columns attached (unprefixed) for the target side.
        Must include ``'patent'`` and ``'gyear'``.
    citations : DataFrame, optional
        Citation pairs with ``citing`` and ``cited`` columns. Defaults to the
        notebook-level ``citation_df``.
    patents : DataFrame, optional
        Patent table with a ``patent`` column. Defaults to the notebook-level
        ``patent_df``.

    Returns
    -------
    DataFrame
        One row per retained citation link.
    """
    # Fall back to the notebook globals so existing calls keep working
    if citations is None:
        citations = citation_df
    if patents is None:
        patents = patent_df

    if targ_pats_cited is True:
        # Targets are cited; look up the patents citing them.
        # Left join: unmatched citing patents get NaN gyear and are removed by
        # the year filter below.
        targ_side, other_side, other_how = "cited", "citing", "left"
    else:
        # Targets are citing; look up the patents they cite.
        # Inner join: citations to patents absent from the patent table are dropped.
        targ_side, other_side, other_how = "citing", "cited", "inner"
    other_prefix = other_side + "_"

    # 1. Limit the citation pairs to the ones involving the target patents
    links = citations.loc[citations[targ_side].isin(targ_pats_df[pats_col])]
    # 2. Attach info on the non-target patent, prefixed to avoid name clashes
    links = links.merge(patents[citation_cols_to_merge].add_prefix(other_prefix), how = other_how,
                        left_on = other_side, right_on = other_prefix + "patent")
    links = links.drop(columns = other_prefix + "patent")
    # 3. Attach info on the target patent
    links = links.merge(patents[targ_cols_to_merge], how = "left",
                        left_on = targ_side, right_on = "patent")
    links = links.drop(columns = "patent")
    # 4. Cull links whose grant years are further apart than max_year_diff
    links["year_diff"] = links[other_prefix + "gyear"] - links["gyear"]
    links = links.loc[links["year_diff"].abs() <= max_year_diff]
    # 5. Name the target-side column after pats_col
    links = links.rename(columns = {targ_side: pats_col})

    return links.drop(columns = ["year_diff"])

In [4]:
# 1. Create initial run
mdc = pd.DataFrame()
i = 0
p2 = patent_df.loc[(patent_df["gyear"] >= 1980) & (patent_df["gyear"] <= 2015)].copy()

while i < 11:
    print(i)
    print(datetime.datetime.now())
    # 2. Initial sample
    tp = p2.sample(50000, random_state = seed)
    # tp["gyear"].value_counts()

    # 3. All patents cited by tp
    %time tpc = get_citations(pats_col = "patent", targ_pats_df = tp, \
                              targ_pats_cited = False, max_year_diff = 100)
    print(datetime.datetime.now())
    print(len(tpc))
    # 4. All patents citing cited
    %time tpc2 = get_citations(pats_col = "cited", targ_pats_df = tpc, \
                              targ_pats_cited = True, max_year_diff = 100)
    print(datetime.datetime.now())
    # 5. Merge with tpc
    # First drop gyear column on tpc2
    %time tpc = tpc.merge(tpc2.drop(["gyear"],1), how = "left", on = "cited")
    print(len(tpc))
    del(tpc2)
    # 6. Drop those with gyear after tp, as we want tp to be potentially citable by op
    tpc = tpc.loc[(tpc["gyear"] < tpc["citing_gyear"])]
    # Groupby patent, citing and get number of mutual patents
    tpc = tpc.groupby(["patent", "citing"]).size().reset_index()
    tpc = tpc.rename(columns={"patent":"tp", "citing": "op", 0: "num_common_cited"})
    print(len(tpc))

    # 7. Sample only % of resulting dataset
    mdc = mdc.append(tpc.sample(frac=0.25), ignore_index = True)
    print(len(mdc))
    
    # Update sampling set
    p2 = p2.loc[~(patent_df["patent"].isin(mdc["tp"]))]
    i += 1

0
2018-07-16 16:58:10.225926
CPU times: user 2.7 s, sys: 183 ms, total: 2.89 s
Wall time: 3.18 s
2018-07-16 16:58:13.495161
163476
CPU times: user 3.46 s, sys: 342 ms, total: 3.8 s
Wall time: 4.18 s
2018-07-16 16:58:17.673923
CPU times: user 374 ms, sys: 89.4 ms, total: 464 ms
Wall time: 509 ms
4113585
1090078
272520
1
2018-07-16 16:58:19.662897
CPU times: user 2.59 s, sys: 183 ms, total: 2.77 s
Wall time: 3.04 s
2018-07-16 16:58:22.812381
163674
CPU times: user 3.44 s, sys: 332 ms, total: 3.78 s
Wall time: 4.15 s
2018-07-16 16:58:26.965375
CPU times: user 361 ms, sys: 66.4 ms, total: 427 ms
Wall time: 469 ms
4210564
1095308
546347
2
2018-07-16 16:58:28.789901
CPU times: user 2.65 s, sys: 169 ms, total: 2.82 s
Wall time: 3.1 s
2018-07-16 16:58:31.991730
159261
CPU times: user 3.45 s, sys: 305 ms, total: 3.75 s
Wall time: 4.13 s
2018-07-16 16:58:36.121275
CPU times: user 367 ms, sys: 43.6 ms, total: 411 ms
Wall time: 451 ms
4064372
1069225
813653
3
2018-07-16 16:58:37.832815
CPU times: 

In [5]:
# Drop any duplicates
# A (tp, op) pair should appear once; row counts before and after are printed
# so any removed rows are visible.
pair_key_cols = ["tp", "op"]
print(mdc.shape[0])
mdc = mdc.drop_duplicates(subset=pair_key_cols)
print(mdc.shape[0])
fastparquet.write(
    "DataStore/2018-07-P2/mutual_cited_samp_0716.parq",
    mdc,
    compression="GZIP",
)

2810185
2810185


### Add other variables
- *tp*: original patent
- *op*: patent that cites at least one patent also cited by *tp*, and was granted after *tp*

#### Check if *tp* is cited by *op*

Get all patents cited by *op*, then check whether *tp* is in that list

In [6]:
%time tpc3 = get_citations(pats_col = "op", targ_pats_df = mdc, \
                          targ_pats_cited = False, max_year_diff = 100)
%time op_cited = {n:g["cited"].tolist() for n,g in tpc3[["op", "cited"]].groupby("op")}
del(tpc3)
mdc["op_cites_tp"] = [tp in op_cited[op] if op in op_cited.keys() else np.nan for tp,op \
                      in zip(mdc["tp"], mdc["op"])] 
print(len(mdc))

CPU times: user 5.39 s, sys: 464 ms, total: 5.85 s
Wall time: 5.85 s
CPU times: user 1min 29s, sys: 981 ms, total: 1min 30s
Wall time: 1min 28s
2810185


In [7]:

# Tabulate the op_cites_tp flag across the sampled pairs
mdc["op_cites_tp"].value_counts()

False    2754414
True       55771
Name: op_cites_tp, dtype: int64

In [8]:
# Write mdc (now including op_cites_tp) to the sample file, GZIP-compressed
fastparquet.write("DataStore/2018-07-P2/mutual_cited_samp_0716.parq", mdc, compression="GZIP")

## Add similarity
### 2.1 Measuring similarity across data

In [9]:
# Send progress messages to the root logger and to a dated log file.
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# basicConfig only installs its console handler when the root logger has none
logging.basicConfig(level=logging.INFO)
log_path = 'Logs/mutual_cite_sim_{0}.log'.format(datetime.datetime.now().strftime("%Y-%m-%d"))
# FileHandler does not create missing directories
os.makedirs(os.path.dirname(log_path), exist_ok=True)
# Re-running this cell must not attach a second handler for the same file,
# otherwise every message is written to the log twice.
log_file_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath(log_path)
    for handler in logger.handlers
)
if not log_file_attached:
    # Append mode, so several runs on the same day share one log file
    logger.addHandler(logging.FileHandler(log_path, mode='a'))
# NOTE: from here on `print` is logging.info. It takes a single message
# argument; extra positional arguments are treated as %-format parameters.
print = logging.info
print('good day to you madam fiona')
print('started')
print(datetime.datetime.now())

INFO:root:good day to you madam fiona
INFO:root:started
INFO:root:2018-07-16 17:03:21.326738


In [10]:
# For each vector representation, compute the cosine similarity between the
# tp and op vectors of every sampled pair and store the result in res[dm].
dms = ["ldavecs", "docvecs"]
res = {}

# Patent id -> row position. Loaded once, since it does not depend on dm.
# NOTE(review): this assumes the rows of each *_pats_0712.parq matrix follow
# the row order of patabs7615_us_no_dup.parq -- confirm.
pat_ids = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
pat_dict = dict(zip(pat_ids, range(len(pat_ids))))

for dm in dms:
    print("Loading matrix and dict")
    print(datetime.datetime.now())

    # Full vector matrix as an in-memory numpy array
    pm = dd.read_parquet("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm)).values.compute()

    l3 = fastparquet.ParquetFile("DataStore/2018-07-P2/mutual_cited_samp_0716.parq").to_pandas()\
    .rename(columns={"cited":"tp", "citing":"op"})

    # Remove pairs where either patent has no vector
    print(len(l3))
    # .copy() so the similarity column is added to an independent frame
    l3 = l3.loc[l3["tp"].isin(pat_dict.keys()) & l3["op"].isin(pat_dict.keys())].copy()
    print(len(l3))

    # Gather the vector of each side of every pair (row k <-> pair k)
    tp_vecs = pm[[pat_dict[p] for p in l3["tp"].tolist()]].astype(np.float64, copy=False)
    op_vecs = pm[[pat_dict[p] for p in l3["op"].tolist()]].astype(np.float64, copy=False)

    print("Getting patent pair similarity")
    print("cosine")
    print(datetime.datetime.now())
    # Row-wise cosine similarity: only the matched (tp_k, op_k) products are
    # needed, so compute them directly. einsum avoids allocating a temporary
    # the size of the inputs.
    dot_prod = np.einsum("ij,ij->i", tp_vecs, op_vecs)
    norm_prod = np.sqrt(np.einsum("ij,ij->i", tp_vecs, tp_vecs)
                        * np.einsum("ij,ij->i", op_vecs, op_vecs))
    # An all-zero vector gives 0/0 = NaN. Clip to guard against rounding
    # pushing |cos| slightly above 1.
    l3["sim_{0}".format(dm)] = np.clip(dot_prod / norm_prod, -1.0, 1.0)
    res[dm] = l3
    print("finished")
    print(datetime.datetime.now())
    del l3
        
        

INFO:root:Loading matrix and dict
INFO:root:2018-07-16 17:03:21.345195
INFO:root:2810185
INFO:root:2445169
INFO:root:2445169
INFO:root:2445169
INFO:root:Getting patent pair similarity
INFO:root:cosine
INFO:root:2018-07-16 17:03:40.075070
INFO:root:finished
INFO:root:2018-07-16 17:12:53.408940


In [11]:
# Attach the doc-vector similarity to the LDA-vector frame, matching on the
# (tp, op) pair. Every LDA row is kept.
mdc = pd.merge(
    res["ldavecs"],
    res["docvecs"][["tp", "op", "sim_docvecs"]],
    on = ["tp", "op"],
    how = "left",
)

In [12]:
# Preview the first rows of the combined frame
mdc.head()

Unnamed: 0,tp,op,num_common_cited,op_cites_tp,sim_ldavecs,sim_docvecs
0,8312465,9104645,1,False,0.201732,0.142462
1,6546553,6823508,1,False,0.686707,0.220453
2,7310179,7916103,33,False,0.160098,0.092198
3,8250031,8595191,1,False,0.396526,0.347709
4,7085553,7379733,1,False,0.578231,0.016689


In [13]:
# Write the pairs with both similarity columns, GZIP-compressed
fastparquet.write("DataStore/2018-07-P2/mutual_cited_sim_0716.parq", mdc, compression="GZIP")

### Merge with original dataset for other variables

In [3]:
# Patent table, reduced to one row per patent id
pdf = (
    fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")
    .to_pandas()
    .drop_duplicates(subset=["patent"])
)
# Sampled pairs with similarity scores
mdc = fastparquet.ParquetFile("DataStore/2018-07-P2/mutual_cited_sim_0716.parq").to_pandas()
print(len(mdc))
# Drop missing: keep only pairs where both patent ids are present
mdc = mdc.dropna(subset=["tp", "op"])
print(len(mdc))

2445169
2445169


In [4]:
# Attach grant year, inventor MSA, NAICS name and primary class for both sides
# of each pair, as tp_* and op_* columns.
pat_attr_cols = ["patent", "gyear", "inv_msa", "naics_name", "primclass"]
for side in ("tp", "op"):
    side_prefix = side + "_"
    # Left join on both sides. Every pair is kept, and a patent missing from
    # pdf just gets NaN attributes. No unmatched pdf rows are pulled in, so
    # the pair columns are not forced to float.
    mdc = mdc.merge(pdf[pat_attr_cols].add_prefix(side_prefix), how = "left",
                    left_on = side, right_on = side_prefix + "patent").\
    drop(columns = side_prefix + "patent")
    # Drop missing
    mdc = mdc.loc[(mdc["tp"].notnull()) & (mdc["op"].notnull())]
    print(len(mdc))

2445169
2445169


In [None]:
# Add year group
def get_year_group(x):
    """Map a grant year to its decade label.

    Each bin includes its first year and excludes its last: 1985 falls in
    "1985-95", and 2015 falls in no bin. Values outside 1975-2014 (including
    NaN and non-integral numbers) give np.nan.
    """
    decade_bins = (
        (range(1975, 1985), "1975-85"),
        (range(1985, 1995), "1985-95"),
        (range(1995, 2005), "1995-05"),
        (range(2005, 2015), "2005-15"),
    )
    for years, label in decade_bins:
        if x in years:
            return label
    return np.nan
# Label each pair with the decade group of the focal patent's grant year
mdc["year_group"] = mdc["tp_gyear"].map(get_year_group)

# Don't scale: the similarity rescaling below is intentionally left disabled
# eps = 0.01
# dv_min = 0.7
# def scale_docvecs(x):
#     scaled = ((x+dv_min)/(1+dv_min))*(1-eps)+eps
#     return scaled
# def scale_ldavecs(x):
#     scaled = x*(1-eps)+eps
#     return scaled
# mdc["sim_ldavecs"] = mdc["sim_ldavecs"].apply(scale_ldavecs)
# mdc["sim_docvecs"] = mdc["sim_docvecs"].apply(scale_docvecs)
# print(len(mdc))

In [None]:
# Write the final pair dataset (mdc) to parquet, GZIP-compressed
fastparquet.write("DataStore/2018-07-P2/mutual_cited_0716.parq", mdc, compression="GZIP")

In [19]:
# Summary statistics of both similarity measures, rounded to 3 decimals
mdc[["sim_ldavecs", "sim_docvecs"]].describe().round(3)

Unnamed: 0,sim_ldavecs,sim_docvecs
count,2445169.0,2445169.0
mean,0.458,0.548
std,0.252,0.084
min,0.011,0.037
25%,0.252,0.491
50%,0.455,0.545
75%,0.66,0.601
max,1.0,0.968
