In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import dask.dataframe as dd
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Distances
import scipy.spatial.distance as distance
# KL
from scipy.stats import entropy
# Normalize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances
import h5py
# Percentiles
from scipy.stats import percentileofscore

  from ._conv import register_converters as _register_converters


### Removing duplicate patents
- This is a huge problem; over 10% of patent text granted is duplicated

In [2]:
# # Create patent assignee unique data
# pa = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_0628.parq").to_pandas()
# print(len(pa))
# # Patent-asg count
# pa = pa[["patent", "assignee_id", "type"]].groupby(["patent", "assignee_id"]).count().reset_index()
# print(len(pa))
# # Sort by assignee count per patent
# pa = pa.sort_values(["patent", "type"], ascending = [1,0])
# # Keep assignee with most count
# pa = pa.drop_duplicates(subset=["patent"], keep="first").drop("type",1)
# print(len(pa))
# fastparquet.write("RawData/Cleaned/patent_assignees_unique_0628.parq", pa, compression="GZIP")

In [3]:
# Inventor-patent pairs
ip = fastparquet.ParquetFile("RawData/Cleaned/patent_inventors_0628.parq").to_pandas()
dup_pats = pd.read_pickle("RawData/Cleaned/duplicate_pattext_0712.pkl").tolist()
print(len(ip))
# Get relevant US Patents
usp = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(["patent"])["patent"]
# Look at only relevant US patents and non-duplicated patents
ip = ip.loc[ip["patent"].isin(usp) & ~(ip["patent"].isin(dup_pats))]
print(len(ip))
# Patent-assignee pairs
pa = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas()
ip = ip.merge(pa, how = "left", on = "patent")
del(pa, usp)

5017246
4302101


In [4]:
# Get rid of patents with no assignee
print(len(ip))
ip = ip.loc[ip["assignee_id"].notnull()]
print(len(ip))

# Count number of assignees by inventor
i2 = ip[["inventor_id", "assignee_id"]].drop_duplicates("assignee_id").groupby("inventor_id").size()
print(len(i2)) # Num inventors working at firms in total

# Keep inventors at more than 2 different assignees
i2 = i2.loc[i2 >= 2]
print(len(i2))

4302101
3860816
141583
12377


In [5]:
# Patents by inventors who move
print(len(ip))
ip = ip.loc[ip["inventor_id"].isin(i2.index.tolist())]
print(len(ip))
ip.to_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")

3860816
140076


### Magnitude
- 12377/141583 total inventors working at firms have switched firms
- They account for 140076 of 3860816 patents assigned

___________

## Inventor similarity pairs

### 1. Total number of patent pairs for each inventor?

In [6]:
%%time
ig = ip[["patent", "inventor_id"]].groupby(["inventor_id"])
inv_pairs = [len(set(itertools.combinations(g["patent"].tolist(),2))) for n,g in ig]
print(np.sum(inv_pairs))

3278262
CPU times: user 3.58 s, sys: 6.76 ms, total: 3.58 s
Wall time: 3.56 s


### 2. Create pairs of inventor patents dataframe

In [7]:
inv_pairs = [list(itertools.combinations(g["patent"].tolist(),2)) for n,g in ig]
inv_pairs = [item for sublist in inv_pairs for item in sublist]

# Order them by earlier patent first
inv_pairs = [(i,j) if i<j else (j,i) for i,j in inv_pairs ]

In [8]:
# Convert to dataframe
inv_pairs = pd.DataFrame({"tp": [i[0] for i in inv_pairs],
                         "op": [i[1] for i in inv_pairs]})
print(len(inv_pairs))
inv_pairs = inv_pairs.drop_duplicates()
print(len(inv_pairs))

3281556
3087989


In [9]:
# Get inventor
pat_asg = dict(zip(ip["patent"], ip["assignee_id"]))
inv_pairs["inventor_id"] = inv_pairs["tp"].map(dict(zip(ip["patent"], ip["inventor_id"])))
# Get assignee for each patent
inv_pairs["tp_asg"] = inv_pairs["tp"].map(pat_asg)
inv_pairs["op_asg"] = inv_pairs["op"].map(pat_asg)
# # Assignee match DON'T DO THIS
# inv_pairs["asg_match"] = (inv_pairs["tp_asg"] == inv_pairs["op_asg"])

### 3. Assignee match: if assignee list in pairs have an assignee in common

In [10]:
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))
print(len(inv_pairs))
# Check that target and other do not have same assignee
%time asg_match = (set(asgs.get(tp, [])).intersection(asgs.get(op, [])) for tp, op in zip(inv_pairs["tp"], inv_pairs["op"]))
%time asg_match = [len(i) for i in asg_match]

inv_pairs["asg_match"] = [True if i >=1 else False for i in asg_match]
print(len(inv_pairs))

3087989
CPU times: user 84.3 ms, sys: 80.5 ms, total: 165 ms
Wall time: 164 ms
CPU times: user 3.29 s, sys: 14.9 ms, total: 3.3 s
Wall time: 3.28 s
3087989


In [11]:
inv_pairs["asg_match"].value_counts()

False    1819533
True     1268456
Name: asg_match, dtype: int64

### 4. Did citation occur?

In [12]:
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")
# Use only relevant citations
cit = cit[cit["cited"].isin(inv_pairs["tp"])]
print(len(cit))
cit = cit[cit["citing"].isin(inv_pairs["op"])]
print(len(cit))
cit = cit.compute()

# Citation dictionary
cit = dict(zip(list(zip(cit["cited"], cit["citing"])), [True]*len(cit)))

3315549
345420


In [13]:
inv_pairs["op_cites_tp"] = [cit.get(i, False) for i in zip(inv_pairs["tp"], inv_pairs["op"])]

In [14]:
inv_pairs["op_cites_tp"].value_counts()

False    2954128
True      133861
Name: op_cites_tp, dtype: int64

In [15]:
fastparquet.write("DataStore/2018-07/inv_move_pat_pairs_0712.parq", inv_pairs, compression="GZIP")

In [16]:
del(inv_pairs)

### 5. Similarity

In [None]:
import scipy.spatial.distance as distance
dms = ["ldavecs", "docvecs"]
# dms = ["ldavecs"]
res = {}
for dm in dms:
    print("Loading matrix and dict")
    print(datetime.datetime.now())
    
    pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
    pat_dict = dict(zip(pat_dict, range(len(pat_dict))))
    pm = dd.read_parquet("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm)).values.compute()
    
    l3 = fastparquet.ParquetFile("DataStore/2018-07/inv_move_pat_pairs_0712.parq").to_pandas(["tp", "op"])
    
    # Remove missing values
    print(len(l3))
    l3 = l3.loc[l3["tp"].isin(pat_dict.keys()) & l3["op"].isin(pat_dict.keys())]
    print(len(l3))
    if dm == "ldavecs":
        ncols = 60
    else:
        ncols = 100
    print(len(l3))
    tp_pv = pm[[pat_dict[p] for p in l3["tp"].tolist()]]
    op_pv = pm[[pat_dict[p] for p in l3["op"].tolist()]]
    print(len(l3))
    del(pat_dict, pm)

    # Split into chunks
    n_chunks = np.round(len(l3)/3000)
    tp_chunks = np.array_split(tp_pv, n_chunks)
    op_chunks = np.array_split(op_pv, n_chunks)
    del(tp_pv, op_pv)

    print("Getting patent pair similarity")
    print("cosine")
    print(datetime.datetime.now())
    # Cosine
    # Recursively lengthen the array of cosine distances
    cos_dis = np.array([])
    for i,j in zip(tp_chunks, op_chunks):
        cos_dis = np.hstack((cos_dis, np.hstack([np.diag(distance.cdist(i,j, metric = "cosine"))])))
        
    l3["sim_{0}".format(dm)] = 1-cos_dis
    res[dm] = l3
    print("finished")
    print(datetime.datetime.now())
    del(l3)    
        

Loading matrix and dict
2018-07-14 10:48:37.078151
3087989
2144532
2144532
2144532
Getting patent pair similarity
cosine
2018-07-14 10:48:49.787916


In [None]:
sim = res["ldavecs"].merge(res["docvecs"][["tp", "op", "sim_docvecs"]],
                          how = "left", on = ["tp", "op"])
# Save raw similarity
fastparquet.write("DataStore/2018-07/inv_move_pat_pairs_sim_0712.parq", sim, compression="GZIP")

In [None]:
inv_pairs = fastparquet.ParquetFile("DataStore/2018-07/inv_move_pat_pairs_0712.parq").to_pandas().drop(\
                                        ["sim_ldavecs", "sim_docvecs", "bin_sim_ldavecs", "bin_sim_docvecs"],1)
# sim = fastparquet.ParquetFile("DataStore/2018-07/inv_move_pat_pairs_sim_0712.parq").to_pandas()
# Merge with original
inv_pairs = inv_pairs.merge(sim, how = "left", on = ["tp", "op"])

In [5]:
np.round(sim["sim_docvecs"].describe(),3)

count    2144532.000
mean           0.254
std            0.185
min           -0.696
25%            0.125
50%            0.235
75%            0.364
max            1.000
Name: sim_docvecs, dtype: float64

In [6]:
# Scale
eps = 0.01
dv_min = 0.7
def scale_docvecs(x):
    scaled = ((x+dv_min)/(1+dv_min))*(1-eps)+eps
    return scaled
def scale_ldavecs(x):
    scaled = x*(1-eps)+eps
    return scaled

# Scale measures
inv_pairs["sim_docvecs"] = inv_pairs["sim_docvecs"].apply(scale_docvecs)
inv_pairs["sim_ldavecs"] = inv_pairs["sim_ldavecs"].apply(scale_ldavecs)

In [7]:
# Bins
import math

def sim_bin(x, a):
    try:
        return math.ceil(x / a) * a
    except:
        return np.nan
    
inv_pairs["bin_sim_ldavecs"] = inv_pairs["sim_ldavecs"].apply(lambda x: sim_bin(x, 0.1))
inv_pairs["bin_sim_docvecs"] = inv_pairs["sim_docvecs"].apply(lambda x: sim_bin(x, 0.1))

In [11]:
fastparquet.write("DataStore/2018-07/inv_move_pat_pairs_0712.parq", inv_pairs, compression="GZIP")

In [8]:
inv_pairs[["bin_sim_docvecs", "asg_match", "op_cites_tp"]].groupby(["bin_sim_docvecs", "asg_match"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,op_cites_tp
bin_sim_docvecs,asg_match,Unnamed: 2_level_1
0.1,False,0.0
0.1,True,0.0
0.2,False,0.004739
0.2,True,0.041237
0.3,False,0.004717
0.3,True,0.017634
0.4,False,0.006432
0.4,True,0.023333
0.5,False,0.012876
0.5,True,0.03667


In [9]:
inv_pairs[["bin_sim_ldavecs", "asg_match", "op_cites_tp"]].groupby(["bin_sim_ldavecs", "asg_match"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,op_cites_tp
bin_sim_ldavecs,asg_match,Unnamed: 2_level_1
0.1,False,0.008046
0.1,True,0.029525
0.2,False,0.016276
0.2,True,0.04827
0.3,False,0.026112
0.3,True,0.056831
0.4,False,0.036836
0.4,True,0.060344
0.5,False,0.045256
0.5,True,0.070155


In [10]:
inv_pairs[["bin_sim_docvecs", "asg_match", "op_cites_tp"]].groupby(["bin_sim_docvecs", "asg_match"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,op_cites_tp
bin_sim_docvecs,asg_match,Unnamed: 2_level_1
0.1,False,17
0.1,True,11
0.2,False,211
0.2,True,97
0.3,False,2756
0.3,True,1361
0.4,False,61874
0.4,True,21472
0.5,False,363480
0.5,True,161359


In [24]:
inv_pairs[["bin_sim_docvecs", "asg_match", "op_cites_tp"]].groupby(["bin_sim_docvecs", "asg_match"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,op_cites_tp
bin_sim_docvecs,asg_match,Unnamed: 2_level_1
0.1,False,54
0.1,True,6
0.2,False,7757
0.2,True,1534
0.3,False,137179
0.3,True,40566
0.4,False,478685
0.4,True,218881
0.5,False,470791
0.5,True,331589
