In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import dask.dataframe as dd
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Distances
import scipy.spatial.distance as distance
# KL
from scipy.stats import entropy
# Normalize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances
import h5py
# Percentiles
from scipy.stats import percentileofscore

  from ._conv import register_converters as _register_converters


### Removing duplicate patents
- This is a huge problem; over 10% of patent text granted is duplicated

In [2]:
# # Create patent assignee unique data
# pa = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_0628.parq").to_pandas()
# print(len(pa))
# # Patent-asg count
# pa = pa[["patent", "assignee_id", "type"]].groupby(["patent", "assignee_id"]).count().reset_index()
# print(len(pa))
# # Sort by assignee count per patent
# pa = pa.sort_values(["patent", "type"], ascending = [1,0])
# # Keep assignee with most count
# pa = pa.drop_duplicates(subset=["patent"], keep="first").drop("type",1)
# print(len(pa))
# fastparquet.write("RawData/Cleaned/patent_assignees_unique_0628.parq", pa, compression="GZIP")

In [3]:
# Inventor-patent pairs
ip = fastparquet.ParquetFile("RawData/Cleaned/patent_inventors_0628.parq").to_pandas()
dup_pats = pd.read_pickle("RawData/Cleaned/duplicate_pattext_0712.pkl").tolist()
print(len(ip))
# Get relevant US Patents
usp = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(["patent"])["patent"]
# Look at only relevant US patents and non-duplicated patents
ip = ip.loc[ip["patent"].isin(usp) & ~(ip["patent"].isin(dup_pats))]
print(len(ip))
# Patent-assignee pairs
pa = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas()
ip = ip.merge(pa, how = "left", on = "patent")
del(pa, usp)

5017246
4302101


In [4]:
# Get rid of patents with no assignee
print(len(ip))
ip = ip.loc[ip["assignee_id"].notnull()]
print(len(ip))

# Count number of assignees by inventor
i2 = ip[["inventor_id", "assignee_id"]].drop_duplicates("assignee_id").groupby("inventor_id").size()
print(len(i2)) # Num inventors working at firms in total

# Keep inventors at more than 2 different assignees
i2 = i2.loc[i2 >= 2]
print(len(i2))

4302101
3860816
141583
12377


In [5]:
# Patents by inventors who move
print(len(ip))
ip = ip.loc[ip["inventor_id"].isin(i2.index.tolist())]
print(len(ip))
ip.to_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")

3860816
140076


### Magnitude
- 12377/141583 total inventors working at firms have switched firms
- They account for 140076 of 3860816 patents assigned

_____
## Adjusting for moves across firms
### Get rank of number of firms inventor has been at

In [19]:
ip = pd.read_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")
# Sort ascending by inventor_id and grant year
ip = ip.sort_values(["inventor_id", "gyear"], ascending = [1,1])

In [15]:
ir = ip[["inventor_id", "assignee_id"]].drop_duplicates(["inventor_id", "assignee_id"])
ir = ir.groupby("inventor_id")
ir2 = pd.DataFrame()
for n,g in ir:
    g["inv_asg_rank"] = range(len(g))
    ir2 = ir2.append(g, ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [20]:
ip = ip.merge(ir2, how = "left", on = ["inventor_id", "assignee_id"])
ip.head()

Unnamed: 0,patent,inventor_id,location_id,city,state,country,inv_msa,gyear,assignee_id,inv_asg_rank
0,3930283,3930282-1,kkownp0p29c9,Ephrata,PA,US,"Lancaster, PA",1976,6d92f55ba3875e01b5554b98c65c929a,0
1,3930282,3930282-1,kkownp0p29c9,Ephrata,PA,US,"Lancaster, PA",1976,6d92f55ba3875e01b5554b98c65c929a,0
2,4041572,3930282-1,kkownp0p29c9,Ephrata,PA,US,"Lancaster, PA",1977,6d92f55ba3875e01b5554b98c65c929a,0
3,4016624,3930282-1,kkownp0p29c9,Ephrata,PA,US,"Lancaster, PA",1977,6d92f55ba3875e01b5554b98c65c929a,0
4,4102014,3930282-1,kkownp0p29c9,Ephrata,PA,US,"Lancaster, PA",1978,6d92f55ba3875e01b5554b98c65c929a,0


In [21]:
ip.to_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")

### Use only patents from rank 0 or 1

In [43]:
ip = pd.read_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")

In [44]:
print(len(ip))
ip = ip.loc[ip["inv_asg_rank"].isin([0,1])]
print(len(ip))

140076
66790


### Inventor cross-patent similarity pairs

In [45]:
ig = ip[["patent", "inventor_id"]].groupby(["inventor_id"])
inv_pairs = [list(itertools.combinations(g["patent"].tolist(),2)) for n,g in ig]
inv_pairs = [item for sublist in inv_pairs for item in sublist]

# Order them by earlier patent first
inv_pairs = [(i,j) if i<j else (j,i) for i,j in inv_pairs ]

In [46]:
# Convert to dataframe
inv_pairs = pd.DataFrame({"tp": [i[0] for i in inv_pairs],
                         "op": [i[1] for i in inv_pairs]})
print(len(inv_pairs))
inv_pairs = inv_pairs.drop_duplicates()
print(len(inv_pairs))

708757
694573


In [47]:
# Get inventor
pat_asg = dict(zip(ip["patent"], ip["assignee_id"]))
inv_pairs["inventor_id"] = inv_pairs["tp"].map(dict(zip(ip["patent"], ip["inventor_id"])))
# Get assignee for each patent
inv_pairs["tp_asg"] = inv_pairs["tp"].map(pat_asg)
inv_pairs["op_asg"] = inv_pairs["op"].map(pat_asg)
# # Assignee match DON'T DO THIS
# inv_pairs["asg_match"] = (inv_pairs["tp_asg"] == inv_pairs["op_asg"])

In [48]:
# Assignee match
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))
print(len(inv_pairs))
# Check that target and other do not have same assignee
%time asg_match = (set(asgs.get(tp, [])).intersection(asgs.get(op, [])) for tp, op in zip(inv_pairs["tp"], inv_pairs["op"]))
%time asg_match = [len(i) for i in asg_match]

inv_pairs["asg_match"] = [True if i >=1 else False for i in asg_match]
print(len(inv_pairs))

694573
CPU times: user 25.6 ms, sys: 5.1 ms, total: 30.7 ms
Wall time: 29.9 ms
CPU times: user 805 ms, sys: 232 µs, total: 805 ms
Wall time: 784 ms
694573


In [49]:
inv_pairs["asg_match"].value_counts()

True     556338
False    138235
Name: asg_match, dtype: int64

### Did citation occur?

In [50]:
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")
# Use only relevant citations
cit = cit[cit["cited"].isin(inv_pairs["tp"])]
print(len(cit))
cit = cit[cit["citing"].isin(inv_pairs["op"])]
print(len(cit))
cit = cit.compute()

# Citation dictionary
cit = dict(zip(list(zip(cit["cited"], cit["citing"])), [True]*len(cit)))

1700957
91813


In [51]:
inv_pairs["op_cites_tp"] = [cit.get(i, False) for i in zip(inv_pairs["tp"], inv_pairs["op"])]

In [52]:
inv_pairs["op_cites_tp"].value_counts()

False    639532
True      55041
Name: op_cites_tp, dtype: int64

In [53]:
fastparquet.write("DataStore/2018-07-P2/inv_move_pat_pairs_0714.parq", inv_pairs, compression="GZIP")

In [54]:
del(inv_pairs)

### Similarity

In [62]:
import scipy.spatial.distance as distance
dms = ["ldavecs", "docvecs"]
# dms = ["ldavecs"]
res = {}
for dm in dms:
    print("Loading matrix and dict")
    print(datetime.datetime.now())
    
    pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
    pat_dict = dict(zip(pat_dict, range(len(pat_dict))))
    pm = dd.read_parquet("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm)).values.compute()
    
    l3 = fastparquet.ParquetFile("DataStore/2018-07-P2/inv_move_pat_pairs_0714.parq").to_pandas(["tp", "op"])
    
    # Remove missing values
    print(len(l3))
    l3 = l3.loc[l3["tp"].isin(pat_dict.keys()) & l3["op"].isin(pat_dict.keys())]
    print(len(l3))
    if dm == "ldavecs":
        ncols = 60
    else:
        ncols = 100
    print(len(l3))
    tp_pv = pm[[pat_dict[p] for p in l3["tp"].tolist()]]
    op_pv = pm[[pat_dict[p] for p in l3["op"].tolist()]]
    print(len(l3))
    del(pat_dict, pm)

    # Split into chunks
    n_chunks = np.round(len(l3)/3000)
    tp_chunks = np.array_split(tp_pv, n_chunks)
    op_chunks = np.array_split(op_pv, n_chunks)
    del(tp_pv, op_pv)

    print("Getting patent pair similarity")
    print("cosine")
    print(datetime.datetime.now())
    # Cosine
    # Recursively lengthen the array of cosine distances
    cos_dis = np.array([])
    for i,j in zip(tp_chunks, op_chunks):
        cos_dis = np.hstack((cos_dis, np.hstack([np.diag(distance.cdist(i,j, metric = "cosine"))])))
        
    l3["sim_{0}".format(dm)] = 1-cos_dis
    res[dm] = l3
    print("finished")
    print(datetime.datetime.now())
    del(l3)    
        

Loading matrix and dict
2018-07-16 18:10:59.821376
694573
571068
571068
571068
Getting patent pair similarity
cosine
2018-07-16 18:11:09.198377
finished
2018-07-16 18:12:02.016132
Loading matrix and dict
2018-07-16 18:12:02.016190
694573
571068
571068
571068
Getting patent pair similarity
cosine
2018-07-16 18:12:18.771313
finished
2018-07-16 18:13:36.925054


In [63]:
sim = res["ldavecs"].merge(res["docvecs"][["tp", "op", "sim_docvecs"]],
                          how = "left", on = ["tp", "op"])
# Save raw similarity
fastparquet.write("DataStore/2018-07-P2/inv_move_pat_pairs_sim_0714.parq", sim, compression="GZIP")

In [4]:
inv_pairs = fastparquet.ParquetFile("DataStore/2018-07-P2/inv_move_pat_pairs_0714.parq").to_pandas()\
.drop(["sim_ldavecs", "sim_docvecs"],1)
print(len(inv_pairs))
sim = fastparquet.ParquetFile("DataStore/2018-07/inv_move_pat_pairs_sim_0712.parq").to_pandas()
# Merge with original
inv_pairs = inv_pairs.merge(sim, how = "left", on = ["tp", "op"])
print(len(inv_pairs))

694573
694573


In [5]:
inv_pairs.head()

Unnamed: 0,op,tp,inventor_id,tp_asg,op_asg,asg_match,op_cites_tp,sim_ldavecs,sim_docvecs
0,3930283,3930282,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.546727,0.519405
1,4041572,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.378731,0.250033
2,4016624,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.410078,0.497906
3,4102014,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,True,0.678831,0.264625
4,4477942,3930283,3930282-1,6d92f55ba3875e01b5554b98c65c929a,6d92f55ba3875e01b5554b98c65c929a,True,False,0.566891,0.438904


In [66]:
# # Scale
# eps = 0.01
# dv_min = 0.7
# def scale_docvecs(x):
#     scaled = ((x+dv_min)/(1+dv_min))*(1-eps)+eps
#     return scaled
# def scale_ldavecs(x):
#     scaled = x*(1-eps)+eps
#     return scaled

# # Scale measures
# inv_pairs["sim_docvecs"] = inv_pairs["sim_docvecs"].apply(scale_docvecs)
# inv_pairs["sim_ldavecs"] = inv_pairs["sim_ldavecs"].apply(scale_ldavecs)

In [6]:
# Not scaled
fastparquet.write("DataStore/2018-07-P2/inv_move_pat_pairs_0714.parq", inv_pairs, compression="GZIP")