In [33]:
# Notebook-wide imports and setup.
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
# Switch to the project root so the relative "RawData/..." and "DataStore/..."
# paths used by the cells below resolve.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
# Seed value for the notebook's random sampling.
# NOTE(review): confirm every sampling step uses it rather than a literal.
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import collections
# NOTE(review): `dask` is already imported above; this repeat is harmless.
import dask
import dask.dataframe as dd
import dask.array as da

### Create sample of patents with common NPC

In [51]:
%time oc = fastparquet.ParquetFile("RawData/Cleaned/otherreference1016.parq").to_pandas()

# Only use available patents
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent"])
dup_pats = pd.read_pickle("RawData/Cleaned/duplicate_pattext_0712.pkl").tolist()
# Get relevant US Patents
pdf = pdf.loc[~pdf["patent"].isin(dup_pats)]

oc = oc.loc[oc["patent_id"].isin(pdf["patent"])]
del(pdf)
print(len(oc["ref_id"].unique()))

# Only select refernces with at least 3 patent references
oc_m = oc["ref_id"].value_counts()
oc_m = oc_m[oc_m >= 3]
# About 2.1 million: sample 33%
oc_m = oc_m.sample(frac=0.33, random_state=3)
print(len(oc_m))

oc = oc.loc[oc["ref_id"].isin(oc_m.index.tolist())]
print(len(oc))

# Convert to ref: patent list dictionary
%time oc = {n:g["patent_id"].tolist() for n,g in oc.groupby("ref_id")}

CPU times: user 31.1 s, sys: 7.42 s, total: 38.5 s
Wall time: 40.6 s
8911717
288369
1826523
CPU times: user 1min 43s, sys: 187 ms, total: 1min 43s
Wall time: 1min 46s


In [52]:
%%time
# All possible combinations of patent pairs sharing one common outside reference
%time oc_l = [len(list(itertools.combinations(v, r=2))) for k,v in oc.items()]
print(np.sum(oc_l))

# Get all pairs and sample
oc_l = (itertools.combinations(v, r=2) for k,v in oc.items())
del(oc)
# Get all pairs
oc_l = [item for sublist in oc_l for item in sublist]

# Sample 30%
oc_l_ind = np.random.choice(np.shape(oc_l)[0], size=int(np.round(np.shape(oc_l)[0]*0.7)))
oc_l = np.array(oc_l)[oc_l_ind]
print(len(oc_l))

CPU times: user 1.24 s, sys: 394 µs, total: 1.24 s
Wall time: 1.25 s
15037492
10526244
CPU times: user 26.6 s, sys: 954 ms, total: 27.5 s
Wall time: 28.4 s


In [53]:
# Tabulate the sampled pairs: first element of each pair -> "tp", second -> "op".
oc_l = pd.DataFrame(
    {
        "tp": [pair[0] for pair in oc_l],
        "op": [pair[1] for pair in oc_l],
    }
)
print(len(oc_l))

# Collapse exact (tp, op) repeats, e.g. pairs that share more than one reference.
# NOTE(review): (a, b) and (b, a) are treated as different rows — confirm that
# pair order is consistent upstream.
oc_l = oc_l.drop_duplicates()
print(len(oc_l))

# Checkpoint the pair list.
fastparquet.write(
    "DataStore/2018-10/mutual_npc_1027.parq",
    oc_l,
    compression="GZIP",
)

10526244
2382230


In [54]:
import scipy.spatial.distance as distance

def grouper(n, iterable):
    """Yield successive lists of up to ``n`` items taken from ``iterable``.

    The final list may be shorter than ``n``; iteration stops as soon as an
    empty list would be produced.

    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    # Obtain the iterator eagerly so a non-iterable argument fails at call time.
    source = iter(iterable)

    def _batches():
        while True:
            batch = list(itertools.islice(source, n))
            if batch == []:
                return
            yield batch

    return _batches()

dms = ["ldavecs", "docvecs"]

l2 = oc_l
print(len(l2))
for dm in dms:
    print("Loading matrix and dict")
    print(datetime.datetime.now())

    pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
    pat_dict = dict(zip(pat_dict, range(len(pat_dict))))
    # Store as dask array
    if dm == "ldavecs":
        ncols = 60
    else:
        ncols = 100
    dmf = "DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm)
    %time pm = fastparquet.ParquetFile(dmf).to_pandas().values
#     %time pm = da.from_array(pm, chunks=(10000,ncols))

    print("Getting row values")
    print(datetime.datetime.now())
    #----------#
    # 2. Get pat vecs & pat similarity
    print(len(l2))
    # Remove missing values
    l3 = l2[["tp", "op"]].loc[l2["tp"].isin(pat_dict.keys()) & l2["op"].isin(pat_dict.keys())]

    if dm == "ldavecs":
        ncols = 60
    else:
        ncols = 100
    print(len(l3))

    print("Getting chunks")
    print(datetime.datetime.now())
    # Split into chunks
    n_rows = 3000
    n_chunks = int(np.round(len(l3)/n_rows))
    tp_chunks = grouper(n_rows, pm[[pat_dict[p] for p in l3["tp"].tolist()]])
    op_chunks = grouper(n_rows, pm[[pat_dict[p] for p in l3["op"].tolist()]])
    chunks = itertools.zip_longest(tp_chunks, op_chunks)

    print("Getting patent pair similarity")
    print("cosine")
    print(datetime.datetime.now())
    # Cosine

    cos_dis = np.empty(len(l3))

    for r, c in enumerate(chunks):
        cos_dis[r*n_rows:r*n_rows+n_rows] = np.diag(distance.cdist(c[0],c[1], metric = "cosine"))

    l3["sim_{0}".format(dm)] = 1-cos_dis
    del(cos_dis)

    l2 = l2.merge(l3[["tp", "op", "sim_{0}".format(dm)]], how = "left", on = ["tp", "op"])
    print(len(l2))           
    print("Finished {0}".format(dm))
    print(datetime.datetime.now())
l2 = l2.drop_duplicates()
print(len(l2))    

2382230
Loading matrix and dict
2018-10-27 18:22:17.045083
CPU times: user 28.7 s, sys: 3.39 s, total: 32.1 s
Wall time: 11.2 s
Getting row values
2018-10-27 18:22:30.613511
2382230
1464315
Getting chunks
2018-10-27 18:22:34.394811
Getting patent pair similarity
cosine
2018-10-27 18:22:51.781929
2382230
Finished ldavecs
2018-10-27 18:26:32.724859
Loading matrix and dict
2018-10-27 18:26:32.724963
CPU times: user 49.2 s, sys: 1.74 s, total: 51 s
Wall time: 16.1 s
Getting row values
2018-10-27 18:26:50.693223
2382230
1464315
Getting chunks
2018-10-27 18:26:53.265869
Getting patent pair similarity
cosine
2018-10-27 18:27:20.130966
2382230
Finished docvecs
2018-10-27 18:32:29.346892
2382230


In [58]:
# Checkpoint: persist the pair table together with its similarity columns.
#
# Disabled filter, kept for reference. It would drop the raw pair list and keep
# only pairs that have a docvecs similarity.
# NOTE(review): the row count recorded in the following cell's output (1464315)
# equals the number of pairs that had embeddings, which suggests this filter was
# active when the notebook was last run — confirm whether it should be re-enabled.
# del(oc_l)
# l2 = l2.loc[l2["sim_docvecs"].notnull()]
# print(len(l2))
fastparquet.write(
    "DataStore/2018-10/mutual_npc_1027.parq",
    l2,
    compression="GZIP",
)

In [61]:
# inv = fastparquet.ParquetFile("RawData/Cleaned/patent_inventors_0628.parq").to_pandas(["patent", "inventor_id"])
# %time inv = {n: g["inventor_id"].tolist() for n, g in inv.groupby("patent")}

mdc = l2
print(len(mdc))
%time num_common_inv = [len(set(inv[tp]).intersection(inv[op])) if (tp in inv.keys()) & (op in inv.keys())\
                  else np.nan for tp, op in zip(mdc["tp"], mdc["op"])]
del(inv)
mdc["num_common_pat_inv"] = num_common_inv
del(num_common_inv)
mdc["common_pat_inv"] = np.nan
mdc.loc[mdc["num_common_pat_inv"] >= 1, "common_pat_inv"] = True
mdc.loc[mdc["num_common_pat_inv"] == 0, "common_pat_inv"] = False
mdc = mdc.drop("num_common_pat_inv",1)
print(len(mdc))
print(mdc["common_pat_inv"].value_counts())

1464315
CPU times: user 7.38 s, sys: 0 ns, total: 7.38 s
Wall time: 8.11 s
1464315
False    955553
True     508762
Name: common_pat_inv, dtype: int64


In [67]:
# asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))
# cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")

# # Direct citations
# # Create all False
# mdc["direct_cite"] = False
# # Create zipped pairs
# mdc["tp_op"] = list(zip(mdc["tp"], mdc["op"]))
# # Citations
# c2 = cit[cit["cited"].isin(mdc["tp"])].compute()
# mdc.loc[mdc["tp_op"].isin(list(zip(c2["cited"], c2["citing"]))), "direct_cite"] = True
# del(c2)
# print(mdc["direct_cite"].value_counts())
# # Drop pairs
# mdc = mdc.drop("tp_op",1)

# # Number of common citations
# try:
#     mdc = mdc.drop(["num_common_cites", "common_cites_match"],1)
# except Exception:
#     pass

%time c2 = cit[cit["citing"].isin(mdc["tp"]) | cit["citing"].isin(mdc["op"])].compute()

# Remove self-citations
%time asg_match = (set(asgs.get(cited, [])).intersection(asgs.get(citing, [])) for cited, citing \
                   in zip(c2["cited"], c2["citing"]))
%time asg_match = [len(i) for i in asg_match]
c2["asg_match"] = asg_match
c2 = c2.loc[c2["asg_match"] == 0]
c2 = c2[["citing", "cited"]]
print("c2 length")
print(len(c2))

# Number of common citations after self-citation removal
# Dictionary of citing: cited patents
%time c2 = {n:g["cited"].tolist() for n,g in c2.groupby("citing")}

# Total number of cites
mdc["tp_num_cited"] = [len(c2.get(p, [])) for p in mdc["tp"]]
mdc["op_num_cited"] = [len(c2.get(p, [])) for p in mdc["op"]]

# Get number of overlapping
%time num_common_cites = (set(c2.get(tp, [])).intersection(set(c2.get(op, []))) for tp, op in zip(mdc["tp"], mdc["op"]))
%time mdc["num_common_cited"] = [len(i) for i in num_common_cites]
del(c2)

# Pct of common cites
mdc["tp_pct_common_cited"] = mdc["num_common_cited"]/mdc["tp_num_cited"]
# At least one number of common cites
mdc["common_cited_match"] = False
mdc.loc[(mdc["num_common_cited"] >= 1), "common_cited_match"] = True

CPU times: user 3min 49s, sys: 46.1 s, total: 4min 35s
Wall time: 1min 48s
CPU times: user 1.03 s, sys: 673 ms, total: 1.7 s
Wall time: 1.6 s
CPU times: user 37.5 s, sys: 217 ms, total: 37.7 s
Wall time: 37.5 s
c2 length
8673948
CPU times: user 36.4 s, sys: 1.87 s, total: 38.3 s
Wall time: 37 s
CPU times: user 189 ms, sys: 234 ms, total: 423 ms
Wall time: 108 ms
CPU times: user 32.5 s, sys: 157 ms, total: 32.6 s
Wall time: 31.8 s


In [None]:
# Checkpoint: persist the pair table, overwriting the earlier checkpoint at the
# same path.
fastparquet.write(
    "DataStore/2018-10/mutual_npc_1027.parq",
    mdc,
    compression="GZIP",
)

In [70]:
# Add other variables: attach grant year, inventor MSA, NAICS name and primary
# class to both sides of every pair.

# Only use available patents: the US patent-location table minus the patents
# listed in the duplicate-text pickle.
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "gyear", "inv_msa", "naics_name", "primclass"])
dup_pats = pd.read_pickle("RawData/Cleaned/duplicate_pattext_0712.pkl").tolist()
pdf = pdf.loc[~pdf["patent"].isin(dup_pats)]

# Left-join the attributes once per side, prefixing the columns with "tp_" / "op_".
# The join key copied from `pdf` is dropped by keyword: the positional `axis`
# argument of DataFrame.drop is not accepted by pandas >= 2.0.
m2 = mdc.merge(pdf.add_prefix("tp_"), how="left", left_on="tp", right_on="tp_patent").drop(columns="tp_patent")
m2 = m2.merge(pdf.add_prefix("op_"), how="left", left_on="op", right_on="op_patent").drop(columns="op_patent")

# Same-MSA and same-primary-class indicators.
# NOTE(review): a missing value on either side compares as not equal, so such
# pairs are recorded as False rather than as unknown.
m2["inv_msa_match"] = (m2["tp_inv_msa"] == m2["op_inv_msa"])
m2["primclass_match"] = (m2["tp_primclass"] == m2["op_primclass"])

In [72]:
# Make the enriched table the current `mdc` and persist it, overwriting the
# earlier checkpoint at the same path.
mdc = m2
fastparquet.write(
    "DataStore/2018-10/mutual_npc_1027.parq",
    mdc,
    compression="GZIP",
)