In [3]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import dask
import dask.dataframe as dd
import dask.array as da
# Percentiles
from scipy.stats import percentileofscore
from IPython.display import display

  from ._conv import register_converters as _register_converters


## Inventor mobility effects on citations at new location

1.1 Find new location citations to inventor's old patents and inventor's new patents
- Show new patents have greater rates of citation compared to old patents

1.2 Show rate of citation at new location increases, but doesn't translate to more similar inventions
1. Find new patent's citations in new location
2. Find new patents' similarity to new citations
3. Find new patents' similarity to old patents of same assignee who don't cite new patent

### Find inventors who moved and their patents

In [95]:
yv = "appyear"
# All inventors who have moved
ip = pd.read_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")
print(len(ip))

pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "inv_msa", "gyear", "appyear"])

# Add application year
ip[yv] = ip["patent"].map(dict(zip(pdf["patent"], pdf[yv])))

# Sort by inventor, grant year
ip = ip.sort_values(["inventor_id", yv])

# Only look at inventors' first and second cities
ip = ip.loc[(ip["inv_asg_rank"] <= 1)]

# Inventors' second cities
sc = ip.loc[(ip["inv_asg_rank"] == 1), ["inventor_id", "inv_msa", yv]].drop_duplicates(["inventor_id", "inv_msa"])

# Inventors' second city compared to first
ip["sec_inv_msa"] = ip["inventor_id"].map(dict(zip(sc["inventor_id"], sc["inv_msa"])))

# Second city's first grant year
ip["sec_fyear"] = ip["inventor_id"].map(dict(zip(sc["inventor_id"], sc[yv])))

# Get rid of the inventors whose second MSA matches the first
ip = ip.loc[~(ip["inv_msa"] == ip["sec_inv_msa"])]
print(len(ip))

# Get rid inventors with any missing cities
missing_cities = ip.loc[(ip["inv_msa"].isnull() | ip["sec_inv_msa"].isnull()), "inventor_id"].tolist()
ip = ip.loc[~ip["inventor_id"].isin(missing_cities)]
print(len(ip))

140076
12846
10826


### Citations to each of mobile inventors' patents

In [96]:
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")

%time c2 = cit[cit["cited"].isin(ip["patent"])].compute()

# Remove self-citations
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))

%time asg_match = (set(asgs.get(cited, [])).intersection(asgs.get(citing, [])) for cited, citing \
                   in zip(c2["cited"], c2["citing"]))
%time asg_match = [len(i) for i in asg_match]
c2["asg_match"] = asg_match
c2 = c2.loc[c2["asg_match"] == 0]
c2 = c2[["citing", "cited"]]

# Add assignees
c2["cited_asg"] = c2["cited"].map(asgs)
c2["citing_asg"] = c2["citing"].map(asgs)
del(asgs)

print(len(c2))

CPU times: user 2min 14s, sys: 34.7 s, total: 2min 49s
Wall time: 1min 25s
CPU times: user 50 ms, sys: 0 ns, total: 50 ms
Wall time: 47.2 ms
CPU times: user 1.72 s, sys: 0 ns, total: 1.72 s
Wall time: 1.62 s
331597


#### Similarities for citation pairs

In [97]:
def grouper(n, iterable):
    """
    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    iterable = iter(iterable)
    return iter(lambda: list(itertools.islice(iterable, n)), [])


import scipy.spatial.distance as distance
dms = ["ldavecs", "docvecs"]

print("Getting row values")
print(datetime.datetime.now())
pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
pat_dict = dict(zip(pat_dict, range(len(pat_dict))))

l2 = pd.DataFrame({"tp": c2["cited"], "op": c2["citing"]})

for dm in dms:
    print((dm,"started"))
    print("Loading matrix and dict")
    print(datetime.datetime.now())
    # Store copy as array
    l3 = l2.loc[l2["tp"].isin(pat_dict.keys()) & l2["op"].isin(pat_dict.keys()), ["tp", "op"]].copy()

    if dm == "ldavecs":
        ncols = 60
    else:
        ncols = 100

    pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm))\
.to_pandas().values

    # Convert to chunks
    print("Getting chunks")
    print(datetime.datetime.now())
    # Split into chunks
    n_rows = 3000
    n_chunks = int(np.round(len(l3)/n_rows))
    tp_chunks = grouper(n_rows, pm[[pat_dict[p[1]] for p in l3["tp"].iteritems()]])
    op_chunks = grouper(n_rows, pm[[pat_dict[p[1]] for p in l3["op"].iteritems()]])
    del(pm)
    chunks = itertools.zip_longest(tp_chunks, op_chunks)

    print("Getting patent pair cosine similarity")
    print(datetime.datetime.now())
    # Cosine

    cos_dis = np.empty(len(l3))

    for r, c in enumerate(chunks):
        cos_dis[r*n_rows:r*n_rows+n_rows] = np.diag(distance.cdist(c[0],c[1], metric = "cosine"))

    l3["sim_{0}".format(dm)] = 1-cos_dis
    
    # Rename columns
    l3 = l3.rename(columns={"tp": "cited", "op": "citing"})
    c2 = c2.merge(l3, how = "left", on = ["cited", "citing"])
    del(l3)
    print("finished")
    print(datetime.datetime.now())

Getting row values
2018-09-12 17:18:15.702909
('ldavecs', 'started')
Loading matrix and dict
2018-09-12 17:18:18.019595
Getting chunks
2018-09-12 17:18:27.765365
Getting patent pair cosine similarity
2018-09-12 17:18:28.850094
finished
2018-09-12 17:18:48.055166
('docvecs', 'started')
Loading matrix and dict
2018-09-12 17:18:48.055292
Getting chunks
2018-09-12 17:19:06.406469
Getting patent pair cosine similarity
2018-09-12 17:19:08.236596
finished
2018-09-12 17:19:34.887668


In [98]:
# Get location for each patent
# pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
# .to_pandas(["patent", "inv_msa", "gyear", "appyear"])

# Get MSA of cited patent
c2["cited_inv_msa"] = c2["cited"].map(dict(zip(pdf["patent"], pdf["inv_msa"])))
c2["citing_inv_msa"] = c2["citing"].map(dict(zip(pdf["patent"], pdf["inv_msa"])))

# Get gyear of cited patent
c2["cited_"+yv] = c2["cited"].map(dict(zip(pdf["patent"], pdf[yv])))
c2["citing_"+yv] = c2["citing"].map(dict(zip(pdf["patent"], pdf[yv])))
del(pdf)

# Get second cities for each patent
c2["sec_inv_msa"] = c2["cited"].map(dict(zip(ip["patent"], ip["sec_inv_msa"])))

# Get second cities first grant year for each patent
c2["sec_fyear"] = c2["cited"].map(dict(zip(ip["patent"], ip["sec_fyear"])))

# Matching citing patent MSA to second MSA
c2["sec_inv_msa_match"] = (c2["citing_inv_msa"] == c2["sec_inv_msa"])

# Match rate to second MSA
# Before move
prior = c2.loc[(c2["citing_"+yv] < c2["sec_fyear"]), ["cited", "sec_inv_msa_match"]].groupby("cited").mean()
# After move
post = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]), ["cited", "sec_inv_msa_match"]].groupby("cited").mean()

# Average similarity before and after move
for c in ["sim_docvecs", "sim_ldavecs"]:
    # Average of second MSA citations similarity to cited patent, before and after move
    c3 = c2.loc[(c2["citing_"+yv] < c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True),
                ["cited", c]].groupby("cited").mean()
    c4 = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True),
                ["cited", c]].groupby("cited").mean()
    prior = pd.concat([prior, c3], axis=1)
    post = pd.concat([post, c4], axis=1)

In [99]:
# Get match rate to second MSA for each patent
for c in ["sec_inv_msa_match", "sim_docvecs", "sim_ldavecs"]:
    ip["{0}_prior".format(c)] = ip["patent"].map(prior[c])
    ip["{0}_post".format(c)] = ip["patent"].map(post[c])
print(len(ip))
# Get rid of the inventors whose second MSA matches the first
ip = ip.loc[~(ip["inv_msa"] == ip["sec_inv_msa"])]
print(len(ip))

10826
10826


In [100]:
ip[["inv_asg_rank", "sec_inv_msa_match_prior", "sec_inv_msa_match_post",
   "sim_docvecs_prior", "sim_docvecs_post"]].groupby("inv_asg_rank").mean()

Unnamed: 0_level_0,sec_inv_msa_match_prior,sec_inv_msa_match_post,sim_docvecs_prior,sim_docvecs_post
inv_asg_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.053176,0.088089,0.311498,0.311007
1,0.083393,0.070523,0.317783,0.299359


In [101]:
c2.to_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
ip.to_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")

### Compare new assignees at new location's similarity to assignees who already cite prior patent

In [4]:
yv = "appyear"
c2 = pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
ip = pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")

# Use unique assignees
c2 = c2.drop(["cited_asg", "citing_asg"],1)
asgs = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas(["patent", "assignee_id"])
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "primclass", "appyear"])
pdf = pdf.merge(asgs, how = "left", on = "patent")

c2 = c2.merge(asgs, how="left", left_on="citing", right_on="patent").rename(columns={"assignee_id": "citing_asg"}).drop("patent",1)
c2 = c2.merge(asgs, how="left", left_on="cited", right_on="patent").rename(columns={"assignee_id": "cited_asg"}).drop("patent",1)
del(asgs)

# New firms that cite prior patent post move
a1 = c2.loc[(c2["citing_appyear"] < c2["sec_fyear"]), "citing_asg"].tolist()
a2 = c2.loc[(c2["citing_appyear"] >= c2["sec_fyear"]), "citing_asg"].tolist()
new_cite_asgs = list(set(a2).difference(set(a1)))
prev_cite_asgs = list(set(a2).intersection(set(a1)))

In [5]:
csim = {}
csim["prev_prior"] = c2.loc[(c2["citing_"+yv] < c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True)\
            & c2["citing_asg"].isin(prev_cite_asgs),
            ["cited", "sim_docvecs", "sim_ldavecs"]].groupby("cited").mean().add_prefix("prev_prior_").reset_index()
csim["prev_post"] = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True)\
            & c2["citing_asg"].isin(prev_cite_asgs),
            ["cited", "sim_docvecs", "sim_ldavecs"]].groupby("cited").mean().add_prefix("prev_post_").reset_index()
csim["new_post"] = c2.loc[(c2["citing_"+yv] >= c2["sec_fyear"]) & (c2["sec_inv_msa_match"] == True)\
            & c2["citing_asg"].isin(new_cite_asgs),
            ["cited", "sim_docvecs", "sim_ldavecs"]].groupby("cited").mean().add_prefix("new_post_").reset_index()

In [6]:
# Get match rate to second MSA for each patent
for k,v in csim.items():
    ip = ip.merge(v, how="left", left_on="patent", right_on="cited").drop("cited",1)

In [8]:
ip[['sec_inv_msa_match_prior',
       'sec_inv_msa_match_post', 'sim_docvecs_prior', 'sim_docvecs_post',
       'sim_ldavecs_prior', 'sim_ldavecs_post', 'prev_prior_sim_docvecs',
       'prev_prior_sim_ldavecs', 'prev_post_sim_docvecs', 'prev_post_sim_ldavecs',
       'new_post_sim_docvecs', 'new_post_sim_ldavecs']].mean()

sec_inv_msa_match_prior    0.057886
sec_inv_msa_match_post     0.084252
sim_docvecs_prior          0.312521
sim_docvecs_post           0.308676
sim_ldavecs_prior          0.550057
sim_ldavecs_post           0.527295
prev_prior_sim_docvecs     0.312827
prev_prior_sim_ldavecs     0.556394
prev_post_sim_docvecs      0.291704
prev_post_sim_ldavecs      0.526657
new_post_sim_docvecs       0.316823
new_post_sim_ldavecs       0.524154
dtype: float64