In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import dask
import dask.dataframe as dd
import dask.array as da
# Percentiles
from scipy.stats import percentileofscore
from IPython.display import display

  from ._conv import register_converters as _register_converters


For sample generation, see: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/StrategicNonCitations/Previous/1d-InventorMobilityNewCitations-0911.ipynb

See previous notebook for similarity to cited patent:
https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/StrategicNonCitations/4a-InventorMobilityNewCites-0918.ipynb

In [3]:
yv = "appyear"
# Load the citation table (c2) and the inventor-patent table (ip).
# NOTE(review): read_pickle can execute arbitrary code -- only load pickles produced by this project.
c2 = pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
ip = pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")

# Use unique assignees: discard the assignee columns stored in the pickle and
# re-attach them below from the unique-assignee table.
# (`columns=` keyword: the positional axis argument was removed in pandas 2.0.)
c2 = c2.drop(columns=["cited_asg", "citing_asg"])
asgs = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas(["patent", "assignee_id"])
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "primclass", "appyear"])
pdf = pdf.merge(asgs, how = "left", on = "patent")

# Assignee of the citing patent, then of the cited patent
c2 = c2.merge(asgs, how="left", left_on="citing", right_on="patent")\
.rename(columns={"assignee_id": "citing_asg"}).drop(columns="patent")
c2 = c2.merge(asgs, how="left", left_on="cited", right_on="patent")\
.rename(columns={"assignee_id": "cited_asg"}).drop(columns="patent")
del asgs

# New firms that cite prior patent post move:
# a1 = assignees of citing patents applied for before sec_fyear,
# a2 = assignees of citing patents applied for in or after sec_fyear,
# new_cite_asgs = assignees that appear in a2 but never in a1.
# NOTE(review): citing patents with no assignee match carry NaN here; confirm whether
# NaN should be excluded from these sets.
a1 = c2.loc[(c2["citing_appyear"] < c2["sec_fyear"]), "citing_asg"].tolist()
a2 = c2.loc[(c2["citing_appyear"] >= c2["sec_fyear"]), "citing_asg"].tolist()
new_cite_asgs = list(set(a2).difference(set(a1)))

c3 = pd.read_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0918.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
# Patents by newly citing assignees
pdf = pdf.loc[pdf["assignee_id"].isin(new_cite_asgs)]

# Sort by assignee, primclass, app year (most recent first), so the patent
# lists stored below keep that order.
pdf = pdf.sort_values(["assignee_id", "primclass", yv], ascending=[True, True, False])

# Candidate patents in the six application years yr-5 .. yr:
#   cdict[(assignee_id, primclass, yr)] -> patents of that assignee in that class
#   adict[(assignee_id, yr)]            -> all patents of that assignee
cdict = {}
adict = {}
for yr in range(1975, 2016):
    print(yr)
    print(datetime.datetime.now())
    # Filter the year window once and reuse it for both groupings
    window = pdf.loc[pdf["appyear"].isin(range(yr-5, yr+1)),
                     ["assignee_id", "primclass", "patent"]]

    # Patent by assignee, primclass
    for (asg, pc), pats in window.groupby(["assignee_id", "primclass"])["patent"]:
        # Skip empty groups so that a later .get(key, []) falls back to [] rather than None
        if len(pats) >= 1:
            cdict[(asg, pc, yr)] = pats.tolist()

    # Patent by assignee.  Group on the scalar column name: grouping on a
    # one-element list makes pandas >= 2.0 yield 1-tuples as group names, which
    # would turn the key into ((asg,), yr) and break every adict lookup.
    for asg, pats in window.groupby("assignee_id")["patent"]:
        if len(pats) >= 1:
            adict[(asg, yr)] = pats.tolist()
    del window

1975
2018-09-27 12:02:22.616578
1976
2018-09-27 12:02:23.769388
1977
2018-09-27 12:02:24.669426
1978
2018-09-27 12:02:26.075173
1979
2018-09-27 12:02:27.812179
1980
2018-09-27 12:02:29.702357
1981
2018-09-27 12:02:31.276082
1982
2018-09-27 12:02:32.916224
1983
2018-09-27 12:02:34.581170
1984
2018-09-27 12:02:36.883197
1985
2018-09-27 12:02:39.371705
1986
2018-09-27 12:02:41.593707
1987
2018-09-27 12:02:43.628441
1988
2018-09-27 12:02:45.751639
1989
2018-09-27 12:02:48.046682
1990
2018-09-27 12:02:50.923579
1991
2018-09-27 12:02:54.131999
1992
2018-09-27 12:02:57.090262
1993
2018-09-27 12:03:01.017521
1994
2018-09-27 12:03:05.228901
1995
2018-09-27 12:03:10.064504
1996
2018-09-27 12:03:14.296597
1997
2018-09-27 12:03:19.123585
1998
2018-09-27 12:03:24.965483
1999
2018-09-27 12:03:30.686432
2000
2018-09-27 12:03:38.721005
2001
2018-09-27 12:03:47.812166
2002
2018-09-27 12:03:57.910829
2003
2018-09-27 12:04:06.070504
2004
2018-09-27 12:04:15.736518
2005
2018-09-27 12:04:26.182237
2006
201

In [45]:
# Map every cited patent to the list of patents citing it.
# The name c2 ends up bound to this dict instead of the citation DataFrame.
c2 = {
    cited: citing.tolist()
    for cited, citing in pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")[["cited", "citing"]]
    .groupby("cited")["citing"]
}

In [48]:
# Get all firm's patents
%time a = [adict.get((asg, fyr), []) for asg,fyr in zip(c3["citing_asg"], c3["citing_appyear"])]
c3["asg_pats"] = a

# Get list of potential control candidates
%time c = [cdict.get((asg, pc, fyr), []) for asg,pc,fyr in zip(c3["citing_asg"], c3["citing_primclass"],\
                                                     c3["citing_appyear"])]
# Add to dataframe
c3["asg_pc_pats"] = c

CPU times: user 61.2 ms, sys: 2.41 ms, total: 63.6 ms
Wall time: 56.5 ms
CPU times: user 78.5 ms, sys: 195 µs, total: 78.7 ms
Wall time: 74.3 ms


In [49]:
# Persist c3 with the asg_pats / asg_pc_pats list columns added above.
# The same path is written again further down, after the mean similarities are merged in.
c3.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0927.pkl")

### Create similarity pair sample

#### First, find how long each pairwise list would be for (i) all of the firm's patents in the previous 5 years; (ii) all of the firm's patents in the citing patent's primary class over the same window

In [17]:
# Size of the full pairwise lists.  A list of k patents crossed with itself has
# k*k ordered pairs, so the totals are summed directly instead of materialising
# hundreds of millions of tuples with itertools.product just to count them.
n_asg_pairs = sum(len(l) ** 2 for l in c3["asg_pats"])
print(n_asg_pairs)

n_asg_pc_pairs = sum(len(l) ** 2 for l in c3["asg_pc_pats"])
print(n_asg_pc_pairs)

209681377
27788655


In [18]:
# Nothing to free here: `apats` / `ppats` do not exist at this point, so a
# `del` would raise NameError under Restart & Run All.

#### Create pairs for the citing patent and control patent, each crossed with all of the firm's patents in the previous five years

In [38]:
# Build the long pair table c4: each citing patent and each control patent ("tp")
# crossed with every patent in the same row's asg_pats list ("op").
# This construction used to be commented out, which left c4 undefined on a fresh
# kernel.  Frames are collected and concatenated once (DataFrame.append was
# removed in pandas 2.0 and copies the whole frame on every call).
pair_frames = []
for col in ["citing", "citing_control_asg_pc"]:
    pairs = [(tp, op) for tp, ops in zip(c3[col], c3["asg_pats"]) for op in ops]
    frame = pd.DataFrame(pairs, columns=["tp", "op"])
    frame["type"] = col
    pair_frames.append(frame)
c4 = pd.concat(pair_frames, ignore_index=True)
del pair_frames, pairs, frame
print(len(c4))

# Delete duplicates and self similarity
c4 = c4.loc[~(c4["tp"] == c4["op"])]
c4 = c4.drop_duplicates()

# Add original citing: a citing patent maps to itself, a control patent maps to
# the citing patent it was matched to.
# NOTE(review): a control patent matched to several citing patents keeps only the
# last one in this dict -- confirm that is acceptable.
co = dict(zip(c3["citing"], c3["citing"]))
co.update(dict(zip(c3["citing_control_asg_pc"], c3["citing"])))
c4["citing"] = c4["tp"].map(co)

In [29]:
def grouper(n, iterable):
    """Split `iterable` into consecutive lists of at most `n` items.

    Returns an iterator over the lists; the final list may be shorter, and
    nothing is produced for an empty input.

    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    source = iter(iterable)

    def _chunks():
        while True:
            chunk = list(itertools.islice(source, n))
            # An empty slice means the source is exhausted
            if len(chunk) == 0:
                return
            yield chunk

    return _chunks()


import scipy.spatial.distance as distance  # no longer used in this cell; kept so the alias stays defined
dms = ["ldavecs", "docvecs"]
n_rows = 3000  # pairs per chunk; bounds the size of the temporary embedding slices

print("Getting row values")
print(datetime.datetime.now())
# Patent number -> position in the patent list.
# NOTE(review): assumes every embedding matrix loaded below has its rows in this
# same patent order -- confirm against the job that wrote the *_pats_0712 files.
pat_dict = {
    patent: row
    for row, patent in enumerate(
        fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
    )
}

# Work on a copy of the pair table.  Similarity columns left by an earlier run are
# dropped first so the merge below cannot create suffixed duplicates (sim_*_x / sim_*_y).
l2 = c4.drop(columns=["sim_{0}".format(dm) for dm in dms], errors="ignore")

for dm in dms:
    print((dm,"started"))
    print("Loading matrix and dict")
    print(datetime.datetime.now())

    pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm))\
    .to_pandas().values

    # Unique, complete pairs whose two patents both appear in pat_dict.
    # .copy() so the similarity column is added to an independent frame, not a slice.
    l3 = l2[["tp", "op"]].dropna(how="any").drop_duplicates()
    l3 = l3.loc[l3["tp"].isin(pat_dict.keys()) & l3["op"].isin(pat_dict.keys())].copy()
    print(len(l3))

    # Row positions of both sides of every pair
    print("Getting chunks")
    print(datetime.datetime.now())
    tp_rows = np.fromiter((pat_dict[p] for p in l3["tp"]), dtype=np.intp, count=len(l3))
    op_rows = np.fromiter((pat_dict[p] for p in l3["op"]), dtype=np.intp, count=len(l3))

    print("Getting patent pair cosine similarity")
    print(datetime.datetime.now())
    # Row-wise cosine similarity u.v / (|u||v|), computed chunk by chunk.  This is
    # the diagonal of the chunk's pairwise matrix without building the
    # n_rows x n_rows matrix that cdist would produce.
    cos_sim = np.empty(len(l3))
    for start in range(0, len(l3), n_rows):
        stop = start + n_rows
        u = np.asarray(pm[tp_rows[start:stop]], dtype=np.float64)
        v = np.asarray(pm[op_rows[start:stop]], dtype=np.float64)
        # A zero-length vector gives 0/0 -> NaN; silence the warning and keep the NaN
        with np.errstate(divide="ignore", invalid="ignore"):
            cos_sim[start:stop] = np.einsum("ij,ij->i", u, v) / (
                np.linalg.norm(u, axis=1) * np.linalg.norm(v, axis=1))

    l3["sim_{0}".format(dm)] = cos_sim

    # Attach the similarity to every row of the pair table (row count is unchanged
    # because l3 is unique on tp/op)
    l2 = l2.merge(l3, how = "left", on = ["tp", "op"])
    print(len(l2))
    del l3, pm
    print("finished")
    print(datetime.datetime.now())

Getting row values
2018-09-27 12:50:26.593922
('ldavecs', 'started')
Loading matrix and dict
2018-09-27 12:50:27.941563
634330
Getting chunks
2018-09-27 12:50:41.917328
Getting patent pair cosine similarity
2018-09-27 12:50:44.813701
1741394
finished
2018-09-27 12:51:57.410473
('docvecs', 'started')
Loading matrix and dict
2018-09-27 12:51:57.415277
634330
Getting chunks
2018-09-27 12:52:16.414043
Getting patent pair cosine similarity
2018-09-27 12:52:23.671629
1741394
finished
2018-09-27 12:54:13.375081


In [31]:
# Take the pair table with similarities attached, dropping self-pairs
# (tp equal to op) and exact duplicate rows.
c4 = l2.loc[l2["tp"] != l2["op"]].drop_duplicates()

In [54]:
# Primary class of the original citing patent and of the other ("op") patent,
# plus a flag for whether the two classes coincide.
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "primclass"])
# From here on pdf is a plain patent -> primclass dict, not a DataFrame
pdf = dict(zip(pdf["patent"], pdf["primclass"]))
for side in ("citing", "op"):
    c4["{0}_primclass".format(side)] = c4[side].map(pdf)
c4["citing_primclass_match"] = c4["citing_primclass"].eq(c4["op_primclass"])
print(len(c4))
c4.to_pickle("DataStore/2018-08/inv_mobility_cite_sim_0927.pkl")

In [58]:
def _mean_sims(rows, prefix):
    """Average both similarity measures per original citing patent.

    rows   -- boolean mask selecting rows of c4
    prefix -- stem of the output columns: <prefix>_ldavecs and <prefix>_docvecs
    """
    picked = c4.loc[rows, ["citing", "sim_ldavecs", "sim_docvecs"]]
    averaged = picked.groupby("citing").mean()
    return averaged.rename(columns={"sim_ldavecs": prefix + "_ldavecs",
                                    "sim_docvecs": prefix + "_docvecs"})


is_citing = (c4["type"] == "citing")
is_control = (c4["type"] == "citing_control_asg_pc")
same_pc = (c4["citing_primclass_match"] == True)

# Against all of the firm's prior patents
acite = _mean_sims(is_citing, "citing_mean_asg")
acont = _mean_sims(is_control, "control_mean_asg")
# Against the firm's prior patents in the citing patent's primary class only
pcite = _mean_sims(is_citing & same_pc, "citing_mean_asg_pc")
pcont = _mean_sims(is_control & same_pc, "control_mean_asg_pc")

# One row per citing patent, the four pairs of means side by side
c5 = pd.concat([acite, acont, pcite, pcont], axis=1).reset_index()

In [59]:
# Summary statistics over the rows of c5 that have no missing value in any column.
c5.dropna(how="any").describe()

Unnamed: 0,citing,citing_mean_asg_ldavecs,citing_mean_asg_docvecs,control_mean_asg_ldavecs,control_mean_asg_docvecs,citing_mean_asg_pc_ldavecs,citing_mean_asg_pc_docvecs,control_mean_asg_pc_ldavecs,control_mean_asg_pc_docvecs
count,4552.0,4552.0,4552.0,4552.0,4552.0,4552.0,4552.0,4552.0,4552.0
mean,6656108.0,0.443163,0.276347,0.442195,0.274165,0.519948,0.325458,0.520451,0.32364
std,868033.8,0.199616,0.131502,0.198718,0.13296,0.215114,0.149217,0.212739,0.15051
min,4077916.0,0.022381,-0.219281,0.021534,-0.202524,0.00342,-0.247424,0.00342,-0.202524
25%,6098831.0,0.286974,0.186439,0.291407,0.184888,0.364968,0.228775,0.36429,0.224118
50%,6833612.0,0.424437,0.260221,0.417002,0.257215,0.51719,0.310277,0.518015,0.30935
75%,7354067.0,0.578274,0.342578,0.577924,0.344953,0.674154,0.403715,0.673964,0.402988
max,7861161.0,1.0,0.870421,1.0,0.870421,1.0,0.911395,1.0,0.911395


In [60]:
# Attach the per-citing-patent mean similarities to c3 (left join keeps every c3 row).
# NOTE(review): running this cell twice merges c5 again and pandas suffixes the
# repeated columns (_x / _y); reload c3 before re-running.
c3 = c3.merge(c5, how="left", on="citing")

In [64]:
# Give the similarity-to-cited columns the citing_* / control_* naming used by
# the mean-similarity columns, then save.
sim_renames = {
    "sim_ldavecs": "citing_ldavecs",
    "sim_docvecs": "citing_docvecs",
    "sim_ldavecs_citing_control_asg_pc": "control_ldavecs",
    'sim_docvecs_citing_control_asg_pc': "control_docvecs",
}
c3 = c3.rename(columns=sim_renames)

c3.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0927.pkl")

### Analysis: Create data tables

In [84]:
# Paired comparison of citing vs. control patents for three similarity measures:
# similarity to the cited patent, mean similarity to the firm's own prior patents,
# and mean similarity to the firm's own prior patents in the citing primary class.
dm = "docvecs"
measures = [dm, "mean_asg_{0}".format(dm), "mean_asg_pc_{0}".format(dm)]
results = {}
for measure in measures:
    cite_col = "citing_{0}".format(measure)
    cont_col = "control_{0}".format(measure)
    # Rows with both values present.  Held in a local name so the pair table c4
    # built earlier is not overwritten by this temporary.
    paired = c3.dropna(subset=[cite_col, cont_col], how="any")
    # Related samples: each control is matched to its citing patent
    t_stat, p_val = sp.stats.ttest_rel(paired[cite_col], paired[cont_col], nan_policy="omit")
    results[measure] = [paired[cite_col].mean(), paired[cont_col].mean(), t_stat, p_val, len(paired)]

tab = pd.DataFrame(results, columns=measures,
                   index=["Citing", "Control", "$t$-value", "$p$-value", "$N$"])
# Raw strings: "\m" is an invalid escape sequence in a normal string literal.
# The resulting text is unchanged (\makecell{...,\\...}).
tab.columns = ["Sim DocVecs to Cited", r"\makecell{Mean Sim Docvecs,\\Own Prior Pats}",
               r"\makecell{Mean Sim Docvecs,\\Own Prior Pats in Citing PC}"]
 

In [88]:
# Round for display.  Cast to object first so the $N$ row can hold strings:
# writing strings into float64 columns is deprecated in recent pandas.
tab2 = tab.round(3).astype(object)
# Show the sample sizes as integers rather than floats
tab2.loc["$N$"] = tab2.loc["$N$"].astype(int).astype(str)
print(tab2.to_latex(escape=False,column_format="lccc"))

\begin{tabular}{lccc}
\toprule
{} & Sim DocVecs to Cited & \makecell{Mean Sim Docvecs,\\Own Prior Pats} & \makecell{Mean Sim Docvecs,\\Own Prior Pats in Citing PC} \\
\midrule
Citing    &                0.278 &                                        0.281 &                                              0.328 \\
Control   &                0.234 &                                         0.28 &                                              0.327 \\
$t$-value &               26.637 &                                        1.096 &                                              1.458 \\
$p$-value &                    0 &                                        0.273 &                                              0.145 \\
$N$       &                 8951 &                                         6407 &                                               6338 \\
\bottomrule
\end{tabular}



### Stats on inventors moving

In [93]:
def _lookup(frame, key, value):
    """Return a {key: value} dict built row by row; later rows overwrite earlier ones."""
    return dict(zip(frame[key], frame[value]))


yv = "appyear"
# All inventors who have moved
ip = pd.read_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")
print(len(ip))

pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "inv_msa", "gyear", "appyear"])

# Attach the year variable (yv) to every inventor-patent row
ip[yv] = ip["patent"].map(_lookup(pdf, "patent", yv))

# Order by inventor, then by yv
ip = ip.sort_values(["inventor_id", yv])

# Keep rows with inv_asg_rank <= 1
# NOTE(review): the earlier comment described these as the inventor's first and
# second cities -- confirm what ranks 0 and 1 denote.
ip = ip[ip["inv_asg_rank"] <= 1]

# Rank-1 rows: first occurrence of each (inventor, MSA) combination
sc = ip.loc[ip["inv_asg_rank"] == 1, ["inventor_id", "inv_msa", yv]]\
.drop_duplicates(["inventor_id", "inv_msa"])

# MSA and yv of the inventor's rank-1 rows, copied onto all of that inventor's rows
ip["sec_inv_msa"] = ip["inventor_id"].map(_lookup(sc, "inventor_id", "inv_msa"))
ip["sec_fyear"] = ip["inventor_id"].map(_lookup(sc, "inventor_id", yv))

# # Get rid of the inventors whose second MSA matches the first
# ip = ip.loc[~(ip["inv_msa"] == ip["sec_inv_msa"])]
# print(len(ip))

140076


In [96]:
# Rows whose MSA differs from / equals the inventor's second MSA, the total,
# and the share that differs.
same_msa = (ip["inv_msa"] == ip["sec_inv_msa"])
n_diff = len(ip.loc[~same_msa])
n_same = len(ip.loc[same_msa])
print(n_diff, n_same, len(ip), n_diff/len(ip))

12846 53944 66790 0.1923341817637371


### Mobile inventors' prior patent citations

In [99]:
# Reload the 2018-08 inventor-patent table (replacing the 2018-07 frame held in `ip`)
# and list its columns.
ip = pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")
ip.columns

Index(['patent', 'inventor_id', 'location_id', 'city', 'state', 'country',
       'inv_msa', 'gyear', 'assignee_id', 'inv_asg_rank', 'appyear',
       'sec_inv_msa', 'sec_fyear', 'sec_inv_msa_match_prior',
       'sec_inv_msa_match_post', 'sim_docvecs_prior', 'sim_docvecs_post',
       'sim_ldavecs_prior', 'sim_ldavecs_post'],
      dtype='object')

In [104]:
# Two ways of summarising the prior/post second-MSA match columns
match_cols = ['sec_inv_msa_match_prior', 'sec_inv_msa_match_post']

# 1. Drop rows with missing values: keep only rows where both columns are present
i2 = ip[match_cols].dropna(how="any")
display(i2.describe())

# 2. Keep rows where at least one of the two columns is present (only rows missing
#    both are dropped), then treat the missing one as 0
i3 = ip[match_cols].dropna(how="all").fillna(0)
display(i3.describe())

Unnamed: 0,sec_inv_msa_match_prior,sec_inv_msa_match_post
count,2754.0,2754.0
mean,0.058752,0.097849
std,0.189363,0.215061
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.076923
max,1.0,1.0


Unnamed: 0,sec_inv_msa_match_prior,sec_inv_msa_match_post
count,6497.0,6497.0
mean,0.029152,0.077534
std,0.136997,0.198578
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


### Number of assignees that cite mobile inventors' patents

In [108]:
# Distinct assignees citing before the move, in or after the move year, newly
# citing, and the newly citing share of the in-or-after group.
n_pre = len(set(a1))
n_post = len(set(a2))
n_new = len(set(new_cite_asgs))
print(n_pre, n_post, n_new, n_new/n_post)

4316 10578 8497 0.803270939686141


In [109]:
# Number of citations from new assignees: the row count of c3
# (presumably one row per citation -- verify against the notebook that built the 0918 pickle).
print(len(c3))

27817
