In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import dask
import dask.dataframe as dd
import dask.array as da
# Percentiles
from scipy.stats import percentileofscore
from IPython.display import display

  from ._conv import register_converters as _register_converters


For sample generation, see: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/StrategicNonCitations/Previous/1d-InventorMobilityNewCitations-0911.ipynb

See previous notebook for similarity to cited patent:
https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/StrategicNonCitations/4a-InventorMobilityNewCites-0918.ipynb

In [3]:
# Year variable used for ordering/windowing patents in the cells that follow.
yv = "appyear"
# NOTE(review): pickles execute code on load; only read files produced by this project.
c2 = pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
ip = pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")

# Use unique assignees: discard the assignee columns stored with the citation
# pairs and re-attach them from the patent -> assignee table below.
# (`columns=` replaces the positional axis argument, which pandas 2.0 removed.)
c2 = c2.drop(columns=["cited_asg", "citing_asg"])
asgs = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas(["patent", "assignee_id"])
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "primclass", "appyear"])
pdf = pdf.merge(asgs, how="left", on="patent")

# Attach the assignee of the citing patent, then of the cited patent.
c2 = (
    c2.merge(asgs, how="left", left_on="citing", right_on="patent")
    .rename(columns={"assignee_id": "citing_asg"})
    .drop(columns="patent")
)
c2 = (
    c2.merge(asgs, how="left", left_on="cited", right_on="patent")
    .rename(columns={"assignee_id": "cited_asg"})
    .drop(columns="patent")
)
del asgs

# New firms that cite prior patent post move.
# a1: assignees citing before the move year (sec_fyear); a2: assignees citing
# in or after it. Missing assignee ids are dropped so that NaN is never
# counted as (or matched like) a firm.
cites_before_move = c2["citing_appyear"] < c2["sec_fyear"]
cites_after_move = c2["citing_appyear"] >= c2["sec_fyear"]
a1 = c2.loc[cites_before_move, "citing_asg"].dropna().tolist()
a2 = c2.loc[cites_after_move, "citing_asg"].dropna().tolist()
new_cite_asgs = list(set(a2).difference(a1))

c3 = pd.read_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0918.pkl")

In [4]:
# Patents by newly citing assignees
pdf = pdf.loc[pdf["assignee_id"].isin(new_cite_asgs)]

# Sort by assignee, primclass, app year (most recent year first within a class)
pdf = pdf.sort_values(["assignee_id", "primclass", yv], ascending=[True, True, False])

# Candidate control patents, keyed by citing year:
#   cdict[(assignee_id, primclass, yr)] -> patents applied for in yr-5..yr
#   adict[(assignee_id, yr)]            -> same window, all classes
cdict = {}
adict = {}
for yr in range(1975, 2016):
    print(yr)
    print(datetime.datetime.now())
    # The six-year window is shared by both groupings, so filter only once.
    window = pdf.loc[pdf["appyear"].isin(range(yr - 5, yr + 1)),
                     ["appyear", "assignee_id", "primclass", "patent"]]

    # Patent by assignee, primclass
    cdict.update({key + (yr,): grp["patent"].tolist()
                  for key, grp in window.groupby(["assignee_id", "primclass"])})

    # Patent by assignee. Group on a scalar key: grouping on a one-element
    # list yields 1-tuple keys in pandas >= 2, which would produce
    # ((asg,), yr) keys that never match the (asg, yr) lookups.
    adict.update({(asg, yr): grp["patent"].tolist()
                  for asg, grp in window.groupby("assignee_id")})
del window

1975
2018-09-28 10:51:40.175795
1976
2018-09-28 10:51:41.564468
1977
2018-09-28 10:51:42.454555
1978
2018-09-28 10:51:43.595936
1979
2018-09-28 10:51:45.023820
1980
2018-09-28 10:51:46.626723
1981
2018-09-28 10:51:48.356156
1982
2018-09-28 10:51:50.578169
1983
2018-09-28 10:51:52.489608
1984
2018-09-28 10:51:54.427730
1985
2018-09-28 10:51:56.325800
1986
2018-09-28 10:51:58.427202
1987
2018-09-28 10:52:00.606020
1988
2018-09-28 10:52:02.852763
1989
2018-09-28 10:52:05.200777
1990
2018-09-28 10:52:07.716460
1991
2018-09-28 10:52:10.650861
1992
2018-09-28 10:52:13.579003
1993
2018-09-28 10:52:17.492835
1994
2018-09-28 10:52:21.006468
1995
2018-09-28 10:52:24.883178
1996
2018-09-28 10:52:29.185886
1997
2018-09-28 10:52:34.038337
1998
2018-09-28 10:52:39.403452
1999
2018-09-28 10:52:45.051059
2000
2018-09-28 10:52:51.210615
2001
2018-09-28 10:52:58.005438
2002
2018-09-28 10:53:05.971672
2003
2018-09-28 10:53:14.623323
2004
2018-09-28 10:53:23.679349
2005
2018-09-28 10:53:33.227107
2006
201

In [5]:
# Map each cited patent to the list of patents that cite it.
# `c2` ends up bound to a plain dict (cited -> [citing, ...]), not a DataFrame.
c2 = pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")
citing_by_cited = c2.groupby("cited")["citing"]
c2 = dict((cited_pat, citers.tolist()) for cited_pat, citers in citing_by_cited)
del citing_by_cited

In [7]:
# Get all firm's patents
# adict is keyed by (assignee, year); citations with no entry get an empty list.
%time a = [adict.get((asg, fyr), []) for asg,fyr in zip(c3["citing_asg"], c3["citing_appyear"])]
c3["asg_pats"] = a

# Get list of potential control candidates
# cdict is keyed by (assignee, primclass, year); missing keys give an empty list.
%time c = [cdict.get((asg, pc, fyr), []) for asg,pc,fyr in zip(c3["citing_asg"], c3["citing_primclass"],\
                                                     c3["citing_appyear"])]

# Remove patents that cite the cited patent
# c2 is used as a dict here (cited patent -> list of citing patents).
# The set difference also de-duplicates each list and does not keep its order.
%time c_2 = [list(set(i)-set(c2.get(j, []))) for i,j in zip(c, c3["cited"])]

# Add to dataframe
# asg_pc_pats: all same-assignee/same-class candidates; asg_pc_pats_c: the
# candidates left after removing those that cite the cited patent.
c3["asg_pc_pats"] = c
c3["asg_pc_pats_c"] = c_2


CPU times: user 55.9 ms, sys: 7.03 ms, total: 62.9 ms
Wall time: 60.2 ms
CPU times: user 87.9 ms, sys: 1.99 ms, total: 89.9 ms
Wall time: 89.9 ms
CPU times: user 593 ms, sys: 4.76 ms, total: 598 ms
Wall time: 600 ms


In [8]:
# Save the citation sample together with its candidate-control lists.
# NOTE(review): this notebook later writes other frames (c4, c5, the merged c3)
# to this same path — confirm the overwrites are intended.
c3.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0928.pkl")

### Create similarity pair sample

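#### Create pairs of the cited patent with (1) the citing patent and (2) each of the citing firm's same-primary-class patents from the five years up to the citing application year that do not themselves cite it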

In [2]:
# Reload the control sample and wrap every citing patent in a one-element
# list so it can be expanded the same way as the candidate lists.
c3 = pd.read_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0928.pkl")
c3["citing_l"] = pd.Series([[citing_pat] for citing_pat in c3["citing"]], index=c3.index)

In [3]:
# Build one row per (tp, op) pair whose text similarity is needed:
#   tp   - a patent taken from the list column named by `type`
#   op   - the cited patent (duplicated in `cited` for later merges)
#   type - "asg_pc_pats_c" for candidate controls, "citing_l" for the citing patent
# Frames are collected and concatenated once; DataFrame.append was removed in pandas 2.0.
pair_frames = []
for pair_type in ["asg_pc_pats_c", "citing_l"]:
    rows = [(tp, cited, cited, citing)
            for tps, cited, citing in zip(c3[pair_type], c3["cited"], c3["citing"])
            for tp in tps]
    frame = pd.DataFrame(rows, columns=["tp", "op", "cited", "citing"])
    frame["type"] = pair_type
    pair_frames.append(frame)
c4 = pd.concat(pair_frames, ignore_index=True)
del pair_frames

# Delete duplicates and self similarity
c4 = c4.loc[c4["tp"] != c4["op"]].drop_duplicates()
print(len(c4))

223901


In [4]:
def grouper(n, iterable):
    """Yield successive lists of at most ``n`` items drawn from ``iterable``.

    The final list may be shorter than ``n``; iteration stops as soon as an
    empty list would be produced.

    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    source = iter(iterable)

    def take_chunk():
        # Pull the next (up to) n items; an exhausted source gives [].
        return list(itertools.islice(source, n))

    # Two-argument iter(): keep calling take_chunk until it returns the sentinel [].
    return iter(take_chunk, [])


import scipy.spatial.distance as distance  # no longer used below; kept so `distance` stays bound
dms = ["ldavecs", "docvecs"]


def rowwise_cosine_distance(a, b):
    """Cosine distance between matching rows of two equally shaped 2-D arrays.

    Returns a 1-D array whose i-th entry is ``1 - cos(a[i], b[i])``. This is
    the diagonal of the full pairwise cosine-distance matrix, computed without
    building that matrix. Rows with zero norm give NaN.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    dots = np.einsum("ij,ij->i", a, b)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        cosine = dots / norms
    # Rounding can push |cosine| marginally past 1; clip so distances stay in [0, 2].
    return 1.0 - np.clip(cosine, -1.0, 1.0)


print("Getting row values")
print(datetime.datetime.now())
# Patent number -> row position, taken from the order of the abstracts file.
# NOTE(review): assumes the vector matrices loaded below share this row order — confirm.
pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
pat_dict = dict(zip(pat_dict, range(len(pat_dict))))


l2 = c4.copy()

# The pairs to score and their matrix rows do not depend on the model, so
# they are resolved once instead of once per model.
known_pats = list(pat_dict)
pairs = l2[["tp", "op"]].dropna(how="any").drop_duplicates()
pairs = pairs.loc[pairs["tp"].isin(known_pats) & pairs["op"].isin(known_pats)].copy()
del known_pats
tp_rows = np.fromiter((pat_dict[p] for p in pairs["tp"]), dtype=np.intp, count=len(pairs))
op_rows = np.fromiter((pat_dict[p] for p in pairs["op"]), dtype=np.intp, count=len(pairs))

for dm in dms:
    print((dm,"started"))
    print("Loading matrix and dict")
    print(datetime.datetime.now())

    pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm))\
    .to_pandas().values
    print(len(pairs))

    print("Getting patent pair cosine similarity")
    print(datetime.datetime.now())
    # Similarity = 1 - cosine distance, one value per unique (tp, op) pair.
    sims = pairs.copy()
    sims["sim_{0}".format(dm)] = 1 - rowwise_cosine_distance(pm[tp_rows], pm[op_rows])

    # Attach the new similarity column; pairs that could not be scored stay NaN.
    l2 = l2.merge(sims, how = "left", on = ["tp", "op"])
    print(len(l2))
    del sims
    print("finished")
    print(datetime.datetime.now())
    del pm

Getting row values
2018-09-28 11:35:53.640521
('ldavecs', 'started')
Loading matrix and dict
2018-09-28 11:35:55.820078
97262
Getting chunks
2018-09-28 11:36:13.418898
Getting patent pair cosine similarity
2018-09-28 11:36:13.732763
223901
finished
2018-09-28 11:36:23.436870
('docvecs', 'started')
Loading matrix and dict
2018-09-28 11:36:23.440571
97262
Getting chunks
2018-09-28 11:36:44.004392
Getting patent pair cosine similarity
2018-09-28 11:36:44.682682
223901
finished
2018-09-28 11:37:00.053980


In [5]:
# Rebind the scored pair sample to c4 and release the working name.
c4 = l2
del(l2)
# NOTE(review): this writes the pair-level frame to the same path that an
# earlier cell used for c3 (and that `c3 = pd.read_pickle(...)` reads back) —
# confirm that overwriting it is intended.
c4.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0928.pkl")

### Rank similarity data for each firm's primary class

In [8]:
# Reload the pair-level frame and keep only the pairs that have a similarity
# under both models.
c4 = (
    pd.read_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0928.pkl")
    .dropna(subset=["sim_ldavecs", "sim_docvecs"], how="any")
)

In [9]:
# For each citing patent, what percentile is the citing similarity at compared to the other similarities?
# c4 = c4.rename(columns={"sim_ldavecs": "asg_pc_ldavecs", "sim_docvecs": "asg_pc_docvecs"})
# c3 = c3.rename(columns={"sim_ldavecs": "citing_ldavecs", "sim_docvecs": "citing_docvecs"})

# Merge with citing similarity
# c4 = c4.merge(c3[["citing", "cited", "citing_ldavecs", "citing_docvecs"]], how="left", on=["citing", "cited"])

# Rank data for each "citing" assignee primclass similarity group.
# Rank 1 is the most similar pair in the group; ties share the lowest rank.
# NOTE(review): groups are keyed on `citing` alone, so a citing patent that
# appears with several cited patents is ranked across all of them — confirm.
ranked_groups = []
for _, grp in c4.groupby("citing"):
    # Explicit copy: the new columns are never written onto a slice of c4.
    grp = grp.copy()
    grp["ldavecs_rank"] = sp.stats.rankdata(1 - grp["sim_ldavecs"].values, method="min")
    grp["docvecs_rank"] = sp.stats.rankdata(1 - grp["sim_docvecs"].values, method="min")
    grp["num_asg_pc"] = len(grp)
    ranked_groups.append(grp)

# Concatenate once (appending inside the loop re-copies the frame every
# iteration, and DataFrame.append was removed in pandas 2.0).
c5 = pd.concat(ranked_groups, ignore_index=True) if ranked_groups else pd.DataFrame()
del ranked_groups

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [20]:
# Save the ranked pair-level frame.
# NOTE(review): same path as the c3 and c4 pickles written earlier in this
# notebook, so those files are replaced — confirm this is intended.
c5.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0928.pkl")

In [21]:
# Keep the rank of the citing patent itself, and only where its group holds
# more than one pair (so the rank is informative).
is_citing_pair = c5["type"] == "citing_l"
has_comparison = c5["num_asg_pc"] > 1
rank_columns = ["docvecs_rank", "ldavecs_rank", "citing", "cited"]
c6 = c5.loc[is_citing_pair & has_comparison, rank_columns]
c6.describe()

Unnamed: 0,docvecs_rank,ldavecs_rank,citing,cited
count,13875.0,13875.0,13875.0,13875.0
mean,7.110198,7.349477,6853216.0,5229942.0
std,20.394489,21.741898,788725.4,707111.7
min,1.0,1.0,4077916.0,3930285.0
25%,1.0,1.0,6358993.0,4653940.0
50%,2.0,2.0,7024403.0,5258030.0
75%,5.0,5.0,7480044.0,5794207.0
max,583.0,567.0,7861161.0,7613631.0


In [22]:
# Merge the citing-patent ranks into c3.
# Rank columns left over from a previous run are dropped first, so re-running
# this cell (or starting from a c3 that already carries ranks) does not create
# suffixed duplicates such as docvecs_rank_x / docvecs_rank_y.
c3 = (
    c3.drop(columns=["docvecs_rank", "ldavecs_rank"], errors="ignore")
    .merge(c6, how="left", on=["citing", "cited"])
)
# NOTE(review): this path is also used for the c4/c5 pickles written earlier
# in this notebook — confirm the overwrite is intended.
c3.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0928.pkl")

In [23]:
# Display the columns of c3 after the rank merge (sanity check).
c3.columns

Index(['citing', 'cited', 'sim_ldavecs', 'sim_docvecs', 'cited_inv_msa',
       'citing_inv_msa', 'cited_appyear', 'citing_appyear', 'sec_inv_msa',
       'sec_fyear', 'sec_inv_msa_match', 'citing_asg', 'cited_asg',
       'citing_primclass', 'citing_control_asg_pc',
       'sim_ldavecs_citing_control_asg_pc',
       'sim_docvecs_citing_control_asg_pc', 'asg_pats', 'asg_pc_pats',
       'asg_pc_pats_c', 'citing_l', 'docvecs_rank', 'ldavecs_rank'],
      dtype='object')

In [25]:
# Share of ranked citations in which the citing patent is the most similar
# (docvecs rank 1) within its comparison group.
has_rank = c3["docvecs_rank"].notnull()
ranked_first = has_rank & (c3["docvecs_rank"] == 1)
len(c3.loc[ranked_first]) / len(c3.loc[has_rank])

0.30033072971641683

### Analysis: Create data tables

In [84]:
# Citing-vs-control comparison table for one similarity model: means, paired
# t-test and sample size for each measure.
# NOTE(review): this reads columns named "citing_<measure>" / "control_<measure>",
# which are not among the c3 columns displayed earlier in this notebook —
# confirm which c3 this cell expects.
dm = "docvecs"
tab = pd.DataFrame({})
for measure in [dm, "mean_asg_{0}".format(dm), "mean_asg_pc_{0}".format(dm)]:
    citing_col = "citing_{0}".format(measure)
    control_col = "control_{0}".format(measure)
    # Paired comparison: keep only rows where both values are present.
    # NOTE: this rebinds the notebook-level name c4.
    c4 = c3.dropna(subset=[citing_col, control_col], how="any")
    cite_m = c4[citing_col].mean()
    cont_m = c4[control_col].mean()
    # Independent samples
#     t1 = sp.stats.ttest_ind(c3["citing_{0}".format(c)], c3["control_{0}".format(c)], equal_var=False, nan_policy="omit")
    # Related samples
    t2 = sp.stats.ttest_rel(c4[citing_col], c4[control_col], nan_policy="omit")
    tab[measure] = [cite_m, cont_m, t2[0], t2[1], len(c4)]

# Backslashes are written out explicitly ("\\m" rather than the invalid escape
# "\m"); the resulting strings are unchanged.
tab.columns = ["Sim DocVecs to Cited", "\\makecell{Mean Sim Docvecs,\\\\Own Prior Pats}",
               "\\makecell{Mean Sim Docvecs,\\\\Own Prior Pats in Citing PC}"]
tab.index = ["Citing", "Control", "$t$-value", "$p$-value", "$N$"]
 

In [88]:
# Round for presentation, show the sample sizes as whole numbers, and emit LaTeX.
tab2 = tab.round(3)
sample_sizes = tab2.loc["$N$"].astype(int).astype(str)
tab2.loc["$N$"] = sample_sizes
latex_table = tab2.to_latex(escape=False, column_format="lccc")
print(latex_table)

\begin{tabular}{lccc}
\toprule
{} & Sim DocVecs to Cited & \makecell{Mean Sim Docvecs,\\Own Prior Pats} & \makecell{Mean Sim Docvecs,\\Own Prior Pats in Citing PC} \\
\midrule
Citing    &                0.278 &                                        0.281 &                                              0.328 \\
Control   &                0.234 &                                         0.28 &                                              0.327 \\
$t$-value &               26.637 &                                        1.096 &                                              1.458 \\
$p$-value &                    0 &                                        0.273 &                                              0.145 \\
$N$       &                 8951 &                                         6407 &                                               6338 \\
\bottomrule
\end{tabular}



### Stats on inventors moving

In [93]:
# Year variable used for ordering; every "year" below is the application year.
yv = "appyear"
# All inventors who have moved
# NOTE: this rebinds `ip` and `pdf`, replacing the frames loaded under those
# names earlier in the notebook.
ip = pd.read_pickle("DataStore/2018-07/inv_move_pats_0712.pkl")
print(len(ip))

pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "inv_msa", "gyear", "appyear"])

# Add application year (looked up by patent number; unmatched patents get NaN)
ip[yv] = ip["patent"].map(dict(zip(pdf["patent"], pdf[yv])))

# Sort by inventor, then application year
ip = ip.sort_values(["inventor_id", yv])

# Only look at inventors' first and second cities
# NOTE(review): the filter is on inv_asg_rank (values 0 and 1 are kept) —
# presumably the rank of the inventor's assignee/location spell; confirm.
ip = ip.loc[(ip["inv_asg_rank"] <= 1)]

# Inventors' second cities: one row per (inventor, MSA) among rank-1 patents,
# keeping the earliest row of each because ip is sorted by year.
sc = ip.loc[(ip["inv_asg_rank"] == 1), ["inventor_id", "inv_msa", yv]].drop_duplicates(["inventor_id", "inv_msa"])

# Inventors' second city compared to first
# NOTE(review): if an inventor has several rows in sc, dict(zip(...)) keeps the
# last one, i.e. the latest MSA rather than the first — confirm this is intended.
ip["sec_inv_msa"] = ip["inventor_id"].map(dict(zip(sc["inventor_id"], sc["inv_msa"])))

# Year of the second-city row kept by the dict (same last-wins rule as above)
ip["sec_fyear"] = ip["inventor_id"].map(dict(zip(sc["inventor_id"], sc[yv])))

# # Get rid of the inventors whose second MSA matches the first
# ip = ip.loc[~(ip["inv_msa"] == ip["sec_inv_msa"])]
# print(len(ip))

140076


In [96]:
# Rows whose MSA differs from / equals the inventor's second MSA, the total,
# and the share that differ.
same_msa = ip["inv_msa"] == ip["sec_inv_msa"]
n_different = len(ip.loc[~same_msa])
n_same = len(ip.loc[same_msa])
n_total = len(ip)
print(n_different, n_same, n_total, n_different / n_total)

12846 53944 66790 0.1923341817637371


### Mobile inventors' prior patent citations

In [99]:
# Reload the mobile-inventor patent file that carries the citation-match and
# similarity columns (this rebinds `ip`), then list its columns.
ip = pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl")
ip.columns

Index(['patent', 'inventor_id', 'location_id', 'city', 'state', 'country',
       'inv_msa', 'gyear', 'assignee_id', 'inv_asg_rank', 'appyear',
       'sec_inv_msa', 'sec_fyear', 'sec_inv_msa_match_prior',
       'sec_inv_msa_match_post', 'sim_docvecs_prior', 'sim_docvecs_post',
       'sim_ldavecs_prior', 'sim_ldavecs_post'],
      dtype='object')

In [104]:
# Summarise the prior/post match shares under two treatments of missing values.
match_cols = ['sec_inv_msa_match_prior', 'sec_inv_msa_match_post']

# Two approaches
# 1. Drop rows with missing values (keep rows where both prior and post are present)
i2 = ip[match_cols].dropna(how="any")
display(i2.describe())

# 2. Keep rows where at least one of prior/post is present (drop only rows
#    where both are missing), then fill the remaining NaN with 0.
i3 = ip[match_cols].dropna(how="all").fillna(0)
display(i3.describe())

Unnamed: 0,sec_inv_msa_match_prior,sec_inv_msa_match_post
count,2754.0,2754.0
mean,0.058752,0.097849
std,0.189363,0.215061
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.076923
max,1.0,1.0


Unnamed: 0,sec_inv_msa_match_prior,sec_inv_msa_match_post
count,6497.0,6497.0
mean,0.029152,0.077534
std,0.136997,0.198578
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


### Number of assignees that cite mobile inventors' patents

In [108]:
# Distinct assignees citing before the move, in or after the move year, those
# appearing only after, and the last group's share of post-move citers.
n_pre_move = len(set(a1))
n_post_move = len(set(a2))
n_new = len(set(new_cite_asgs))
print(n_pre_move, n_post_move, n_new, n_new / n_post_move)

4316 10578 8497 0.803270939686141


In [109]:
# Number of citations from new assignees
# NOTE(review): reports the length of whichever c3 is currently bound in the
# kernel; c3 is rebound several times in this notebook — confirm it is the
# new-assignee citation sample here.
print(len(c3))

27817
