In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
# Every relative path used below ("DataStore/...", "RawData/...") is resolved
# against this working directory.
# NOTE(review): hard-coded absolute path -- the notebook only runs on a machine
# where this directory exists; consider a configurable project root.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
# NOTE(review): `seed` is not referenced by any cell visible in this notebook.
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import dask
import dask.dataframe as dd
import dask.array as da
# Percentiles
from scipy.stats import percentileofscore
from IPython.display import display

For sample generation, see: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/StrategicNonCitations/Previous/1d-InventorMobilityNewCitations-0911.ipynb

### Alternatively: Find control patent for each post-move citation
- Find new firms that cite prior patent post-move
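- For each post-move citation at the second MSA to the inventor's pre-move patent, find a control patent from the same citing assignee and the same primary class, applied for in the five years up to and including the citing patent's application year, that does not cite the pre-move patent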

In [3]:
# Name of the year column used when ordering candidate control patents
yv = "appyear"
# Inventor-move sample: the movers' patents (ip) and the citations to them (c2)
ip, c2 = (
    pd.read_pickle("DataStore/2018-08/inv_move_pats_0912.pkl"),
    pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl"),
)

In [5]:
# Use unique assignees: discard the assignee columns shipped with the citation
# sample and re-attach them from the cleaned assignee table.
# (`columns=` keyword: the positional axis argument of DataFrame.drop was
# deprecated and no longer exists in pandas 2.x.)
c2 = c2.drop(columns=["cited_asg", "citing_asg"])
asgs = fastparquet.ParquetFile("RawData/Cleaned/patent_assignees_unique_0628.parq").to_pandas(["patent", "assignee_id"])
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq")\
.to_pandas(["patent", "primclass", "appyear"])
pdf = pdf.merge(asgs, how="left", on="patent")

# Attach the assignee of the citing patent, then of the cited patent.
# NOTE(review): assumes `asgs` holds one row per patent (file name says
# "unique"); otherwise these left merges would duplicate citation rows.
for side in ("citing", "cited"):
    c2 = (
        c2.merge(asgs, how="left", left_on=side, right_on="patent")
        .rename(columns={"assignee_id": side + "_asg"})
        .drop(columns="patent")
    )
del asgs

# New firms that cite prior patent post move: assignees that appear on a
# citation applied for in/after the move year but on none applied for before it.
# Rows with a missing year satisfy neither comparison and are ignored.
# NOTE(review): the comparison is pooled over all cited patents, not done per
# cited patent, and missing assignee ids are not filtered out -- confirm both
# are intended.
pre_move = c2["citing_appyear"] < c2["sec_fyear"]
post_move = c2["citing_appyear"] >= c2["sec_fyear"]
a1 = c2.loc[pre_move, "citing_asg"].tolist()
a2 = c2.loc[post_move, "citing_asg"].tolist()
new_cite_asgs = list(set(a2).difference(set(a1)))

# New cites. Take an explicit copy so the column added below is written to an
# independent frame rather than to a slice of c2 (this is what triggered the
# SettingWithCopyWarning).
c3 = c2.loc[post_move & c2["citing_asg"].isin(new_cite_asgs)].copy()

# Primary class of each new citing patent, looked up from pdf
c3["citing_primclass"] = c3["citing"].map(dict(zip(pdf["patent"], pdf["primclass"])))
del c2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
# Patents by newly citing assignees
pdf = pdf.loc[pdf["assignee_id"].isin(new_cite_asgs)]

# Sort by assignee, primclass, app year (most recent application year first)
pdf = pdf.sort_values(["assignee_id", "primclass", yv], ascending=[True, True, False])

# Control candidates keyed by (assignee_id, primclass, year): all patents of
# that assignee and primary class applied for in the six-year window
# [year - 5, year]. Only keys with at least one patent are stored, because
# groupby never yields an empty group.
cdict = {}
control_cols = pdf[[yv, "assignee_id", "primclass", "patent"]]  # loop-invariant
for yr in range(1975, 2016):
    print(yr)
    print(datetime.datetime.now())
    in_window = control_cols[yv].isin(range(yr - 5, yr + 1))
    by_asg_class = control_cols.loc[in_window].groupby(["assignee_id", "primclass"])
    cdict.update({key + (yr,): grp["patent"].tolist() for key, grp in by_asg_class})
del control_cols

1975
2018-09-18 18:12:51.280341
1976
2018-09-18 18:12:51.760796
1977
2018-09-18 18:12:52.432847
1978
2018-09-18 18:12:53.267608
1979
2018-09-18 18:12:54.342849
1980
2018-09-18 18:12:55.835915
1981
2018-09-18 18:12:57.041474
1982
2018-09-18 18:12:58.306479
1983
2018-09-18 18:12:59.740788
1984
2018-09-18 18:13:01.185551
1985
2018-09-18 18:13:02.689110
1986
2018-09-18 18:13:04.221915
1987
2018-09-18 18:13:05.832788
1988
2018-09-18 18:13:07.542305
1989
2018-09-18 18:13:09.801002
1990
2018-09-18 18:13:13.006353
1991
2018-09-18 18:13:15.333865
1992
2018-09-18 18:13:17.824082
1993
2018-09-18 18:13:20.480050
1994
2018-09-18 18:13:23.056017
1995
2018-09-18 18:13:26.051266
1996
2018-09-18 18:13:29.206569
1997
2018-09-18 18:13:32.651648
1998
2018-09-18 18:13:36.695311
1999
2018-09-18 18:13:41.250473
2000
2018-09-18 18:13:46.536752
2001
2018-09-18 18:13:52.642300
2002
2018-09-18 18:13:58.755443
2003
2018-09-18 18:14:05.308897
2004
2018-09-18 18:14:12.161804
2005
2018-09-18 18:14:19.666181
2006
201

In [19]:
# For every cited patent, collect the list of patents that cite it
cite_pairs = pd.read_pickle("DataStore/2018-08/inv_move_cites_0912.pkl")[["cited", "citing"]]
c2 = {cited: grp["citing"].tolist() for cited, grp in cite_pairs.groupby("cited")}
del cite_pairs

In [26]:
# Get list of potential control candidates
%time c = [cdict.get((asg, pc, fyr), []) for asg,pc,fyr in zip(c3["citing_asg"], c3["citing_primclass"],\
                                                     c3["citing_appyear"])]
# Remove patents that cite the cited patent
%time c_2 = [list(set(i)-set(c2.get(j, []))) for i,j in zip(c, c3["cited"])]

# Control: since earlier patents have smaller numbers, the control will be one granted closer in date with highest patent number
c_3 = [max(i) if len(i) >= 1 else np.nan for i in c_2]

c3["citing_control_asg_pc"] = c_3
del(c, c_2, c_3, c2)

CPU times: user 128 ms, sys: 2.05 ms, total: 130 ms
Wall time: 125 ms
CPU times: user 729 ms, sys: 0 ns, total: 729 ms
Wall time: 733 ms


In [29]:
# Persist the new post-move citations together with the matched control patent
# (column "citing_control_asg_pc") chosen for each of them.
c3.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0918.pkl")

In [36]:
def grouper(n, iterable):
    """Yield successive lists of at most ``n`` items taken from ``iterable``.

    The final list may be shorter than ``n``; iteration stops as soon as an
    empty list would be produced.

    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    # Obtain the iterator up front so a non-iterable argument fails at call time
    source = iter(iterable)

    def _chunks():
        while True:
            chunk = list(itertools.islice(source, n))
            if not chunk:
                return
            yield chunk

    return _chunks()


# Import kept so `distance` stays available to any cell that relies on it; the
# similarity below is computed row-wise with numpy rather than distance.cdist.
import scipy.spatial.distance as distance
dms = ["ldavecs", "docvecs"]

print("Getting row values")
print(datetime.datetime.now())
# Map patent number -> position in the patent list of patabs7615_us_no_dup.parq.
# NOTE(review): this position is used as the row index into each vector matrix
# loaded below, which assumes both files share the same row order -- confirm.
pat_dict = fastparquet.ParquetFile("RawData/Cleaned/patabs7615_us_no_dup.parq").to_pandas(["patent"])["patent"].tolist()
pat_dict = dict(zip(pat_dict, range(len(pat_dict))))


# Results are accumulated on a copy; c3 itself is left unchanged.
l2 = c3.copy()

for dm in dms:
    print((dm,"started"))
    print("Loading matrix and dict")
    print(datetime.datetime.now())

    pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_pats_0712.parq".format(dm))\
    .to_pandas().values

    for col in ["citing_control_asg_pc"]:
        # Unique (cited patent, control patent) pairs with both ids present
        l3 = pd.DataFrame({"tp": c3["cited"], "op": c3[col]})
        l3 = l3.dropna(how="any").drop_duplicates()
        # Keep pairs whose two patents both have a row in the vector matrix;
        # copy so the similarity column is added to an independent frame.
        l3 = l3.loc[l3["tp"].isin(pat_dict.keys()) & l3["op"].isin(pat_dict.keys())].copy()
        print(len(l3))

        print("Getting patent pair cosine similarity")
        print(datetime.datetime.now())
        # Gather the two vectors of every pair. The previous version also
        # gathered everything at once before chunking, so dropping the chunking
        # does not raise peak memory.
        tp_vecs = np.asarray(pm[[pat_dict[p] for p in l3["tp"].tolist()]], dtype=float)
        op_vecs = np.asarray(pm[[pat_dict[p] for p in l3["op"].tolist()]], dtype=float)

        # Row-wise cosine similarity: one dot product per pair instead of the
        # full pairwise matrix that np.diag(cdist(...)) had to build.
        # A zero vector yields NaN for its pair.
        with np.errstate(divide="ignore", invalid="ignore"):
            cos_sim = np.einsum("ij,ij->i", tp_vecs, op_vecs) / (
                np.linalg.norm(tp_vecs, axis=1) * np.linalg.norm(op_vecs, axis=1)
            )
        # Guard against round-off pushing values marginally outside [-1, 1]
        l3["sim_{0}_{1}".format(dm,col)] = np.clip(cos_sim, -1.0, 1.0)

        # Rename columns back to the names used in c3/l2 and attach the result
        l3 = l3.rename(columns={"tp": "cited", "op": col})
        l2 = l2.merge(l3, how = "left", on = ["cited", col])
        # l3 is unique on the merge keys, so the left merge must not add rows
        assert len(l2) == len(c3), "merge changed the number of citation rows"
        print(len(l2))
        del l3
        print("finished")
        print(datetime.datetime.now())
    del pm

Getting row values
2018-09-27 13:16:09.192614
('ldavecs', 'started')
Loading matrix and dict
2018-09-27 13:16:11.902623
7846
Getting chunks
2018-09-27 13:16:23.123945
Getting patent pair cosine similarity
2018-09-27 13:16:23.178828
27817
finished
2018-09-27 13:16:24.592446
('docvecs', 'started')
Loading matrix and dict
2018-09-27 13:16:24.602495
7846
Getting chunks
2018-09-27 13:16:48.076133
Getting patent pair cosine similarity
2018-09-27 13:16:48.173546
27817
finished
2018-09-27 13:16:49.830461


In [37]:
# Preview the frame that carries the control-patent similarity columns: they
# are merged into l2 (a copy of c3), so on a fresh kernel c3 does not have them.
l2.head()

Unnamed: 0,citing,cited,sim_ldavecs,sim_docvecs,cited_inv_msa,citing_inv_msa,cited_appyear,citing_appyear,sec_inv_msa,sec_fyear,sec_inv_msa_match,citing_asg,cited_asg,citing_primclass,citing_control_asg_pc,sim_ldavecs_citing_control_asg_pc,sim_docvecs_citing_control_asg_pc
0,7837428,5765986,0.517405,0.346076,"Reyes Place, CA","Los Angeles-Long Beach-Santa Ana, CA",1995.0,2007.0,"San Francisco-Oakland-Fremont, CA",2000.0,False,ed4a5c14c182669b71d3c3a7dd3fb40d,65e7638ba8d787b699cb8a35381c47cb,414.0,,,
1,6131756,4673112,0.308394,0.026374,"North Conway, NH","Buffalo-Niagara Falls, NY",1985.0,1997.0,"Buffalo-Niagara Falls, NY",1984.0,True,b9e57d6c92ba464bcc64bb7582f2475d,ea0f2ad6019e13b612e3b84b6c97d08b,220.0,,,
2,7300433,5971979,,,"San Francisco-Oakland-Fremont, CA","Boston-Cambridge-Quincy, MA-NH",1997.0,2003.0,"San Jose-Sunnyvale-Santa Clara, CA",1999.0,False,57118d075226aaaa1e8cd261c28e24f5,f3f87efb03e97170334721471a935585,606.0,7097641.0,0.403679,0.46322
3,7246244,5802199,,,"San Francisco-Oakland-Fremont, CA","New York-Northern New Jersey-Long Island, NY-N...",1997.0,2005.0,"Santa Rosa-Petaluma, CA",1996.0,False,2267041f3263c6a765376db6932e65ac,d62b58104e0f118c0b087f21f3df259c,713.0,,,
4,6358993,5011834,0.768476,0.791573,"Eugene-Springfield, OR","Boston-Cambridge-Quincy, MA-NH",1989.0,1999.0,"Los Angeles-Long Beach-Santa Ana, CA",1993.0,False,c9980fda27658ca1dab82b9473d3bffc,e76e5d396a6c43e5a43e4fcf44a1c84f,514.0,6482839.0,0.703835,0.455489


In [38]:
# Save l2, not c3: the sim_*_citing_control_asg_pc columns are merged into l2
# only, so saving c3 on a fresh kernel would rewrite the earlier file without
# the similarity results.
l2.to_pickle("DataStore/2018-08/inv_mob_cite_pc_control_0918.pkl")

In [35]:
# Mean doc-vector similarity of the cited patent to the actual citing patent
# vs. to the control patent, over rows where both values are present
l2.loc[:, ["sim_docvecs", "sim_docvecs_citing_control_asg_pc"]].dropna(how="any").mean()

sim_docvecs                          0.278359
sim_docvecs_citing_control_asg_pc    0.233973
dtype: float64