In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import h5py
import dask
import dask.dataframe as dd
import dask.array as da

  from ._conv import register_converters as _register_converters


## Effect of lawyers on citations

### 1. Create sample of patent pairs from same lawyer and different lawyers
- Use only pairs from the same grant year and primary class
- Compare the % of citations that they overlap

In [57]:
# Patents and lawyers
dc = fastparquet.ParquetFile("DataStore/2018-06/citation_pairs_all_merged_0619.parq").to_pandas()
# ldf = pd.read_csv("RawData/Cleaned/patent_lawyer.csv", index_col=0)
# ldf.head()

In [3]:
dc = dc.merge(ldf.add_prefix("citing_"), left_on = "citing", right_on = "citing_patent").drop("citing_patent",1)
dc = dc.merge(ldf.add_prefix("cited_"), left_on = "cited", right_on = "cited_patent").drop("cited_patent",1)

In [48]:
def get_sample(key, d, num):
    try:
        s = np.random.choice(d[key], size=num, replace=True)
    except Exception:
        s = [np.nan]*num
    return s

# Get unique list of cited patents and their lawyers
dcl = {n:g["cited"].tolist() for n,g in dc[["cited", "cited_lawyer_id"]].drop_duplicates().groupby("cited_lawyer_id")}

In [49]:
%%time

# Lawyer Match sample
l1 = (get_sample(n, dcl, len(g)) for n,g in dcl.items())
l1 = [item for sublist in l1 for item in sublist]
l2 = (get_sample(n, dcl, len(g)) for n,g in dcl.items())
l2 = [item for sublist in l2 for item in sublist]

dcl = pd.DataFrame({"p1": l1, "p2": l2}).drop_duplicates()
print(len(dcl))
dcl = dcl.loc[dcl["p1"] != dcl["p2"]]
print(len(dcl))
del(l1, l2)

1730638
1656170
CPU times: user 4.36 s, sys: 71.4 ms, total: 4.43 s
Wall time: 4.35 s


In [50]:
%%time

# Get sample of patents in same primary class & grant year
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").\
to_pandas(["patent", "gyear", "primclass"])

# Get unique list of patents by primary class & grant year
pdf = {n:g["patent"].tolist() for n,g in pdf.drop_duplicates().groupby(["gyear", "primclass"])}

# Lawyer Match sample
l1 = (get_sample(n, pdf, len(g)) for n,g in pdf.items())
l1 = [item for sublist in l1 for item in sublist]
l2 = (get_sample(n, pdf, len(g)) for n,g in pdf.items())
l2 = [item for sublist in l2 for item in sublist]

pdf = pd.DataFrame({"p1": l1, "p2": l2}).drop_duplicates()
print(len(pdf))
pdf = pdf.loc[pdf["p1"] != pdf["p2"]]
print(len(pdf))
del(l1, l2)

dcl = dcl.append(pdf.sample(frac=0.1), ignore_index=True)
del(pdf)

2240707
2224413
CPU times: user 7.41 s, sys: 243 ms, total: 7.66 s
Wall time: 7.58 s


In [51]:
# Merge back lawyer data
dcl = dcl.merge(ldf.add_prefix("p1_"), left_on = "p1", right_on = "p1_patent").drop("p1_patent",1)
dcl = dcl.merge(ldf.add_prefix("p2_"), left_on = "p2", right_on = "p2_patent").drop("p2_patent",1)

dcl["lawyer_match"] = (dcl["p1_lawyer_id"] == dcl["p2_lawyer_id"])

# Drop duplicates
dcl = dcl.drop_duplicates(["p1", "p2"])
print(len(dcl))

1838403


In [52]:
del(ldf)

In [64]:
# Add primary class and grant year info; only use pairs that are from the same primary class and grant year
dcl = pd.read_pickle("DataStore/2018-08/lawyer_pairs_0816.pkl")[["p1", "p2", "lawyer_match",\
                                                                 "p1_lawyer_id", "p2_lawyer_id"]]
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(["patent", "gyear", "primclass", "inv_msa"])
dcl = dcl.merge(pdf.add_prefix("p1_"), left_on = "p1", right_on = "p1_patent").drop("p1_patent",1)
dcl = dcl.merge(pdf.add_prefix("p2_"), left_on = "p2", right_on = "p2_patent").drop("p2_patent",1)
del(pdf)

# Get grant year and primary class match
dcl = dcl.loc[(dcl["p1_gyear"] == dcl["p2_gyear"]) & (dcl["p1_primclass"] == dcl["p2_primclass"])]
dcl["inv_msa_match"] = (dcl["p1_inv_msa"] == dcl["p2_inv_msa"])
print(len(dcl))

202355


In [65]:
dcl["lawyer_match"].value_counts()

False    184049
True      18306
Name: lawyer_match, dtype: int64

In [75]:
dcl[["inv_msa_match", "lawyer_match", "p1"]].groupby(["inv_msa_match", "lawyer_match"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,p1
inv_msa_match,lawyer_match,Unnamed: 2_level_1
False,False,173191
False,True,6860
True,False,10858
True,True,11446


In [71]:
# Save
dcl.to_pickle("DataStore/2018-08/lawyer_pairs_0816.pkl")

#### Find number of overlapping citations

- i.e. overlapping citations made by patent

In [73]:
# Find number of overlapping citations
dc = fastparquet.ParquetFile("DataStore/2018-06/citation_pairs_all_merged_0619.parq").to_pandas()
dc = dc.loc[dc["citing"].isin(dcl["p1"]) | dc["citing"].isin(dcl["p2"])]
%time dc = {n:g["cited"].tolist() for n,g in dc.groupby("citing")}
%time c_match = (set(dc.get(p1, [])).intersection(set(dc.get(p2, []))) for p1, p2 in zip(dcl["p1"], dcl["p2"]))
dcl["num_common_cites"] = [len(i) for i in c_match]

CPU times: user 45.4 s, sys: 31.7 ms, total: 45.4 s
Wall time: 48.3 s
CPU times: user 10.4 ms, sys: 14 µs, total: 10.4 ms
Wall time: 9.47 ms


In [76]:
dcl[["lawyer_match", "inv_msa_match", "num_common_cites"]].groupby(["lawyer_match", "inv_msa_match"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_common_cites
lawyer_match,inv_msa_match,Unnamed: 2_level_1
False,False,0.0141
False,True,0.357064
True,False,1.175073
True,True,2.419098


### Overlapping citations in MSA

1. For each *p1, p2*: Find percentage of cites matching *p1_inv_msa*, *p2_inv_msa*

Code taken from: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/Results/JTHReplication/1-ReplicationSample.ipynb

In [100]:
tpats = list(dcl["p1"])+list(dcl["p2"])
cit = dd.read_parquet("RawData/Cleaned/cit_0628.parq")
cit = cit[cit["cited"].isin(tpats)]

# Convert back to pandas
cit = cit.compute()
print(len(cit))

7005163


In [102]:
%%time
# Remove self citations
asgs = pickle.load(open("RawData/Cleaned/patent_assignee_dict_0628.pkl", "rb"))
%time asg_match = (set(asgs.get(cited, [])).intersection(asgs.get(citing, [])) for cited, citing \
                   in zip(cit["cited"], cit["citing"]))
%time asg_match = [len(i) for i in asg_match]
del(asgs)

cit["asg_match"] = asg_match
cit = cit.loc[cit["asg_match"] == 0]
cit = cit[["citing", "cited"]]
print(len(cit))

CPU times: user 378 ms, sys: 17.7 ms, total: 395 ms
Wall time: 381 ms
CPU times: user 19.3 s, sys: 29.8 ms, total: 19.3 s
Wall time: 18.6 s
6329211
CPU times: user 34.5 s, sys: 1.85 s, total: 36.3 s
Wall time: 35 s


In [103]:
# Add tp to each citation
tdict = dict(zip(dcl["p1"], dcl["p1"]))
tdict.update(dict(zip(dcl["p2"], dcl["p1"])))
cit["p1"] = cit["cited"].map(tdict)
del(tdict)

In [104]:
# Merge gyear & location data
pdf = fastparquet.ParquetFile("RawData/Cleaned/patent_loc_unique_us_0628.parq").to_pandas(["patent", "inv_msa", "gyear"])
cit = cit.merge(pdf.add_prefix("citing_"), left_on = "citing", right_on = "citing_patent").drop("citing_patent",1)
# Cited patent's grant year
cit = cit.merge(pdf.add_prefix("cited_"), left_on = "cited", right_on = "cited_patent").drop("cited_patent",1)
# Target patent's location
cit = cit.merge(pdf[["patent", "inv_msa"]].add_prefix("p1_"), left_on = "p1", right_on = "p1_patent").drop("p1_patent",1)
del(pdf)

# Citation occured within 10 years of cited patent's grant date
cit["year_diff"] = cit["citing_gyear"]-cit["cited_gyear"]
print(len(cit))

# Citing inv msa matches tp inv msa
cit["inv_msa_match"] = (cit["p1_inv_msa"] == cit["citing_inv_msa"])

# Is cited patent target
cit["is_p1"] = (cit["p1"] == cit["cited"])

4210045


In [105]:
cit.head()

Unnamed: 0,citing,cited,p1,citing_inv_msa,citing_gyear,cited_inv_msa,cited_gyear,p1_inv_msa,year_diff,inv_msa_match,is_p1
0,8563689,5858784,5882929,"Durham-Chapel Hill, NC",2013,"San Francisco-Oakland-Fremont, CA",1999,"Boston-Cambridge-Quincy, MA-NH",14,False,False
1,6797276,5858784,5882929,"Washington-Arlington-Alexandria, DC-VA-MD-WV",2004,"San Francisco-Oakland-Fremont, CA",1999,"Boston-Cambridge-Quincy, MA-NH",5,False,False
2,8246934,5858784,5882929,"San Francisco-Oakland-Fremont, CA",2012,"San Francisco-Oakland-Fremont, CA",1999,"Boston-Cambridge-Quincy, MA-NH",13,False,False
3,8349294,5858784,5882929,"San Diego-Carlsbad-San Marcos, CA",2013,"San Francisco-Oakland-Fremont, CA",1999,"Boston-Cambridge-Quincy, MA-NH",14,False,False
4,8153602,5858784,5882929,"San Diego-Carlsbad-San Marcos, CA",2012,"San Francisco-Oakland-Fremont, CA",1999,"Boston-Cambridge-Quincy, MA-NH",13,False,False


In [106]:
# MSA Match Rate for each Patent
match_10 = cit.loc[(cit["year_diff"] <= 10), ["citing", "inv_msa_match"]].groupby(["citing"]).mean()

# Merge with lawyer data
dcl["p1_msa_match_p1"] = dcl["p1"].map(pd.Series(match_10["inv_msa_match"]))
dcl["p2_msa_match_p1"] = dcl["p2"].map(pd.Series(match_10["inv_msa_match"]))

In [109]:
d2 = dcl[["lawyer_match", "inv_msa_match", "p1_msa_match_p1", "p2_msa_match_p1"]]
d2.groupby(["lawyer_match", "inv_msa_match"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,p1_msa_match_p1,p2_msa_match_p1
lawyer_match,inv_msa_match,Unnamed: 2_level_1,Unnamed: 3_level_1
False,False,0.080094,0.080269
False,True,0.177832,0.167734
True,False,0.082771,0.083026
True,True,0.12931,0.130651


In [93]:
d2

Unnamed: 0,lawyer_match,p1_msa_match_p1,p2_msa_match_p2
0,True,,
4,True,,
18,True,,
39,True,,
40,True,,
47,False,,
52,False,,
54,False,,
57,True,,
62,True,,


In [None]:
# Remove non-cited patents from sample: some may have vanished from self-citation cleaning
print(len(ts))

tp_match_5 = cit.loc[(cit["is_tp"] == True) & (cit["year_diff"] <= 5), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()
cp_match_5 = cit.loc[(cit["is_tp"] == False) & (cit["year_diff"] <= 5), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()

tp_match_10 = cit.loc[(cit["is_tp"] == True) & (cit["year_diff"] <= 10), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()
cp_match_10 = cit.loc[(cit["is_tp"] == False) & (cit["year_diff"] <= 10), ["tp", "inv_msa_match"]].groupby(["tp"]).mean()

ts["tp_match_5"] = ts["tp"].map(pd.Series(tp_match_5["inv_msa_match"]))
ts["cp_match_5"] = ts["tp"].map(pd.Series(cp_match_5["inv_msa_match"]))

ts["tp_match_10"] = ts["tp"].map(pd.Series(tp_match_10["inv_msa_match"]))
ts["cp_match_10"] = ts["tp"].map(pd.Series(cp_match_10["inv_msa_match"]))

# Drop all values that have all 4 columns missing; some self-citation removal might have made this empty
ts = ts.dropna(subset=["tp_match_5", "cp_match_5", "tp_match_10", "cp_match_10"], how="all")
print(len(ts))