In [14]:
import numpy as np
import scipy as sp
import pandas as pd
import timeit
import datetime
import time
import pprint
import itertools
import pickle
import sklearn
import dask
import dask.dataframe as dd
import dask.array as da
import os
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')
import fastparquet
seed = 3
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Distances
import scipy.spatial.distance as distance
# KL
from scipy.stats import entropy
# Normalize
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Pairwise distances
from sklearn.metrics.pairwise import pairwise_distances
import h5py

## Claims similarities
- Source code here: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/LocalRegressions/2a-Similarities-0824.ipynb
- To also compute PC similarity, see: https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/MeanSim0930/1-MeanPCSim.ipynb

In [15]:
import logging

# Send progress messages to the notebook output (basicConfig) and to a
# per-day log file under Logs/.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)

log_path = 'Logs/pairsim_{0}.log'.format(datetime.datetime.now().strftime("%Y-%m-%d"))

# Re-running this cell used to attach another FileHandler each time, which
# writes every message to the file once per handler. Attach it only once.
already_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath(log_path)
    for handler in logger.handlers
)
if not already_attached:
    # The append mode is now passed to FileHandler; previously the 'a' was
    # swallowed by str.format() as an unused argument.
    logger.addHandler(logging.FileHandler(log_path, mode='a'))

# NOTE: this deliberately shadows the builtin print for the rest of the
# notebook, so every later print(...) call is written through logging.info.
print = logging.info
print('good day to you madam fiona')
print('started')
print(datetime.datetime.now())

INFO:root:good day to you madam fiona
INFO:root:started
INFO:root:2018-11-26 14:41:27.483220


In [17]:
def grouper(n, iterable):
    """
    Yield consecutive lists of at most ``n`` items taken from ``iterable``.

    Every list except possibly the last has exactly ``n`` items; iteration
    stops as soon as the source is exhausted (an empty batch is never
    produced).

    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    # Obtain the iterator up front so a non-iterable argument fails at call
    # time rather than on the first next().
    source = iter(iterable)

    def _batches():
        while True:
            batch = list(itertools.islice(source, n))
            if not batch:
                return
            yield batch

    return _batches()


# Claims-text similarity for every (tp, op) patent pair.
#
# For each regression sample k, this cell:
#   1. loads the distinct (tp, op) pairs,
#   2. looks up both patents' claim vectors and computes their cosine
#      similarity (plus a standardised copy of it),
#   3. merges the result back onto the existing regression columns and
#      writes one parquet file per sample.

# Vector models to score; each name selects one "<dm>_claims_pats" matrix.
dms = ["docvecs"]

# NOTE(review): these three lists are not read anywhere in this cell, and
# `cols` is overwritten further down. They are kept so that any other cell
# relying on them keeps working -- confirm and remove if unused.
cols = ["patent", "gyear", "inv_msa", "naics_name", "primclass"]
tp_cols = ["tp", "tp_gyear", "tp_inv_msa", "tp_naics_name", "tp_primclass"]
op_cols = ["op", "op_gyear", "op_inv_msa", "op_naics_name", "op_primclass"]

# Number of patent pairs scored per batch (bounds peak memory).
n_rows = 3000


def paired_cosine_similarity(a, b):
    """
    Cosine similarity between row i of ``a`` and row i of ``b``.

    This equals ``1 - np.diag(cdist(a, b, metric="cosine"))`` without
    building the full len(a) x len(b) distance matrix.

    Parameters
    ----------
    a, b : array-like, shape (n, d)
        Row-aligned vectors; converted to float64.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Similarities clipped to [-1, 1]. A pair in which either vector has
        zero norm yields NaN.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    dots = np.einsum("ij,ij->i", a, b)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    # 0/0 for zero-norm rows is intentional: it produces the NaN that the
    # caller later drops.
    with np.errstate(divide="ignore", invalid="ignore"):
        sims = dots / norms
    # Guard against rounding pushing |cos| marginally above 1.
    return np.clip(sims, -1.0, 1.0)


try:
    # The lookup table does not depend on k, so load it once instead of once
    # per sample.
    # NOTE(review): presumably maps patent id -> row position in the vector
    # matrix loaded below; confirm against the code that wrote the pickle.
    # pd.read_pickle must only be used on trusted, locally produced files.
    print("Getting row values")
    print(datetime.datetime.now())
    pat_dict = pd.read_pickle("RawData/Cleaned/pat_dict_claims_1120.pkl").to_dict()
    known_patents = list(pat_dict)

    for k in ["naics_name", "primclass"]:
        # 1. Load the distinct (tp, op) pairs for this sample
        print("computing dataframe")
        print(datetime.datetime.now())
        l2 = fastparquet.ParquetFile("DataStore/2018-10/Reg1016/{0}_all_1016.parq".format(k)).to_pandas(["tp", "op"])
        l2 = l2.drop_duplicates()
        print(len(l2))
        print("Finished")
        print(datetime.datetime.now())

        for dm in dms:
            print((k, dm, "started"))
            print("Loading matrix and dict")
            print(datetime.datetime.now())
            # Keep only pairs where both patents have an entry in pat_dict
            has_both = l2["tp"].isin(known_patents) & l2["op"].isin(known_patents)
            l3 = l2.loc[has_both, ["tp", "op"]].copy()

            pm = fastparquet.ParquetFile("DataStore/2018-07-P2/ML/{0}_claims_pats_1116.parq".format(dm))\
                .to_pandas().values

            # 2. Translate patent ids to matrix rows (Series.iteritems() no
            #    longer exists in pandas >= 2.0, so iterate over plain values)
            print("Getting chunks")
            print(datetime.datetime.now())
            tp_rows = np.array([pat_dict[p] for p in l3["tp"].tolist()])
            op_rows = np.array([pat_dict[p] for p in l3["op"].tolist()])

            print("Getting patent pair cosine similarity")
            print(datetime.datetime.now())
            # Score the pairs batch by batch; only one batch of vectors is
            # gathered from the matrix at a time.
            sims = np.empty(len(l3))
            for start in range(0, len(l3), n_rows):
                stop = start + n_rows
                sims[start:stop] = paired_cosine_similarity(pm[tp_rows[start:stop]],
                                                            pm[op_rows[start:stop]])
            del pm

            l3["sim_claims_{0}".format(dm)] = sims
            # Drop missing similarity
            l3 = l3.dropna(subset=["sim_claims_{0}".format(dm)])
            # Add the standardised similarity (zero mean, unit variance over
            # the pairs that have a similarity)
            l3["norm_sim_claims_{0}".format(dm)] = scaler.fit_transform(
                l3["sim_claims_{0}".format(dm)].values.reshape(-1, 1)).ravel()

            l2 = l2.merge(l3, how="left", on=["tp", "op"]).drop_duplicates()
            del l3
            print("finished")
            print(datetime.datetime.now())

        # 3. Merge with the original regression columns and save
        cols = ['tp', 'op', 'sim_docvecs', 'sim_ldavecs', 'tp_gyear', 'tp_naics_name',
                'op_naics_name', 'op_primclass', 'op_inv_msa', 'inv_msa_match',
                'primclass_match', 'norm_sim_ldavecs', 'norm_sim_docvecs', 'year_group',
                'common_est_inv', 'common_pat_inv', 'lawyer_match', 'num_common_cited',
                'norm_num_common_cited', 'tp_pct_common_cited',
                'norm_tp_pct_common_cited', 'common_cited_match', 'common_npc_match',
                'mean_sim_docvecs_pc', 'mean_sim_docvecs_pc_msa',
                'norm_mean_sim_docvecs_pc', 'norm_mean_sim_docvecs_pc_msa',
                'num_common_npc',
                'norm_num_common_npc', 'tp_primclass_FE', 'tp_inv_msa_FE',
                'tp_examiner_FE', 'tp_lawyer_FE', 'examiner_match']
        l1 = fastparquet.ParquetFile("DataStore/2018-10/Reg1016/{0}_all_1016.parq".format(k)).to_pandas(cols)
        print(len(l1))
        l2 = l2.merge(l1, how="left", on=["tp", "op"]).drop_duplicates()
        print(len(l2))
        fastparquet.write("DataStore/2018-11/{0}_sim_claims_1120.parq".format(k), l2, compression="GZIP")
        del l2
except Exception:
    # Best-effort long-running job: record the full traceback in the log and
    # let the notebook continue instead of crashing the kernel session.
    logging.exception("Claims similarity pipeline failed")


INFO:root:computing dataframe
INFO:root:2018-11-26 14:44:14.768235
INFO:root:1498184
INFO:root:Finished
INFO:root:2018-11-26 14:44:15.889062
INFO:root:Getting row values
INFO:root:2018-11-26 14:44:15.890911
INFO:root:('naics_name', 'docvecs', 'started')
INFO:root:Loading matrix and dict
INFO:root:2018-11-26 14:44:16.453753
INFO:root:Getting chunks
INFO:root:2018-11-26 14:44:30.856074
INFO:root:Getting patent pair cosine similarity
INFO:root:2018-11-26 14:44:44.385805
INFO:root:finished
INFO:root:2018-11-26 14:48:48.329301
INFO:root:1498184
INFO:root:1498184
INFO:root:computing dataframe
INFO:root:2018-11-26 14:50:47.282074
INFO:root:1369833
INFO:root:Finished
INFO:root:2018-11-26 14:50:47.996860
INFO:root:Getting row values
INFO:root:2018-11-26 14:50:47.999432
INFO:root:('primclass', 'docvecs', 'started')
INFO:root:Loading matrix and dict
INFO:root:2018-11-26 14:50:48.679639
INFO:root:Getting chunks
INFO:root:2018-11-26 14:51:03.022831
INFO:root:Getting patent pair cosine similarity
IN

In [24]:
# Reload the parquet file written for the naics_name sample so the new
# claims-similarity columns can be inspected.
k = "naics_name"
saved_path = "DataStore/2018-11/{0}_sim_claims_1120.parq".format(k)
l2 = fastparquet.ParquetFile(saved_path).to_pandas()

In [25]:
# Correlation matrix of the claims-based similarity columns against the
# existing sim_docvecs columns (raw and standardised).
l2.loc[:, ["sim_claims_docvecs", "norm_sim_claims_docvecs",
           "sim_docvecs", "norm_sim_docvecs"]].corr()

Unnamed: 0,sim_claims_docvecs,norm_sim_claims_docvecs,sim_docvecs,norm_sim_docvecs
sim_claims_docvecs,1.0,1.0,0.265871,0.265871
norm_sim_claims_docvecs,1.0,1.0,0.265871,0.265871
sim_docvecs,0.265871,0.265871,1.0,1.0
norm_sim_docvecs,0.265871,0.265871,1.0,1.0


In [26]:
# Summary statistics of the four similarity columns, rounded to 2 decimals.
l2[["sim_claims_docvecs", "norm_sim_claims_docvecs",
    "sim_docvecs", "norm_sim_docvecs"]].describe().round(2)

Unnamed: 0,sim_claims_docvecs,norm_sim_claims_docvecs,sim_docvecs,norm_sim_docvecs
count,1498129.0,1498129.0,1481000.0,1481000.0
mean,0.1,0.0,0.13,0.0
std,0.14,1.0,0.14,1.0
min,-0.76,-6.31,-0.42,-4.05
25%,0.01,-0.67,0.04,-0.68
50%,0.09,-0.07,0.12,-0.06
75%,0.18,0.58,0.21,0.61
max,0.94,6.2,0.68,4.07


In [27]:
# Mean raw similarity, split by whether the pair's primclass_match flag is set.
l2.groupby("primclass_match")[["sim_claims_docvecs", "sim_docvecs"]].mean()

Unnamed: 0_level_0,sim_claims_docvecs,sim_docvecs
primclass_match,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.086232,0.120031
True,0.162213,0.195126


In [28]:
# Mean raw similarity, split by whether the pair's inv_msa_match flag is set.
l2.groupby("inv_msa_match")[["sim_claims_docvecs", "sim_docvecs"]].mean()

Unnamed: 0_level_0,sim_claims_docvecs,sim_docvecs
inv_msa_match,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.094907,0.128775
True,0.103066,0.135926


In [29]:
# Mean standardised similarity, split by the primclass_match flag.
l2.groupby("primclass_match")[["norm_sim_claims_docvecs", "norm_sim_docvecs"]].mean()

Unnamed: 0_level_0,norm_sim_claims_docvecs,norm_sim_docvecs
primclass_match,Unnamed: 1_level_1,Unnamed: 2_level_1
False,-0.078224,-0.077736
True,0.478605,0.47849


In [31]:
# Mean standardised similarity, split by the inv_msa_match flag.
l2.groupby("inv_msa_match")[["norm_sim_claims_docvecs", "norm_sim_docvecs"]].mean()

Unnamed: 0_level_0,norm_sim_claims_docvecs,norm_sim_docvecs
inv_msa_match,Unnamed: 1_level_1,Unnamed: 2_level_1
False,-0.014653,-0.012972
True,0.04514,0.040001
