In [2]:
import scipy as sp
import numpy as np
import pandas as pd
import timeit
import matplotlib.pyplot as plt
import requests
import pprint
import sklearn
import gensim
import datetime
import json
import pickle
import fastparquet
import string
# Random seed; used further down as `random_state` when sub-sampling patents for Doc2Vec training.
seed = 3

import os
# NOTE(review): hard-coded absolute working directory. Every relative path used below
# (RawData/, Logs/, DataStore/) is resolved against it, so the notebook only runs on this machine.
os.chdir('/mnt/t48/bighomes-active/sfeng/patentdiffusion/')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


## Loading claims text

In [4]:
%%time
iter_csv = pd.read_csv("RawData/PatentsView/claim.tsv", sep='\t', iterator=True, chunksize=100000)
cl = pd.concat([chunk[chunk['sequence'] <= 3] for chunk in iter_csv])

CPU times: user 21min 2s, sys: 1min 5s, total: 22min 8s
Wall time: 25min 4s


In [5]:
# Get unduplicated US patents: load only the `patent` column of the location file
pdf = fastparquet.ParquetFile(
    "RawData/Cleaned/patent_loc_unique_us_0628.parq"
).to_pandas(["patent"])
# Patents listed in the duplicate-text pickle are excluded below
dup_pats = pd.read_pickle("RawData/Cleaned/duplicate_pattext_0712.pkl").tolist()
# Get relevant US Patents; note that `pdf` becomes a plain Python list here
pdf = pdf["patent"].loc[~pdf["patent"].isin(dup_pats)].tolist()
del dup_pats

In [7]:
# # Number of texts
# print(len(cl))

# # Remove non utility patents
# %time cl = cl.loc[cl["patent_id"].apply(lambda x: x.isdigit() == True)]
# cl["patent_id"] = cl["patent_id"].astype(int)
# print(len(cl))

# # Remove missing text
# %time cl = cl.dropna(subset=["patent_id", "text"])
print(len(cl))

# Remove patents not in relevant US patents
cl = cl.loc[cl["patent_id"].isin(pdf)]
print(len(cl))

# Reset index
cl = cl.reset_index(drop=True)

# Words to drop; have to set regex-True
rem_dict = ["1.", "2.", "3.", "4.", "5.", "claim 1", "claim 2", "claim 3", "claim 4", "claim 5"]
rem_dict = dict(zip(rem_dict, [""]*len(rem_dict)))
%time cl_text = cl["text"].replace(rem_dict, regex=True)
cl["text"] = cl_text
del(cl_text)

17735902
6551471
CPU times: user 2min 9s, sys: 2.11 s, total: 2min 11s
Wall time: 2min 10s


In [9]:
# Preview the first rows of the filtered, cleaned claims table.
cl.head()

Unnamed: 0,patent_id,text
0,4968079,A golf ball retriever having no moving parts c...
1,6035330,A system for navigating a plurality of compute...
2,4031147,A process according to claim wherein said elec...
3,6188117,A gate electrode comprising:an insulative laye...
4,6883717,The secure credit card of claim wherein the c...


### Issues saving claims text
- fastparquet raised errors when writing this table, apparently because it has too many rows. Workaround: write with the pyarrow engine instead, as described here: https://stackoverflow.com/questions/50782252/pandas-to-parquet-fails-on-large-datasets

In [13]:
# Keep only the two columns needed downstream, then write them with the pyarrow engine
cl = cl.loc[:, ["patent_id", "text"]]
cl.to_parquet(
    "RawData/Cleaned/claims1112.parq",
    engine="pyarrow",
    compression="gzip",
)
# fastparquet.write("RawData/Cleaned/claims1112.parq", cl, compression="GZIP", row_group_offsets=20000000)

## Cleaning claims text

In [27]:
# Load first
%time cl = pd.concat((df_partial for df_partial in fastparquet.ParquetFile("RawData/Cleaned/claims1112.parq").iter_row_groups()), axis=0)

CPU times: user 58.8 s, sys: 1.2 s, total: 60 s
Wall time: 1min 1s


In [36]:
# Put all patents and claims together
# %time cl = cl.groupby("patent_id")
%time claims = ((n, g["text"].tolist()) for n,g in cl)
%time claims = [(n, " ".join(c)) for n, c in claims]

cl2 = pd.DataFrame({"patent": [i[0] for i in claims], "claims": [i[1] for i in claims]})
print(len(cl2))

CPU times: user 46 µs, sys: 0 ns, total: 46 µs
Wall time: 57.5 µs
CPU times: user 10min 47s, sys: 3.52 s, total: 10min 50s
Wall time: 10min 51s
2220680


In [44]:
# Patent to three merged claims texts
# cl2.to_parquet("RawData/Cleaned/claims1112.parq", engine="pyarrow", compression="gzip")
# Drop the per-claim frame and the intermediate list of pairs
del cl, claims

In [41]:
# Sample string: merged claims text stored under index label 0
s = cl2.at[0, "claims"]

In [None]:
from gensim.parsing.preprocessing import *
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, remove_stopwords, stem_text]
%time cl_st = [" ".join(preprocess_string(c, CUSTOM_FILTERS)) for c in iter(cl2["claims"])]
cl2["claims_stemmed"] = cl_st

In [None]:
# NOTE(review): this writes to the same path as the per-claim table saved earlier
# (patent_id/text), replacing it with the merged per-patent table `cl2`.
cl2.to_parquet("RawData/Cleaned/claims1112.parq", engine="pyarrow", compression="gzip")

In [47]:
# Preview the merged table, including the new `claims_stemmed` column.
cl2.head()

Unnamed: 0,claims,patent,claims_stemmed
0,"A bed arrangement comprisinga bed frame,a side...",3930273,bed arrang comprisinga bed frame rail assembl ...
1,An assembly as defined in claim whereinthe col...,3930274,assembl defin claim whereinth collaps contain ...
2,The method of claim wherein the heat-sealing o...,3930275,method claim heat seal step e carri moder die ...
3,"In a vehicle washing device, a passageway for ...",3930276,vehicl wash devic passagewai vehicl wheel have...
4,The device of claim in which said plate means ...,3930277,devic claim said plate mean includ flat plate ...


### Further preprocessing

- Filtering min_df at least 0.0001 gave me only 9995 terms!
- Filtering min_df at 0.00001 gave me 30889

#### 1. First, find list of terms to keep

In [84]:
%time pats = pd.concat((df_partial for df_partial in fastparquet.ParquetFile("RawData/Cleaned/claims1112.parq").iter_row_groups()), axis=0)

CPU times: user 54.8 s, sys: 589 ms, total: 55.4 s
Wall time: 56.6 s


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.1, min_df=0.00001, stop_words="english")
%time filt = cv.fit_transform(pats["claims_stemmed"].tolist())

# Get features
filt = cv.get_feature_names()
print(len(filt))
del(cv)

pd.Series(filt).to_pickle("RawData/Cleaned/claims_features_1120.pkl")

CPU times: user 5min 15s, sys: 1.75 s, total: 5min 16s
Wall time: 5min 14s
30787


#### 2. Filter each claims text
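Using a generator together with a `set` for the membership test gives a huge speedup (the timings below show that the gain comes from the set lookup: 48.6 ms with the list vs. 1.77 ms with a set)! Need to remember this one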

In [66]:
# Pick one stemmed document (index label 10) to use as the benchmark input below.
test = pats.loc[10, "claims_stemmed"]
test

'beverag contain dispens compris flexibl bag have sealabl open fill bag b valv attach wall bag said valv have oper member capabl movement close posit close valv close movement serv punctur bag said oper member movabl dispens posit open valv allow dispens beverag bag c support structur have oppos wall join diverg have flat join lower edg wall act base support structur bag upright displai dispens posit andd said valv project wall support structur beverag dispens construct compris structur claim combin rigid self support hous have open end said area support structur separ fit open end contain dispens claim wall support structur valv project area surround valv weaken readili remov remain attach valv'

In [75]:
def tokenize(text, filter_list):
    """Lazily yield the whitespace-separated words of ``text`` that are in ``filter_list``.

    Words are produced in their original order and repeated words are kept.
    ``filter_list`` can be any container supporting ``in``; pass a ``set`` for
    fast lookups when the vocabulary is large.
    """
    yield from (token for token in text.split() if token in filter_list)

In [78]:
# Benchmark 1: tokenize() with `filt` passed as a list (every lookup scans the list).
%timeit " ".join(tokenize(test, filt))
print(" ".join(tokenize(test, filt)))

# Benchmark 2: tokenize() with a set; the timed statement includes building set(filt).
%timeit " ".join(tokenize(test, set(filt)))
# NOTE(review): this print passes the list `filt` again rather than set(filt);
# the filtered text is the same either way, only the lookup cost differs.
print(" ".join(tokenize(test, filt)))

48.6 ms ± 333 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.77 ms ± 38.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


'beverag dispens flexibl bag sealabl bag valv wall bag valv capabl movement close close valv close movement serv punctur bag movabl dispens valv allow dispens beverag bag structur oppos wall join diverg flat join lower edg wall act structur bag upright displai dispens andd valv project wall structur beverag dispens construct structur combin rigid self hous area structur separ fit dispens wall structur valv project area surround valv weaken readili remov remain valv'

In [70]:
# Inline variants of the same filter, for comparison with tokenize().
# 1) List comprehension testing membership in the list `filt`.
%timeit t = " ".join([x for x in test.split(" ") if x in filt])
print(" ".join([x for x in test.split(" ") if x in filt]))
# 2) List comprehension with set(filt) inside the condition, so the set is rebuilt for every word.
%timeit t = " ".join([x for x in test.split(" ") if x in set(filt)])
print(" ".join([x for x in test.split(" ") if x in set(filt)]))
# 3) Generator expression, again rebuilding set(filt) for every word.
%timeit t = " ".join((x for x in test.split(" ") if x in set(filt)))
print(" ".join((x for x in test.split(" ") if x in set(filt))))

INFO:root:beverag dispens flexibl bag sealabl bag valv wall bag valv capabl movement close close valv close movement serv punctur bag movabl dispens valv allow dispens beverag bag structur oppos wall join diverg flat join lower edg wall act structur bag upright displai dispens andd valv project wall structur beverag dispens construct structur combin rigid self hous area structur separ fit dispens wall structur valv project area surround valv weaken readili remov remain valv


44.6 ms ± 563 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


INFO:root:beverag dispens flexibl bag sealabl bag valv wall bag valv capabl movement close close valv close movement serv punctur bag movabl dispens valv allow dispens beverag bag structur oppos wall join diverg flat join lower edg wall act structur bag upright displai dispens andd valv project wall structur beverag dispens construct structur combin rigid self hous area structur separ fit dispens wall structur valv project area surround valv weaken readili remov remain valv


181 ms ± 731 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
191 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


INFO:root:beverag dispens flexibl bag sealabl bag valv wall bag valv capabl movement close close valv close movement serv punctur bag movabl dispens valv allow dispens beverag bag structur oppos wall join diverg flat join lower edg wall act structur bag upright displai dispens andd valv project wall structur beverag dispens construct structur combin rigid self hous area structur separ fit dispens wall structur valv project area surround valv weaken readili remov remain valv


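Trying the generator version on the first 100 documents, because the list comprehension scales badly: every word triggers a linear scan of the list `filt`, so the cost grows roughly with (number of words) × (vocabulary size) rather than O(n^2) in the text length.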

In [81]:
# Benchmark on the first 100 stemmed documents.
pl = pats["claims_stemmed"][:100]
# Inline list comprehension with membership tests against the list `filt`.
%timeit t = [" ".join([x for x in test.split(" ") if x in filt]) for test in pl]
# %timeit t = [" ".join([x for x in test.split(" ") if x in set(filt)]) for test in pl]
# %timeit t = [" ".join((x for x in test.split(" ") if x in set(filt))) for test in pl]
# %timeit t = [" ".join(tokenize(test, filt)) for test in pl]
# tokenize() with a set; set(filt) is rebuilt once per document here.
%timeit t = [" ".join(tokenize(test, set(filt))) for test in pl]

8.15 s ± 410 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
179 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
print(datetime.datetime.now())
# %time cl = (c.split() for c in iter(pats["claims_stemmed"]))
%time cl1 = (tokenize(c, set(filt)) for c in iter(pats["claims_stemmed"]))
%time cl = [" ".join(c) for c in cl1]
print(datetime.datetime.now())

INFO:root:2018-11-21 13:12:59.177647


CPU times: user 143 ms, sys: 1 ms, total: 144 ms
Wall time: 142 ms


In [None]:
print(datetime.datetime.now())
print("Saving")
# Replace the stemmed column with its vocabulary-filtered version ...
pats["claims_stemmed"] = cl
# ... and overwrite the parquet file the table was loaded from.
pats.to_parquet("RawData/Cleaned/claims1112.parq", engine="pyarrow", compression="gzip")
print(datetime.datetime.now())
print("Finished saving")
# Release the list of filtered strings; the data now lives in `pats`.
del(cl)

## Fitting DV on Claims

1. First attempt was just copying and pasting https://sfengc7.stern.nyu.edu:8888/notebooks/patentdiffusion/201808Results/FittingModels/1b-FitDV.ipynb, but it produced terrible results
2. Need to preprocess the text to get rid of very common and extremely uncommon terms, because the language is very similar across all claims texts

In [50]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# basicConfig attaches the console handler (only if the root logger has none yet),
# so it must run before the file handler is added.
logging.basicConfig(level=logging.INFO)

# One log file per calendar day under Logs/; create the folder if it is missing.
os.makedirs('Logs', exist_ok=True)
log_path = 'Logs/DV_{0}.log'.format(datetime.datetime.now().strftime("%Y-%m-%d"))
# Attach the file handler only once, so re-running this cell does not write every
# message several times. The append mode now goes to FileHandler; it used to be
# passed to str.format() as an ignored extra argument.
already_attached = any(
    isinstance(h, logging.FileHandler) and h.baseFilename == os.path.abspath(log_path)
    for h in logger.handlers
)
if not already_attached:
    logger.addHandler(logging.FileHandler(log_path, mode='a'))

# NOTE(review): from here on print() is logging.info, so it takes a single message
# argument; later cells rely on this (e.g. by printing a tuple).
print = logging.info
print('good day to you madam fiona')

INFO:root:good day to you madam fiona


In [None]:
# Load first
print("Loading files")
%time pats = pd.concat((df_partial for df_partial in fastparquet.ParquetFile("RawData/Cleaned/claims1112.parq").iter_row_groups()), axis=0)
pats = pats.sample(frac=0.7, random_state = seed)
claims_stemmed = pats["claims_stemmed"].tolist()
pat_labels = pats["patent"].astype(str).tolist()
del(pats)

In [None]:
from gensim import models
from gensim.models.doc2vec import TaggedDocument

class DocIterator:
    """Re-iterable stream of ``TaggedDocument`` objects for Doc2Vec.

    Every call to ``__iter__`` starts a fresh pass over the documents, which
    lets the same object be handed to both ``build_vocab`` and ``train``.

    doc_list    -- texts, each a whitespace-separated string of tokens
    labels_list -- tag for each text, matched to ``doc_list`` by position
    """

    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for position, text in enumerate(self.doc_list):
            tokens = text.split()
            tag = self.labels_list[position]
            # Each document carries exactly one tag
            yield TaggedDocument(words=tokens, tags=[tag])


iterator = DocIterator(claims_stemmed, pat_labels)
print('building vocabulary')

%time model = gensim.models.Doc2Vec(size=100, window=10, min_count=50, workers=4, alpha=0.025, min_alpha=0.025)
%time model.build_vocab(iterator)

print('done building vocabulary')
print('start training the model')

starttime = datetime.datetime.now()
print("start")
print(starttime)
model.train(iterator, total_examples=model.corpus_count, epochs=30)
endtime = datetime.datetime.now()
print("end")
print(endtime)

model.save("DataStore/2018-07-P2/ML/doc2vec_claims_1119.model")
print("Finished saving model")
print(datetime.datetime.now())

In [None]:
%time model = gensim.models.doc2vec.Doc2Vec.load("DataStore/2018-07-P2/ML/doc2vec_claims_1119.model")
%time pa = pd.concat((df_partial for df_partial in fastparquet.ParquetFile("RawData/Cleaned/claims1112.parq").iter_row_groups()), axis=0)
%time pa_pats = pd.Series(list(range(len(pa))), index=pa["patent"].tolist())
%time pa_pats.to_pickle("RawData/Cleaned/pat_dict_claims_1120.pkl")
del(pa_pats)

pa = pa["claims_stemmed"].tolist()
print(len(pa))
try:
    texts = [t.split() for t in pa]
    print((len(texts), "Text Length"))
    print("inferring new vectors")
    print(datetime.datetime.now())
    new_vecs = [model.infer_vector(t) for t in texts]
    print("finished inferring new vectors")
    print(datetime.datetime.now())
    nv = pd.DataFrame(new_vecs, columns = [str(i) for i in range(100)], index = range(len(new_vecs)))
    print((len(nv), "New Vectors Length"))
    fastparquet.write("DataStore/2018-07-P2/ML/docvecs_claims_pats_1116.parq", nv, compression = "GZIP")
    
except Exception as e:
    logging.exception("message")

In [4]:
# Sanity check: number of inferred vectors.
len(nv)

2220680

In [8]:
# Sanity check: the saved patent-to-row lookup should have as many entries as there are vectors.
pa_pats = pd.read_pickle("RawData/Cleaned/pat_dict_claims_1120.pkl")
len(pa_pats)

2220680