## Patent Predict

In [72]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding
from tensorflow.keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.preprocessing import text, sequence

import spacy
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.tokenize import word_tokenize

import pandas as pd
import numpy as np
import requests
import json

from pandas.io.json import json_normalize
import pickle
from collections import ChainMap

In [2]:
np.random.seed(3)

#### Import data from PatentsView API

In [3]:
# pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 50)

In [4]:
# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

In [5]:
# build list from file of possible fields that endpoint request will return
df = pd.read_excel("data/patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()
len(pat_fields) # 184 possible fields

184

#### Construct and run GET request

In [6]:
# pass directly into browser
# http://www.patentsview.org/api/patents/query?q={"_text_any":{"patent_abstract":"natural langugage processing"}}
# patents = []

query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# options (works) = {"page":1, "per_page":10}

# other queries - uncomment to run
# query (works) ={"_text_all":{"patent_abstract":"nlp"}},{"_text_all":{"patent_abstract":"natural language processing"}}]}
# 529 results: {"_text_phrase":{"patent_abstract":"natural language processing"}} 
# 858 results: {"_text_all":{"patent_abstract":"natural language processing"}} 
# 957 results: query={"_or":[{"_text_all":{"patent_title":"natural language processing"}},{"_text_all":{"patent_abstract":"natural language processing"}}]}

In [7]:
# request and results
resp = requests.get(endpoint_url, params=params)
results = resp.json()

#### Inspect results from GET request

In [8]:
# extract metadata from response
print("status code:", resp.status_code,';', "reason:", resp.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


In [9]:
# extract data from response
data = results['patents']
# data[0]
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,IPCs,application_citations,applications,assignees,cited_patents,citedby_patents,cpcs,detail_desc_length,examiners,foreign_priority,gov_interests,inventors,lawyers,nbers,patent_abstract,patent_average_processing_time,patent_date,patent_firstnamed_assignee_city,patent_firstnamed_assignee_country,patent_firstnamed_assignee_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_state,patent_firstnamed_inventor_city,patent_firstnamed_inventor_country,patent_firstnamed_inventor_id,patent_firstnamed_inventor_latitude,patent_firstnamed_inventor_location_id,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_state,patent_kind,patent_num_cited_by_us_patents,patent_num_claims,patent_num_combined_citations,patent_num_foreign_citations,patent_num_us_application_citations,patent_num_us_patent_citations,patent_number,patent_processing_time,patent_title,patent_type,patent_year,pct_data,rawinventors,uspcs,wipos
0,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020077823', 'ap...","[{'app_country': 'US', 'app_date': '2013-07-26...","[{'assignee_city': 'Burlington', 'assignee_cou...",[{'cited_patent_category': 'cited by examiner'...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",11570,"[{'examiner_first_name': 'Michael N', 'examine...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Newton', 'inventor_country...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...",Designing a natural language understanding (NL...,,2019-03-12,Burlington,US,org_ID497r4tFbCIaMBjGAST,42.5047,42.5047|-71.1961,-71.1961,MA,Newton,US,7788103-1,42.3369,42.3369|-71.2097,-71.2097,MA,B2,0,19,31,0,26,5,10229106,2055,Initializing a workspace for building a natura...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Jeffrey N.', 'raw...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
1,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020138265', 'ap...","[{'app_country': 'US', 'app_date': '2017-09-11...","[{'assignee_city': 'Mountain View', 'assignee_...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",28118,"[{'examiner_first_name': 'Shreyans A', 'examin...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Adliswil', 'inventor_count...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...","Methods, systems, and apparatus, including com...",,2019-03-12,Mountain View,US,org_p6ofWD2xFNSnyYkj6wpA,37.3861,37.3861|-122.0828,-122.083,CA,Adliswil,CH,8352247-1,47.3119,47.3119|8.5287,8.5287,,B1,0,20,15,0,7,8,10229109,547,Allowing spelling of arbitrary words,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Evgeny A.', 'rawi...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
2,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2001/20010029455', 'ap...","[{'app_country': 'US', 'app_date': '2016-09-28...","[{'assignee_city': 'Seattle', 'assignee_countr...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",119654,"[{'examiner_first_name': 'Jialong', 'examiner_...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Seattle', 'inventor_countr...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...",A content management system (CMS) and a transl...,,2019-03-12,Seattle,US,org_Vbc6obpnxWM42d0HjlXY,47.6064,47.6064|-122.3308,-122.331,WA,Seattle,US,9177341-1,47.6064,47.6064|-122.3308,-122.331,WA,B1,0,20,74,0,48,26,10229113,895,Leveraging content dimensions during the trans...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Thibault Pierre',...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."


In [10]:
# ser = df_assignees['assignee_id'].apply(pd.Series)
# len(ser)
# ser.duplicated()

#### Subset dataframe

In [11]:
df.columns

Index(['IPCs', 'application_citations', 'applications', 'assignees',
       'cited_patents', 'citedby_patents', 'cpcs', 'detail_desc_length',
       'examiners', 'foreign_priority', 'gov_interests', 'inventors',
       'lawyers', 'nbers', 'patent_abstract', 'patent_average_processing_time',
       'patent_date', 'patent_firstnamed_assignee_city',
       'patent_firstnamed_assignee_country', 'patent_firstnamed_assignee_id',
       'patent_firstnamed_assignee_latitude',
       'patent_firstnamed_assignee_location_id',
       'patent_firstnamed_assignee_longitude',
       'patent_firstnamed_assignee_state', 'patent_firstnamed_inventor_city',
       'patent_firstnamed_inventor_country', 'patent_firstnamed_inventor_id',
       'patent_firstnamed_inventor_latitude',
       'patent_firstnamed_inventor_location_id',
       'patent_firstnamed_inventor_longitude',
       'patent_firstnamed_inventor_state', 'patent_kind',
       'patent_num_cited_by_us_patents', 'patent_num_claims',
       'paten

In [12]:
df = df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id', 
         'patent_year', 
         'patent_type', 
         'patent_kind']]
df.head(3)

# other field options - uncomment to use
# df = df[['patent_number', 
#          'patent_date', 
#          'patent_title',
#          'patent_abstract', 
#          'patent_firstnamed_assignee_id',
#          'patent_firstnamed_assignee_location_id',
#          'patent_firstnamed_assignee_latitude',
#          'patent_firstnamed_assignee_longitude',
#          'patent_firstnamed_assignee_city',
#          'patent_firstnamed_assignee_state',
#          'patent_firstnamed_assignee_country', 
#          'patent_firstnamed_inventor_id',
#          'patent_firstnamed_inventor_location_id',
#          'patent_firstnamed_inventor_latitude',
#          'patent_firstnamed_inventor_longitude',
#          'patent_firstnamed_inventor_city',
#          'patent_firstnamed_inventor_state',
#          'patent_firstnamed_inventor_country',
#          'patent_year', 
#          'patent_type', 
#          'patent_kind',
#          'patent_processing_time', 
#          'patent_num_us_application_citations', 
#          'patent_num_us_patent_citations', 
#          'patent_num_foreign_citations', 
#          'patent_num_combined_citations', 
#          'patent_num_claims', 
#          'patent_num_cited_by_us_patents',
#          'detail_desc_length']]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1


In [13]:
len(df)

2482

In [14]:
df.columns

Index(['patent_number', 'patent_date', 'patent_title', 'patent_abstract',
       'patent_firstnamed_assignee_id', 'patent_year', 'patent_type',
       'patent_kind'],
      dtype='object')

In [15]:
df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

0    Initializing a workspace for building a natura...
1    Allowing spelling of arbitrary words Methods, ...
2    Leveraging content dimensions during the trans...
Name: patent_title_abstract, dtype: object

In [16]:
# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

561

In [17]:
df.patent_firstnamed_assignee_id.value_counts()[:10]

org_q9Bn28RHhpYrQjKvraAH    497
org_JZguWDMfFOBX2wBI9pnD    129
org_ID497r4tFbCIaMBjGAST     88
org_rDyHZBYWMcBEtnkHt05L     80
org_p6ofWD2xFNSnyYkj6wpA     57
org_EilEWQcC6UiqHcSGx9mb     56
org_ccMMcUijAIsKIxUqMTyP     49
org_Vbc6obpnxWM42d0HjlXY     41
org_9D8x1qL3IRASp6GG7Glu     29
org_2wAdIFKssfcLHpZq0u4H     26
Name: patent_firstnamed_assignee_id, dtype: int64

In [18]:
# list of assignees with > 20 patents in df dataset
assignees_list = ['org_q9Bn28RHhpYrQjKvraAH', 'org_JZguWDMfFOBX2wBI9pnD', 'org_ID497r4tFbCIaMBjGAST', 
                  'org_rDyHZBYWMcBEtnkHt05L', 'org_p6ofWD2xFNSnyYkj6wpA', 'org_EilEWQcC6UiqHcSGx9mb',
                  'org_ccMMcUijAIsKIxUqMTyP', 'org_Vbc6obpnxWM42d0HjlXY', 'org_9D8x1qL3IRASp6GG7Glu',
                  'org_2wAdIFKssfcLHpZq0u4H', 'org_iwO2oOJ6VIBd9fAuP7G6', 'org_70D1lR89kQnFiCFdJ6s5',
                  'org_vojVnDkT9CamDETqbqJC']

In [19]:
df.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2,Initializing a workspace for building a natura...
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1,"Allowing spelling of arbitrary words Methods, ..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1,Leveraging content dimensions during the trans...


In [20]:
df_20pats = df[df['patent_firstnamed_assignee_id'].isin(assignees_list) ]

In [21]:
df_20pats.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2,Initializing a workspace for building a natura...
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1,"Allowing spelling of arbitrary words Methods, ..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1,Leveraging content dimensions during the trans...


In [22]:
df_20pats.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2,Initializing a workspace for building a natura...
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1,"Allowing spelling of arbitrary words Methods, ..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1,Leveraging content dimensions during the trans...


In [23]:
# see error message
df_20pats.sort_values(by=['patent_date'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Partition data into train and test sets

In [24]:
df_20pats[:5]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
2479,4502128,1985-02-26,Translation between natural languages,An input sentence described by a first natural...,org_70D1lR89kQnFiCFdJ6s5,1985,utility,A,Translation between natural languages An input...
2477,4599612,1986-07-08,Displaying and correcting method for machine t...,In a system wherein a first text in a first na...,org_70D1lR89kQnFiCFdJ6s5,1986,utility,A,Displaying and correcting method for machine t...
2475,4661924,1987-04-28,Multiple-parts-of-speech disambiguating method...,A machine translation system comprises input m...,org_70D1lR89kQnFiCFdJ6s5,1987,utility,A,Multiple-parts-of-speech disambiguating method...
2471,4736296,1988-04-05,Method and apparatus of intelligent guidance i...,A method and apparatus of intelligent guidance...,org_70D1lR89kQnFiCFdJ6s5,1988,utility,A,Method and apparatus of intelligent guidance i...
2466,4887212,1989-12-12,Parser for natural language text,An improved natural language text parser is di...,org_q9Bn28RHhpYrQjKvraAH,1989,utility,A,Parser for natural language text An improved n...


In [25]:
train_20pats = df_20pats[:894]
len(train_20pats)

894

In [26]:
train_20pats[:5]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
2479,4502128,1985-02-26,Translation between natural languages,An input sentence described by a first natural...,org_70D1lR89kQnFiCFdJ6s5,1985,utility,A,Translation between natural languages An input...
2477,4599612,1986-07-08,Displaying and correcting method for machine t...,In a system wherein a first text in a first na...,org_70D1lR89kQnFiCFdJ6s5,1986,utility,A,Displaying and correcting method for machine t...
2475,4661924,1987-04-28,Multiple-parts-of-speech disambiguating method...,A machine translation system comprises input m...,org_70D1lR89kQnFiCFdJ6s5,1987,utility,A,Multiple-parts-of-speech disambiguating method...
2471,4736296,1988-04-05,Method and apparatus of intelligent guidance i...,A method and apparatus of intelligent guidance...,org_70D1lR89kQnFiCFdJ6s5,1988,utility,A,Method and apparatus of intelligent guidance i...
2466,4887212,1989-12-12,Parser for natural language text,An improved natural language text parser is di...,org_q9Bn28RHhpYrQjKvraAH,1989,utility,A,Parser for natural language text An improved n...


In [27]:
test_20pats = df_20pats[894:]
len(test_20pats)

224

In [28]:
# TODO (Lee) - find better way to partition based on dates by percentage
1118 * .8

1118 *.2

1118 * .8 + 1118 *.2

1118.0

#### Explore nested datasets - assignees

In [29]:
df_assignees = json_normalize(results['patents'], record_path=['assignees'], meta=['patent_number'])

In [30]:
df_assignees[df_assignees['assignee_id'] == "org_SEywROQVbKV7Zj6CtfEE"]

Unnamed: 0,assignee_city,assignee_country,assignee_county,assignee_county_fips,assignee_first_name,assignee_first_seen_date,assignee_id,assignee_key_id,assignee_last_name,assignee_last_seen_date,assignee_lastknown_city,assignee_lastknown_country,assignee_lastknown_latitude,assignee_lastknown_location_id,assignee_lastknown_longitude,assignee_lastknown_state,assignee_latitude,assignee_location_id,assignee_longitude,assignee_organization,assignee_sequence,assignee_state,assignee_state_fips,assignee_total_num_inventors,assignee_total_num_patents,assignee_type,patent_number
1911,Tokyo,JP,,0,,2007-10-16,org_SEywROQVbKV7Zj6CtfEE,344976,,2007-10-16,Tokyo,JP,35.685,35.685|139.7514,139.751,,35.685,35.685|139.7514,139.751,"Fuji Xexox Co., Ltd.",0,,0,4,1,3,7283958


In [31]:
df_assignees[df_assignees['patent_number'] == "10210245"]

Unnamed: 0,assignee_city,assignee_country,assignee_county,assignee_county_fips,assignee_first_name,assignee_first_seen_date,assignee_id,assignee_key_id,assignee_last_name,assignee_last_seen_date,assignee_lastknown_city,assignee_lastknown_country,assignee_lastknown_latitude,assignee_lastknown_location_id,assignee_lastknown_longitude,assignee_lastknown_state,assignee_latitude,assignee_location_id,assignee_longitude,assignee_organization,assignee_sequence,assignee_state,assignee_state_fips,assignee_total_num_inventors,assignee_total_num_patents,assignee_type,patent_number
36,Beijing,CN,,0,,1990-04-17,org_myRnscKfY7JOy5h8LVrg,267177,,2019-02-19,Beijing,CN,39.9042,39.9042|116.4074,116.407,,39.9042,39.9042|116.4074,116.407,Peking University,0,,0,463,224,3,10210245
37,Shenzhen,CN,,0,,2009-06-23,org_O0GfNE8msswIVOwTLezZ,282280,,2019-03-12,Shenzhen,CN,22.5333,22.5333|114.1333,114.133,,22.5333,22.5333|114.1333,114.133,TENCENT TECHNOLOGY (SHENZHEN) COMPANY LIMITED,1,,0,1977,1421,3,10210245


In [32]:
df[df['patent_number'] == "10210245"]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
36,10210245,2019-02-19,Natural language question answering method and...,A natural language question answering method a...,org_myRnscKfY7JOy5h8LVrg,2019,utility,B2,Natural language question answering method and...


In [33]:
# other nested tables for investigation - uncomment to use

# json_normalize(results['patents'][0], record_path='applications')

# inspect nested datasets, column by column

# json_normalize(results['patents'][0])
# json_normalize(results['patents'][0], record_path='IPCs')
# json_normalize(results['patents'][0], record_path='application_citations')
# json_normalize(results['patents'][0], record_path='applications')
# json_normalize(results['patents'][2], record_path='assignees')
# json_normalize(results['patents'][0], record_path='cited_patents')
# json_normalize(results['patents'][0], record_path='citedby_patents')
# json_normalize(results['patents'][24], record_path='cpcs')
# json_normalize(results['patents'][0], record_path='examiners')
# json_normalize(results['patents'][0], record_path='foreign_priority')
# json_normalize(results['patents'][0], record_path='gov_interests')
# json_normalize(results['patents'][0], record_path='inventors')
# json_normalize(results['patents'][0], record_path='lawyers')
# json_normalize(results['patents'][0], record_path='nbers')
# json_normalize(results['patents'][0], record_path='pct_data')
# json_normalize(results['patents'][0], record_path='rawinventors')
# json_normalize(results['patents'][0:5], record_path='uspcs')
# json_normalize(results['patents'][0], record_path='examiners')
# json_normalize(results['patents'][0], record_path='wipos')

### Word2Vec

In [34]:
# map values of series according to input correspondence
# substitute each value in series derived from NLTK word_tokenize function
text_data = df['patent_title_abstract'].map(word_tokenize)

In [35]:
# inspect the first 3 items in `data` to see how everything looks 
text_data[:3]

0    [Initializing, a, workspace, for, building, a,...
1    [Allowing, spelling, of, arbitrary, words, Met...
2    [Leveraging, content, dimensions, during, the,...
Name: patent_title_abstract, dtype: object

In [36]:
# instantiate word2vec model
# window: maximum distance between the current and predicted word within a sentence
# size: number of dimensions for word vectors
# min_count: min word frequency in vocab cutoff threshhold
# workers param: number of worker threads to train model, for faster training with multicore machines
model_w2v = Word2Vec(text_data, size=100, window=5, min_count=1, workers=4)
model_w2v.save("word2vec.model")

W0621 09:41:41.776201 4563789248 smart_open_lib.py:379] this function is deprecated, use smart_open.open instead


In [37]:
# 'corpus_count' returns number of sentences in dataset, in this case, 200K sentences
model_w2v.corpus_count

2482

In [38]:
# train updates the model’s neural weights from a sequence of sentences
# training is streamed, meaning sentences can be a generator that reads input data from disk on-the-fly,
# without loading the entire corpus into RAM. This also means you can continue training the model later:

model_w2v.train(text_data, total_examples=model_w2v.corpus_count, epochs=10)

W0621 09:41:48.238798 4563789248 base_any2vec.py:596] Effective 'alpha' higher than previous training cycles


(2425955, 3583700)

In [39]:
# .wv separates trained word vectors in a KeyedVectors instance and assigns to var so don't need full model state
# (don’t need to continue training) by discarding state, we have a much smaller and faster object that can be
# mapped for fast loading and sharing the vectors in RAM between processes

# uncomment to run
# word_vectors = model_w2v.wv

In [40]:
model_w2v.trainables

<gensim.models.word2vec.Word2VecTrainables at 0x1a47b89048>

In [41]:
context_words_list = ['computer', 'language', 'user']

In [42]:
# gets the probability distribution of the center word given context words
model_w2v.predict_output_word(context_words_list, topn=10)

[("'s", 0.05287172),
 ('program', 0.037009317),
 ('interface', 0.03312036),
 ('readable', 0.009747167),
 ('A', 0.0062622274),
 ('implemented', 0.00519008),
 ('resource', 0.0037051453),
 ('Natural', 0.003327496),
 ('running', 0.0031767657),
 ('instructions', 0.0030982874)]

In [43]:
# compute cosine similarity & return most similar words to a word passed to function
word_vectors.most_similar(positive='generation')

NameError: name 'word_vectors' is not defined

In [46]:
# get word vector for a given word
word_vectors['generate']

# returns word vectors for entire vocabulary(dictionary)
word_vectors.vectors.shape

(9476, 100)

### Glove Model

In [44]:
# features
data = train_20pats['patent_title_abstract'].map(word_tokenize).values

In [45]:
data[0][:10]

['Translation',
 'between',
 'natural',
 'languages',
 'An',
 'input',
 'sentence',
 'described',
 'by',
 'a']

In [46]:
# target
target_train = train_20pats.patent_firstnamed_assignee_id

In [47]:
target_train[:3]

2479    org_70D1lR89kQnFiCFdJ6s5
2477    org_70D1lR89kQnFiCFdJ6s5
2475    org_70D1lR89kQnFiCFdJ6s5
Name: patent_firstnamed_assignee_id, dtype: object

In [48]:
# download zip file of GloVe model pretrained weights from Stanford NLP
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [49]:
# # calculate total vocab of our dataset by adding every word in the dataset into a python set object. 
vocab = set(word for doc in data for word in doc)

In [50]:
# # number of tokens in this dataset
len(vocab)

5279

In [51]:
# code
glove = {}
with open('data/glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in vocab:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [52]:
glove['generate']

array([ 0.94418 ,  0.099466,  1.5637  ,  0.19514 , -0.18374 ,  0.21001 ,
        0.41893 , -0.4262  ,  0.45778 ,  1.3884  , -0.15093 , -0.11383 ,
        0.77912 , -0.47679 ,  0.11494 ,  0.19519 ,  0.75934 ,  0.51346 ,
       -0.26984 , -1.2975  ,  0.90748 , -1.1802  ,  0.17354 , -0.53419 ,
        0.57519 , -0.21494 , -0.11276 , -0.43246 ,  0.73511 ,  0.10268 ,
        2.8403  ,  0.68922 ,  0.075201, -0.7718  , -0.51294 ,  0.081105,
       -0.39304 , -0.049972,  0.1209  , -0.33339 ,  0.28529 , -0.16663 ,
       -0.30613 ,  0.44213 , -0.51871 ,  0.15192 ,  0.36517 ,  0.86671 ,
       -0.24538 ,  0.15246 ], dtype=float32)

In [53]:
# # code
# class W2vVectorizer(object):
    
#     def __init__(self, w2v):
#         # takes in a dictionary of words and vectors as input
#         self.w2v = w2v
#         if len(w2v) == 0:
#             self.dimensions = 0
#         else:
#             self.dimensions = len(w2v[next(iter(glove))])
    
#     # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
#     # It can't be used in a sklearn Pipeline. 
#     def fit(self, X, y):
#         return self
            
#     def transform(self, X):
#         return np.array([
#             np.mean([self.w2v[w] for w in words if w in self.w2v]
#                    or [np.zeros(self.dimensions)], axis=0) for words in X])

### Text classification with neural network

In [80]:
# pre-process features data

# tokenize features
# data = train_20pats['patent_title_abstract'].map(word_tokenize).values

In [55]:
# TODO (Lee) - find alternate way to do this with tf?
y_train_20pats = pd.get_dummies(target_train).values

y_train_20pats[0]

# uncomment to continue trying with tf.one_hot
# tf.one_hot(target_train, depth=13, axis=-1)

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [None]:
# instantiate tf tokenizer
tokenizer = text.Tokenizer()

In [81]:
# train set features - pre-processing
# train set - tokenize, lower, clean punctuation in train set
tokenizer.fit_on_texts(list(train_20pats.patent_title_abstract))

# train set - transform each word(token?) in document to sequence of integers that index word strings
tokenized_docs = tokenizer.texts_to_sequences(train_20pats.patent_title_abstract)

# train set - pad sequences to max length of title and abstract
X_train_20pats = sequence.pad_sequences(tokenized_docs)

# train set - inspect shape
X_train_20pats.shape

(894, 277)

In [84]:
# TODO (Lee) - resolve process to pre-process steps of word tokenization etc. on test data
# test set features - pre-processing

# test set - tokenize, lower, clean punctuation
tokenizer.fit_on_texts(list(test_20pats.patent_title_abstract))

# test set - transform each word(token?) in document to sequence of integers that index word strings
tokenized_docs_test = tokenizer.texts_to_sequences(test_20pats.patent_title_abstract)

# test set - pad sequences to max length of title and abstract
X_test_20pats = sequence.pad_sequences(tokenized_docs_test)

# test set - inspect shape
X_test_20pats.shape

(224, 296)

In [76]:
# pre-process test labels
target_test = test_20pats.patent_firstnamed_assignee_id

# TODO (Lee) - find alternate way to do this with tf?
y_test_20pats = pd.get_dummies(target_test).values

y_test_20pats[0]

# uncomment to continue trying with tf.one_hot
# tf.one_hot(target_train, depth=13, axis=-1)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

### Construct neural network

- LSTMs
- learn what is important to remember by constantly updating their internal state
- similar to GRUs (gated recurrent units)
- comprise 3 gates:
    - input gate, which determines how much of the cell state that was passed along should be kept
    - forget gate, which determines how much of the current state should be forgotten
    - output gate, which determines how much of the current state should be exposed to the next layers in the network
- learn patterns from sequences, even when sequences are long and extremely complex

https://github.com/glmack/dsc-04-46-04-LSTMs-and-GRUs-seattle-ds-career-040119.git

In [63]:
# tf.keras.model.Model groups layers into an object with training and inference features
# instantiate Model in "functional API" approach by starting from Input, chaining layer calls for 
# forward pass, and then creating model from inputs and outputs

# input layer - # TODO (Lee) - shape?
inputs = Input(shape=(277,))

# chain layer calls to specify model's forward pass

# inputs = tf.keras.Input(shape=(3,))
# x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs)
# outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x)

# construct embedding layer to convert positive integers (indexes) into dense vectors of fixed size
    # arg input_dim=5280 specifies size of vocab (maximum integer index) + 1
    # arg output_dim=100 specifies dimension of dense embedding
    # TODO (Lee) - describe (input_)
# TODO (Lee) - reset to 277 dimensionality after error - test
x = Embedding(5280, 100)(inputs)

# construct LSTM layer (tf.keras.layers.LSTM) based on Long Short-Term Memory paper from Hochreiter 1997
# arg units=25 specifies dimensionality of output space
x = LSTM(25, return_sequences=True)(x)

# construct global max pooling operation for temporal data
x = GlobalMaxPool1D()(x)

# construct dropout layer to help prevent overfitting by randomly setting fraction rate
# of input units to 0 at each update during training time
# arg rate=0.5 specifies fraction of input units to drop
x = Dropout(0.5)(x)

# construct dense layer, a regular densely-connected NN layer, that implements the operation:
# output = activation(dot(input, kernel) + bias) where:
    # activation is element-wise activation function passed as activation argument,
    # kernel is weights matrix created by layer, 
    # bias is bias vector created by layer (only applicable if use_bias is True)
# arg activation=relu specifies rectified linear unit as activation function
# arg units=50 specifies dimensionality of output space.
x = Dense(50, activation='relu')(x)

# construct dropout layer to help prevent overfitting by randomly setting fraction rate
# of input units to 0 at each update during training time
# arg rate=0.5 specifies fraction of input units to drop
x = Dropout(0.5)(x)

# construct output layer as a dense layer with softmax activation function
# specify arg units of 13, the dimensionality of output space for 13 classes
# specify'softmax' activation function to output vector of predicted probability that example is class
outputs = Dense(13, activation='softmax')(x)

# create model from inputs and outputs
model = Model(inputs=inputs, outputs=outputs)

In [64]:
# embedding_size = 128
# input_ = Input(shape=(100,))
# x = Embedding(20000, embedding_size)(input_)
# x = LSTM(25, return_sequences=True)(x)
# x = GlobalMaxPool1D()(x)
# x = Dropout(0.5)(x)
# x = Dense(50, activation='relu')(x)
# x = Dropout(0.5)(x)
# # There are 41 different possible classes, so we use 41 neurons in our output layer
# x = Dense(41, activation='softmax')(x)

# model = Model(inputs=input_, outputs=x)

In [65]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [66]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 277)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 277, 100)          528000    
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (None, 277, 25)           12600     
_________________________________________________________________
global_max_pooling1d (Global (None, 25)                0         
_________________________________________________________________
dropout (Dropout)            (None, 25)                0         
_________________________________________________________________
dense (Dense)                (None, 50)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0     

In [69]:
model.fit(X_train_20pats, y_train_20pats, epochs=20, batch_size=32, validation_split=0.2)

Train on 715 samples, validate on 179 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a4b8c65c0>

In [96]:
model.fit(X_train_20pats, y_train_20pats, epochs=5, batch_size=32, validation_split=0.1)

Train on 804 samples, validate on 90 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1a3cc28f28>

In [None]:
After 1 epoch, our model does about as well as the shallow algorithms we tried above.
However, our LSTM Network was able to achieve a validation accuracy of over 40% after only 3 epochs of training.
It's likely that if we trained for more epochs or added in the rest of the data, our performance would improve 
even further (but our run time would get much, much longer).

It's common to embedding layers in LSTM networks, 
because both are special tools most commonly used for text data. 
# embedding layer creates it's own vectors based on the language in the text data it trains on,
and then passes that information on to the LSTM network one word at a time.

### Notes

In [None]:
# (TODO) - Lee - resolve
# remove all tokens that are not alphabetic
for patent in data:
    words = [w.lower() for w in document if w.isalpha()]


words
# note that there is word loss here, e.g. the word non-expert, which contains a hypothesis, appears excluded

### build RNN model, v2

In [78]:
# create RNN model with tf.keras.Sequential
# RNN processes sequence input by iterating through elements and passing outputs
# from one timestep to their input, and then to the next
    
model = Sequential([
    # embedding layer - stores one vector per word, converts sequences of word idxs to sequences of vectors
    # -vectors are trainable
    # this index-lookup is much more efficient than equivalent operation of passing a one-hot encoded vector 
    # through a tf.keras.layers.Dense layer
    Embedding(5280, 277),
    # bidirectional wrapper helps RNN learn long range dependencies by propagating input
    # forward and backwards through RNN layer and then concatenating output 
    Bidirectional(LSTM(277)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [82]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [85]:
history = model.fit(X_train_20pats, epochs=10,
                    validation_data=X_test_20pats)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()