## Techniche - Topic Model

In [182]:
import pandas as pd
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer

import json
import requests
import re

from topic_model import tokenize_docs

import matplotlib.pyplot as plt
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
from pprint import pprint

In [79]:
np.random.seed(3)

In [160]:
# uncomment to download stop words from nltk
# nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### import data from PatentsView API

In [70]:
# pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 50)

# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

# build list of possible fields that endpoint request will return
df = pd.read_excel("data/patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()

# build query
query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# request and results
resp = requests.get(endpoint_url, params=params)
results = resp.json()

#### structure data

In [71]:
# extract metadata from response
print("status code:", resp.status_code,';', "reason:", resp.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

# extract data from response
data = results['patents']
# data[0]
df = pd.DataFrame(data)
df.head(3)

# df.columns

df = df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id', 
         'patent_year', 
         'patent_type', 
         'patent_kind']]

df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

df.patent_firstnamed_assignee_id.value_counts()[:10]

# list of assignees with > 20 patents in df dataset
assignees_list = ['org_q9Bn28RHhpYrQjKvraAH', 'org_JZguWDMfFOBX2wBI9pnD', 'org_ID497r4tFbCIaMBjGAST', 
                  'org_rDyHZBYWMcBEtnkHt05L', 'org_p6ofWD2xFNSnyYkj6wpA', 'org_EilEWQcC6UiqHcSGx9mb',
                  'org_ccMMcUijAIsKIxUqMTyP', 'org_Vbc6obpnxWM42d0HjlXY', 'org_9D8x1qL3IRASp6GG7Glu',
                  'org_2wAdIFKssfcLHpZq0u4H', 'org_iwO2oOJ6VIBd9fAuP7G6', 'org_70D1lR89kQnFiCFdJ6s5',
                  'org_vojVnDkT9CamDETqbqJC']

df_20pats = df[df['patent_firstnamed_assignee_id'].isin(assignees_list) ]

df_20pats.sort_values(by=['patent_date'], inplace=True)

df_20pats[['patent_number','patent_title_abstract', 'patent_firstnamed_assignee_id']].head(1)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,patent_number,patent_title_abstract,patent_firstnamed_assignee_id
2479,4502128,Translation between natural languages An input...,org_70D1lR89kQnFiCFdJ6s5


#### Partition data into train and test sets

In [72]:
train_20pats = df_20pats[:894]
len(train_20pats)

test_20pats = df_20pats[894:]
len(test_20pats)

224

In [180]:
def tokenize_docs(docs):
    for doc in docs:
        tokenized_docs = gensim.utils.simple_preprocess(str(doc), deacc=True)
    return tokenized_docs

In [184]:
data = list(tokenize_docs(train_20pats.patent_title_abstract))
data

NameError: name 'word_tokenize' is not defined

In [149]:
data = train_20pats.patent_title_abstract.map(word_tokenize).tolist()

In [165]:
punkt(data)

TypeError: 'module' object is not callable

In [150]:
tokenizer = RegexpTokenizer(r'\w+')

In [154]:
data

[['Translation',
  'between',
  'natural',
  'languages',
  'An',
  'input',
  'sentence',
  'described',
  'by',
  'a',
  'first',
  'natural',
  'language',
  'is',
  'sectioned',
  'into',
  'individual',
  'words',
  '.',
  'Parts',
  'of',
  'speech',
  'corresponding',
  'to',
  'the',
  'individual',
  'words',
  'are',
  'retrieved',
  'from',
  'a',
  'lexical',
  'word',
  'storage',
  ',',
  'whereby',
  'the',
  'input',
  'sentence',
  'is',
  'described',
  'by',
  'a',
  'corresponding',
  'string',
  'of',
  'the',
  'parts-of-speech',
  'as',
  'retrieved',
  '.',
  'A',
  'translation',
  'pattern',
  'table',
  'is',
  'previously',
  'prepared',
  'which',
  'defines',
  'correspondence',
  'between',
  'patterns',
  'of',
  'strings',
  'of',
  'parts-of-speech',
  'for',
  'the',
  'first',
  'natural',
  'language',
  'and',
  'those',
  'for',
  'a',
  'second',
  'natural',
  'language',
  'by',
  'which',
  'corresponding',
  'output',
  'sentence',
  'is',
  

In [156]:
for doc in data:
    words = [w.lower() for w in doc if w.isalpha()]

In [159]:
data[0]

['Translation',
 'between',
 'natural',
 'languages',
 'An',
 'input',
 'sentence',
 'described',
 'by',
 'a',
 'first',
 'natural',
 'language',
 'is',
 'sectioned',
 'into',
 'individual',
 'words',
 '.',
 'Parts',
 'of',
 'speech',
 'corresponding',
 'to',
 'the',
 'individual',
 'words',
 'are',
 'retrieved',
 'from',
 'a',
 'lexical',
 'word',
 'storage',
 ',',
 'whereby',
 'the',
 'input',
 'sentence',
 'is',
 'described',
 'by',
 'a',
 'corresponding',
 'string',
 'of',
 'the',
 'parts-of-speech',
 'as',
 'retrieved',
 '.',
 'A',
 'translation',
 'pattern',
 'table',
 'is',
 'previously',
 'prepared',
 'which',
 'defines',
 'correspondence',
 'between',
 'patterns',
 'of',
 'strings',
 'of',
 'parts-of-speech',
 'for',
 'the',
 'first',
 'natural',
 'language',
 'and',
 'those',
 'for',
 'a',
 'second',
 'natural',
 'language',
 'by',
 'which',
 'corresponding',
 'output',
 'sentence',
 'is',
 'to',
 'be',
 'described',
 '.',
 'By',
 'referring',
 'to',
 'the',
 'translation',
 

In [153]:
tokenizer.tokenize('dog')

['dog']

In [125]:
train_20pats.patent_title_abstract.values.tolist()
data

['Translation between natural languages An input sentence described by a first natural language is sectioned into individual words. Parts of speech corresponding to the individual words are retrieved from a lexical word storage, whereby the input sentence is described by a corresponding string of the parts-of-speech as retrieved. A translation pattern table is previously prepared which defines correspondence between patterns of strings of parts-of-speech for the first natural language and those for a second natural language by which corresponding output sentence is to be described. By referring to the translation pattern table, the string of the parts-of-speech of the input sentence is transformed into a corresponding string of the parts-of-speech for the second natural language. The output sentence described by the second natural language is generated by sequencing target words in accordance with the sequential order of the parts of speech of the string pattern obtained after the tran

In [None]:
stop_words = stopwords.words('english')

In [123]:
# train phrases model
phrases = Phrases(data, min_count=1, threshold=1)

In [124]:
phrases[data[0]]



['T',
 'r',
 'a',
 'n',
 's',
 'l',
 'a',
 't',
 'i',
 'o',
 'n',
 ' ',
 'b',
 'e',
 't',
 'w',
 'e',
 'e',
 'n',
 ' ',
 'n',
 'a',
 't',
 'u',
 'r',
 'a',
 'l',
 ' ',
 'l',
 'a',
 'n',
 'g',
 'u',
 'a',
 'g',
 'e',
 's',
 ' ',
 'A',
 'n',
 ' ',
 'i',
 'n',
 'p',
 'u',
 't',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 ' ',
 'd',
 'e',
 's',
 'c',
 'r',
 'i',
 'b',
 'e',
 'd',
 ' ',
 'b',
 'y',
 ' ',
 'a',
 ' ',
 'f',
 'i',
 'r',
 's',
 't',
 ' ',
 'n',
 'a',
 't',
 'u',
 'r',
 'a',
 'l',
 ' ',
 'l',
 'a',
 'n',
 'g',
 'u',
 'a',
 'g',
 'e',
 ' ',
 'i',
 's',
 ' ',
 's',
 'e',
 'c',
 't',
 'i',
 'o',
 'n',
 'e',
 'd',
 ' ',
 'i',
 'n',
 't',
 'o',
 ' ',
 'i',
 'n',
 'd',
 'i',
 'v',
 'i',
 'd',
 'u',
 'a',
 'l',
 ' ',
 'w',
 'o',
 'r',
 'd',
 's',
 '.',
 ' ',
 'P',
 'a',
 'r',
 't',
 's',
 ' ',
 'o',
 'f',
 ' ',
 's',
 'p',
 'e',
 'e',
 'c',
 'h',
 ' ',
 'c',
 'o',
 'r',
 'r',
 'e',
 's',
 'p',
 'o',
 'n',
 'd',
 'i',
 'n',
 'g',
 ' ',
 't',
 'o',
 ' ',
 't',
 'h',
 'e',
 ' '

In [105]:
# create bigram model
bigram = gensim.models.Phrases(train_20pats.patent_title_abstract, min_count=5, threshold=100)

In [106]:
# create trigram
trigram = gensim.models.Phrases(bigram[train_20pats.patent_title_abstract], threshold=100)



In [107]:
# initiatialize bigram and trigram models
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

In [109]:
print(bigram_model[train_20pats.patent_title_abstract[0]])

KeyError: 0

In [111]:
print(trigram_mod[bigram_mod[train_20pats.patent_title_abstract[0]]])

KeyError: 0

In [None]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])