## Techniche - Topic Modelling

In [1]:
import pandas as pd
import numpy as np

import gensim
import gensim.corpora as corpora
from gensim.corpora import mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel, atmodel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar
import sys

from test_model import tokenize_docs, clean_docs, lower_words, remove_stopwords#, (TODO) Lee convert_bytes

from smart_open import smart_open

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

import pyspark
import pyspark.sql.functions as F



In [2]:
%load_ext autoreload

# pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 10
pd.set_option('display.max_rows', 10)

In [3]:
np.random.seed(3)

In [5]:
# uncomment to download stop words from nltk and language package from spacy
nltk.download('stopwords')
nltk.download('punkt')
!python -m spacy download en

[nltk_data] Downloading package stopwords to /Users/lee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
/anaconda3/lib/python3.6/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


### Import Data

#### Import data from PatentsView API

In [6]:
# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

# build list of possible fields that endpoint request will return
df = pd.read_excel("/Users/lee/Documents/techniche/techniche/data/patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()

#### Import initial dataset

In [7]:
# build query - initial small dataset
query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
# uncomment to use alternate query options
# query={"cpc_subgroup_id":"G06T3/4046"}
# query = {"_and":[{"_gte":{"patent_date":"2017-01-01"}},{"_lte":{"patent_date":"2017-01-31"}}]}
# query={"_and":
#         [{"_or":
#             [{"_text_phrase":{"patent_title":"natural language"}}
#             ,{"_text_phrase":{"patent_abstract":"natural language"}}]}
#         ,{"_and":
#       [{"patent_year":2016}]}]} 
# query = {"_and":[{"_gte":{"patent_date":"2017-01-01"}},{"_lte":{"patent_date":"2017-01-31"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# request and results
response = requests.get(endpoint_url, params=params)
status = response.status_code
print("status:", status)
results = response.json()
count = results.get("count")
total_pats = results.get("total_patent_count")
print("patents on current page:",count,';', "total patents:",total_pats)

status: 200
patents on current page: 2482 ; total patents: 2482


#### Structure data

In [8]:
# extract metadata from response
print("status code:", response.status_code,';', "reason:", response.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

# extract data from response
data_resp = results['patents']
# data_resp[0]

raw_df = pd.DataFrame(data_resp)
raw_df.head(3)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


Unnamed: 0,IPCs,application_citations,applications,assignees,cited_patents,...,patent_year,pct_data,rawinventors,uspcs,wipos
0,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020077823', 'ap...","[{'app_country': 'US', 'app_date': '2013-07-26...","[{'assignee_city': 'Burlington', 'assignee_cou...",[{'cited_patent_category': 'cited by examiner'...,...,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Jeffrey N.', 'raw...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
1,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020138265', 'ap...","[{'app_country': 'US', 'app_date': '2017-09-11...","[{'assignee_city': 'Mountain View', 'assignee_...",[{'cited_patent_category': 'cited by applicant...,...,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Evgeny A.', 'rawi...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
2,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2001/20010029455', 'ap...","[{'app_country': 'US', 'app_date': '2016-09-28...","[{'assignee_city': 'Seattle', 'assignee_countr...",[{'cited_patent_category': 'cited by applicant...,...,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Thibault Pierre',...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."


#### Subset dataframe

In [9]:
# subset dataframe - comment/uncomment to include fields
df = raw_df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id',
         'patent_firstnamed_assignee_location_id',
         'patent_firstnamed_assignee_latitude',
         'patent_firstnamed_assignee_longitude',
         'patent_firstnamed_assignee_city',
         'patent_firstnamed_assignee_state',
         'patent_firstnamed_assignee_country', 
         'patent_firstnamed_inventor_id',
         'patent_firstnamed_inventor_location_id',
         'patent_firstnamed_inventor_latitude',
         'patent_firstnamed_inventor_longitude',
         'patent_firstnamed_inventor_city',
         'patent_firstnamed_inventor_state',
         'patent_firstnamed_inventor_country',
         'patent_year', 
         'patent_type', 
         'patent_kind',
         'inventors'
            ]]
df.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,...,patent_firstnamed_inventor_country,patent_year,patent_type,patent_kind,inventors
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,...,US,2019,utility,B2,"[{'inventor_city': 'Newton', 'inventor_country..."
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,...,CH,2019,utility,B1,"[{'inventor_city': 'Adliswil', 'inventor_count..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,...,US,2019,utility,B1,"[{'inventor_city': 'Seattle', 'inventor_countr..."


#### Explore data

In [10]:
# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

561

#### Create new column

In [11]:
# create new column that combines the patent title and the patent abstract columns into a single string
df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    Initializing a workspace for building a natura...
1    Allowing spelling of arbitrary words Methods, ...
2    Leveraging content dimensions during the trans...
Name: patent_title_abstract, dtype: object

In [12]:
df.sort_values(by=['patent_date'])

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,...,patent_year,patent_type,patent_kind,inventors,patent_title_abstract
2481,3980994,1976-09-14,Text editing and display system having text in...,A natural language text editing and display sy...,org_1UVZxxNbuUPJkuDfvvpa,...,1976,utility,A,"[{'inventor_city': 'Andover', 'inventor_countr...",Text editing and display system having text in...
2480,4057849,1977-11-08,Text editing and display system,A text-editing and display system for editing ...,org_1UVZxxNbuUPJkuDfvvpa,...,1977,utility,A,"[{'inventor_city': 'Andover', 'inventor_countr...",Text editing and display system A text-editing...
2479,4502128,1985-02-26,Translation between natural languages,An input sentence described by a first natural...,org_70D1lR89kQnFiCFdJ6s5,...,1985,utility,A,"[{'inventor_city': 'Yokohama', 'inventor_count...",Translation between natural languages An input...
2478,4586160,1986-04-29,Method and apparatus for analyzing the syntact...,An automatic syntax analyzing method is applie...,org_hDziASDpeFilN1JsnVK0,...,1986,utility,A,"[{'inventor_city': 'Yokohama', 'inventor_count...",Method and apparatus for analyzing the syntact...
2477,4599612,1986-07-08,Displaying and correcting method for machine t...,In a system wherein a first text in a first na...,org_70D1lR89kQnFiCFdJ6s5,...,1986,utility,A,"[{'inventor_city': 'Fujisawa', 'inventor_count...",Displaying and correcting method for machine t...
...,...,...,...,...,...,...,...,...,...,...,...
11,10229687,2019-03-12,Scalable endpoint-dependent natural language u...,A computer-implemented technique is described ...,org_EilEWQcC6UiqHcSGx9mb,...,2019,utility,B2,"[{'inventor_city': 'Sammamish', 'inventor_coun...",Scalable endpoint-dependent natural language u...
12,10230677,2019-03-12,Identifying an entity associated with an onlin...,An approach is described for identifying an en...,org_q9Bn28RHhpYrQjKvraAH,...,2019,utility,B2,"[{'inventor_city': 'Sewickley', 'inventor_coun...",Identifying an entity associated with an onlin...
13,10230680,2019-03-12,Intelligently splitting text in messages poste...,"A method, system and computer program product ...",org_q9Bn28RHhpYrQjKvraAH,...,2019,utility,B2,"[{'inventor_city': 'Shanghai', 'inventor_count...",Intelligently splitting text in messages poste...
7,10229673,2019-03-12,System and method for providing follow-up resp...,"In certain implementations, follow-up response...",org_9D8x1qL3IRASp6GG7Glu,...,2019,utility,B2,"[{'inventor_city': 'Bellevue', 'inventor_count...",System and method for providing follow-up resp...


In [13]:
text_data = df.patent_title_abstract.tolist()
text_data[:3]

['Initializing a workspace for building a natural language understanding system Designing a natural language understanding (NLU) model for an application from scratch can be difficult for non-experts. A system can simplify the design process by providing an interface allowing a designer to input example usage sentences and build an NLU model based on presented matches to those example sentences. In one embodiment, a method for initializing a workspace for building an NLU system includes parsing a sample sentence to select at least one candidate stub grammar from among multiple candidate stub grammars. The method can include presenting, to a user, respective representations of the candidate stub grammars selected by the parsing of the sample sentence. The method can include enabling the user to choose one of the respective representations of the candidate stub grammars. The method can include adding to the workspace a stub grammar corresponding to the representation of the candidate stu

In [14]:
# partition data
len(text_data)
text_train = text_data[:round(len(text_data)*.8)]
text_test = text_data[round(len(text_data)*.8):]
print(len(text_data), len(text_train), len(text_test), len(text_train)+len(text_test))

2482 1986 496 2482


### Pre-process text data

In [None]:
# uncomment to download stop words from nltk and language package from spacy
# nltk.download('stopwords')
# nltk.download('punkt')
# !python -m spacy download en

#### Tokenize

In [15]:
# tokenize documents

def tokenize_docs(docs):
    tokenized_docs = []
    for doc in docs:
        tokenized_docs.append(word_tokenize(doc))
    return tokenized_docs

tokenized_docs = tokenize_docs(text_train)

#### Clean punctuation

In [16]:
# clean punctuation
def clean_docs(tokenized_docs):
    clean_docs = []
    for doc in tokenized_docs:
       clean_docs.append([word for word in doc if word.isalpha()])  
    return clean_docs

In [17]:
cleaned_data = clean_docs(tokenized_docs)
cleaned_data[0]

['Initializing',
 'a',
 'workspace',
 'for',
 'building',
 'a',
 'natural',
 'language',
 'understanding',
 'system',
 'Designing',
 'a',
 'natural',
 'language',
 'understanding',
 'NLU',
 'model',
 'for',
 'an',
 'application',
 'from',
 'scratch',
 'can',
 'be',
 'difficult',
 'for',
 'A',
 'system',
 'can',
 'simplify',
 'the',
 'design',
 'process',
 'by',
 'providing',
 'an',
 'interface',
 'allowing',
 'a',
 'designer',
 'to',
 'input',
 'example',
 'usage',
 'sentences',
 'and',
 'build',
 'an',
 'NLU',
 'model',
 'based',
 'on',
 'presented',
 'matches',
 'to',
 'those',
 'example',
 'sentences',
 'In',
 'one',
 'embodiment',
 'a',
 'method',
 'for',
 'initializing',
 'a',
 'workspace',
 'for',
 'building',
 'an',
 'NLU',
 'system',
 'includes',
 'parsing',
 'a',
 'sample',
 'sentence',
 'to',
 'select',
 'at',
 'least',
 'one',
 'candidate',
 'stub',
 'grammar',
 'from',
 'among',
 'multiple',
 'candidate',
 'stub',
 'grammars',
 'The',
 'method',
 'can',
 'include',
 'presen

#### Convert to lowercase

In [18]:
# convert to lowercase
def lower_words(docs):
    lowered_words = []
    for doc in docs:
        lowered_words.append([word.lower() for word in doc])
    return lowered_words

lowered_data = lower_words(cleaned_data)
lowered_data[0]

['initializing',
 'a',
 'workspace',
 'for',
 'building',
 'a',
 'natural',
 'language',
 'understanding',
 'system',
 'designing',
 'a',
 'natural',
 'language',
 'understanding',
 'nlu',
 'model',
 'for',
 'an',
 'application',
 'from',
 'scratch',
 'can',
 'be',
 'difficult',
 'for',
 'a',
 'system',
 'can',
 'simplify',
 'the',
 'design',
 'process',
 'by',
 'providing',
 'an',
 'interface',
 'allowing',
 'a',
 'designer',
 'to',
 'input',
 'example',
 'usage',
 'sentences',
 'and',
 'build',
 'an',
 'nlu',
 'model',
 'based',
 'on',
 'presented',
 'matches',
 'to',
 'those',
 'example',
 'sentences',
 'in',
 'one',
 'embodiment',
 'a',
 'method',
 'for',
 'initializing',
 'a',
 'workspace',
 'for',
 'building',
 'an',
 'nlu',
 'system',
 'includes',
 'parsing',
 'a',
 'sample',
 'sentence',
 'to',
 'select',
 'at',
 'least',
 'one',
 'candidate',
 'stub',
 'grammar',
 'from',
 'among',
 'multiple',
 'candidate',
 'stub',
 'grammars',
 'the',
 'method',
 'can',
 'include',
 'presen

#### Clean stopwords

In [19]:
# clean stopwords

stop_words = stopwords.words('english')

In [20]:
def filter_stopwords(docs):
    filtered_docs = []
    for doc in docs:
       filtered_docs.append([word for word in doc if word not in stop_words])
    return filtered_docs

# remove stopwords
filtered_data = filter_stopwords(lowered_data)
filtered_data
# TODO (Lee) - resolve un-lowered stopwords "A" and "An", 'By', 'The'

[['initializing',
  'workspace',
  'building',
  'natural',
  'language',
  'understanding',
  'system',
  'designing',
  'natural',
  'language',
  'understanding',
  'nlu',
  'model',
  'application',
  'scratch',
  'difficult',
  'system',
  'simplify',
  'design',
  'process',
  'providing',
  'interface',
  'allowing',
  'designer',
  'input',
  'example',
  'usage',
  'sentences',
  'build',
  'nlu',
  'model',
  'based',
  'presented',
  'matches',
  'example',
  'sentences',
  'one',
  'embodiment',
  'method',
  'initializing',
  'workspace',
  'building',
  'nlu',
  'system',
  'includes',
  'parsing',
  'sample',
  'sentence',
  'select',
  'least',
  'one',
  'candidate',
  'stub',
  'grammar',
  'among',
  'multiple',
  'candidate',
  'stub',
  'grammars',
  'method',
  'include',
  'presenting',
  'user',
  'respective',
  'representations',
  'candidate',
  'stub',
  'grammars',
  'selected',
  'parsing',
  'sample',
  'sentence',
  'method',
  'include',
  'enabling',
 

#### Construct bigrams and trigrams

In [21]:
# train bigram phrases model
bigram_model = Phrases(filtered_data, min_count=1, threshold=1)

# train trigram phrases model
trigram_model = Phrases(bigram_model[filtered_data], threshold=100)  



In [22]:
# bigrams
def bigrams(docs):
    """create bigrams"""
    return [bigram_model[doc] for doc in docs]

In [23]:
# initialize bigram and trigram models
bigram_model = gensim.models.phrases.Phraser(bigram_model)
trigram_model = gensim.models.phrases.Phraser(trigram_model)

In [24]:
bigrams(filtered_data)[0]

['initializing_workspace',
 'building_natural',
 'language_understanding',
 'system_designing',
 'natural_language',
 'understanding_nlu',
 'model',
 'application',
 'scratch',
 'difficult',
 'system',
 'simplify',
 'design_process',
 'providing_interface',
 'allowing',
 'designer',
 'input',
 'example',
 'usage',
 'sentences_build',
 'nlu_model',
 'based',
 'presented',
 'matches',
 'example',
 'sentences',
 'one_embodiment',
 'method',
 'initializing_workspace',
 'building',
 'nlu',
 'system_includes',
 'parsing_sample',
 'sentence',
 'select',
 'least_one',
 'candidate_stub',
 'grammar',
 'among_multiple',
 'candidate_stub',
 'grammars',
 'method_include',
 'presenting',
 'user',
 'respective_representations',
 'candidate_stub',
 'grammars',
 'selected',
 'parsing_sample',
 'sentence',
 'method_include',
 'enabling_user',
 'choose',
 'one',
 'respective_representations',
 'candidate_stub',
 'grammars',
 'method_include',
 'adding',
 'workspace',
 'stub_grammar',
 'corresponding',
 '

In [25]:
# def trigrams(docs):
#     """create trigrams"""
#     return [trigram_model[bigram_model[doc]] for doc in docs]

In [26]:
# trigrams(filtered_data)[0]

#### Stem and Lemmatize

In [27]:
# def lemmatize_docs(docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """lemmatize documents"""
#     lemmatized_docs = []
#     for doc in docs: 
#         lemmatized_docs.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return lemmatized_docs

In [28]:
# # TODO (Lee)
# # TODO (Lee) - lemmatize_docs(cleaned_data)

# lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
# # for doc in cleaned_data:
# #     for token in doc:
# #         token.lemma_

# # uncomment to use
# # download english model with "python -m spacy download en"

# # for token in doc:
# #     print(token, token.lemma, token.lemma_)



In [29]:
len(texts)

NameError: name 'texts' is not defined

#### Create dictionary and convert tokens into frequency counts by document

In [30]:
# specify corpus - list of patent-list of tokenized words
texts = filtered_data

In [31]:
# build dictionary – a mapping between words and their integer ids
id_to_word = corpora.Dictionary(filtered_data)

In [32]:
# .dfs returns frequency of documents containing given token in tuple (token_id, count of documents that contain this token)
id_to_word.dfs.items()

dict_items([(22, 2), (54, 2), (6, 55), (31, 1935), (25, 1944), (51, 176), (50, 1123), (13, 2), (32, 45), (29, 238), (3, 217), (43, 2), (14, 5), (48, 2), (11, 39), (37, 197), (38, 183), (24, 280), (1, 20), (12, 4), (23, 480), (17, 168), (52, 24), (47, 92), (5, 31), (4, 914), (35, 116), (27, 61), (33, 930), (15, 160), (28, 1017), (21, 508), (34, 116), (42, 21), (46, 138), (44, 58), (26, 449), (7, 89), (49, 1), (18, 97), (2, 56), (30, 140), (19, 22), (20, 358), (36, 59), (53, 809), (41, 73), (40, 45), (45, 194), (16, 35), (8, 3), (0, 30), (10, 247), (39, 122), (9, 14), (77, 8), (56, 10), (82, 176), (68, 460), (79, 475), (55, 248), (65, 254), (57, 378), (72, 40), (62, 31), (78, 97), (66, 83), (71, 807), (73, 287), (63, 319), (81, 167), (61, 261), (64, 292), (74, 231), (69, 189), (76, 79), (80, 150), (75, 230), (59, 31), (60, 245), (58, 14), (70, 90), (67, 2), (99, 5), (88, 246), (90, 3), (121, 107), (98, 50), (103, 88), (86, 2), (119, 2), (123, 19), (97, 54), (102, 9), (120, 13), (101, 116

In [33]:
# apply term document frequency - convert docs in corpus to bag-of-words format, a list of (token_id, token_count) tuples
corpus = [id_to_word.doc2bow(text) for text in texts]
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 5),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 3),
  (19, 3),
  (20, 3),
  (21, 1),
  (22, 2),
  (23, 1),
  (24, 1),
  (25, 2),
  (26, 1),
  (27, 1),
  (28, 4),
  (29, 2),
  (30, 1),
  (31, 2),
  (32, 3),
  (33, 3),
  (34, 2),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 2),
  (41, 2),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 2),
  (47, 2),
  (48, 1),
  (49, 6),
  (50, 3),
  (51, 2),
  (52, 1),
  (53, 3),
  (54, 3)],
 [(1, 1),
  (21, 1),
  (23, 2),
  (25, 1),
  (31, 1),
  (33, 2),
  (38, 1),
  (45, 1),
  (53, 3),
  (55, 1),
  (56, 1),
  (57, 2),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 4),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 2),
  (69, 6),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 6),
  (75, 2),
  (76, 2),
  (77, 2),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 2),
  (82, 1)

In [34]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 2),
 (7, 5),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 2),
 (18, 3),
 (19, 3),
 (20, 3),
 (21, 1),
 (22, 2),
 (23, 1),
 (24, 1),
 (25, 2),
 (26, 1),
 (27, 1),
 (28, 4),
 (29, 2),
 (30, 1),
 (31, 2),
 (32, 3),
 (33, 3),
 (34, 2),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 2),
 (41, 2),
 (42, 2),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 2),
 (47, 2),
 (48, 1),
 (49, 6),
 (50, 3),
 (51, 2),
 (52, 1),
 (53, 3),
 (54, 3)]

In [35]:
# view formatted corpus (term-doc-frequency)
[[(id_to_word[id], freq) for id, freq in text] for text in corpus]

[[('adding', 1),
  ('allowing', 1),
  ('among', 1),
  ('application', 1),
  ('based', 1),
  ('build', 1),
  ('building', 2),
  ('candidate', 5),
  ('choose', 1),
  ('chosen', 1),
  ('corresponding', 1),
  ('design', 1),
  ('designer', 1),
  ('designing', 1),
  ('difficult', 1),
  ('embodiment', 1),
  ('enabling', 1),
  ('example', 2),
  ('grammar', 3),
  ('grammars', 3),
  ('include', 3),
  ('includes', 1),
  ('initializing', 2),
  ('input', 1),
  ('interface', 1),
  ('language', 2),
  ('least', 1),
  ('matches', 1),
  ('method', 4),
  ('model', 2),
  ('multiple', 1),
  ('natural', 2),
  ('nlu', 3),
  ('one', 3),
  ('parsing', 2),
  ('presented', 1),
  ('presenting', 1),
  ('process', 1),
  ('providing', 1),
  ('representation', 1),
  ('representations', 2),
  ('respective', 2),
  ('sample', 2),
  ('scratch', 1),
  ('select', 1),
  ('selected', 1),
  ('sentence', 2),
  ('sentences', 2),
  ('simplify', 1),
  ('stub', 6),
  ('system', 3),
  ('understanding', 2),
  ('usage', 1),
  ('user'

### Model - model #1

In [36]:
# TODO (Lee) - deprecation warnings
# construct LDA model
model_lda = LdaModel(corpus=corpus,
                     id2word=id_to_word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [37]:
doc = corpus[0]

In [38]:
def get_topics(doc, k=3, model_lda=model_lda):
    topic_id = sorted(model_lda[doc][0], key=lambda x: -x[1])
    top_k_topics = [x[0] for x in topic_id[:k]]
    return [(i, model_lda.print_topic(i)) for i in top_k_topics]

In [40]:
doc_id = 675
get_topics(corpus[doc_id], k=5), text_train[doc_id]

([(22,
   '0.130*"language" + 0.118*"natural" + 0.069*"method" + 0.044*"information" + 0.039*"one" + 0.036*"processing" + 0.032*"based" + 0.028*"system" + 0.027*"set" + 0.025*"includes"'),
  (21,
   '0.264*"text" + 0.129*"semantic" + 0.041*"target" + 0.035*"list" + 0.029*"lexical" + 0.023*"digital" + 0.022*"structures" + 0.021*"produce" + 0.017*"types" + 0.014*"texts"'),
  (10,
   '0.091*"document" + 0.056*"documents" + 0.043*"code" + 0.023*"categories" + 0.014*"classifying" + 0.013*"keywords" + 0.012*"available" + 0.011*"predefined" + 0.011*"wherein" + 0.011*"databases"'),
  (19,
   '0.087*"speech" + 0.079*"user" + 0.056*"system" + 0.049*"interface" + 0.043*"voice" + 0.042*"recognition" + 0.027*"context" + 0.026*"module" + 0.023*"commands" + 0.023*"command"'),
  (14,
   '0.292*"data" + 0.051*"process" + 0.048*"processes" + 0.037*"structure" + 0.029*"stored" + 0.023*"statistical" + 0.021*"models" + 0.019*"programming" + 0.019*"creating" + 0.014*"instructions"')],
 'System and method fo

In [43]:
# text = 'virtual dictionary lexicon enablement voice'.split()
text = 'smart assistant transformer model translation'.split()

In [42]:
def keywords_string(keywords):
    keyword_string = str(keywords)
    return keyword_string

In [50]:
# clean punctuation
def clean_doc(tokenized_doc):
    clean_doc = []
    clean_doc.append([word for word in tokenized_doc if word.isalpha()])  
    return clean_docs

In [60]:
def lower_doc(doc):
    lowered_words = []
    lowered_words.append([word.lower() for word in doc])
    return lowered_words

In [67]:
def process_keywords(keywords):
    from test_model import tokenize_docs, clean_docs, lower_words, remove_stopwords
    tokenized_text = str(keywords).split()
    #cleaned_text = clean_doc(tokenized_text)
    #lowered_text = lower_doc(cleaned_text)
    return tokenized_text

In [68]:
text = process_keywords("smart assistant multilingual")

In [69]:
text


['smart', 'assistant', 'multilingual']

In [28]:
tokenized_test

NameError: name 'tokenized_test' is not defined

In [26]:
text

[['s', 'm', 'a', 'r', 't'],
 ['a', 's', 's', 'i', 's', 't', 'a', 'n', 't'],
 ['t', 'r', 'a', 'n', 's', 'f', 'o', 'r', 'm', 'e', 'r'],
 ['m', 'o', 'd', 'e', 'l'],
 ['t', 'r', 'a', 'n', 's', 'l', 'a', 't', 'i', 'o', 'n']]

In [44]:
get_topics(id_to_word.doc2bow(text))

[(22,
  '0.130*"language" + 0.118*"natural" + 0.069*"method" + 0.044*"information" + 0.039*"one" + 0.036*"processing" + 0.032*"based" + 0.028*"system" + 0.027*"set" + 0.025*"includes"'),
 (24,
  '0.056*"language" + 0.049*"system" + 0.045*"systems" + 0.043*"methods" + 0.038*"natural" + 0.032*"computer" + 0.019*"translation" + 0.018*"disclosed" + 0.017*"source" + 0.016*"output"'),
 (7,
  '0.057*"elements" + 0.055*"syntactic" + 0.051*"software" + 0.030*"test" + 0.029*"identification" + 0.028*"element" + 0.023*"actions" + 0.021*"pattern" + 0.021*"goals" + 0.021*"application"')]

In [None]:
def get_documents()

In [None]:
text_train[0]

In [None]:
model_lda[doc]

In [59]:
# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[1])

[(1,
  '0.050*"product" + 0.037*"program" + 0.036*"computer" + 0.034*"topic" + 0.027*"group" + 0.026*"displayed" + 0.022*"category" + 0.022*"informational" + 0.021*"topics" + 0.020*"assigned"'),
 (4,
  '0.054*"database" + 0.054*"sentence" + 0.047*"word" + 0.037*"words" + 0.026*"sentences" + 0.026*"call" + 0.025*"system" + 0.024*"web" + 0.020*"engine" + 0.018*"form"'),
 (24,
  '0.056*"language" + 0.049*"system" + 0.045*"systems" + 0.043*"methods" + 0.038*"natural" + 0.032*"computer" + 0.019*"translation" + 0.018*"disclosed" + 0.017*"source" + 0.016*"output"'),
 (7,
  '0.057*"elements" + 0.055*"syntactic" + 0.051*"software" + 0.030*"test" + 0.029*"identification" + 0.028*"element" + 0.023*"actions" + 0.021*"pattern" + 0.021*"goals" + 0.021*"application"'),
 (11,
  '0.059*"tokens" + 0.055*"containing" + 0.041*"token" + 0.028*"intelligent" + 0.027*"entry" + 0.020*"comparing" + 0.020*"validation" + 0.018*"calendar" + 0.017*"finite" + 0.017*"classes"'),
 (20,
  '0.065*"model" + 0.060*"lingui

In [60]:
# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[0])

[(0,
  '0.075*"question" + 0.071*"agent" + 0.068*"system" + 0.066*"questions" + 0.034*"automated" + 0.024*"areas" + 0.024*"customer" + 0.020*"handling" + 0.018*"highly" + 0.018*"utilizes"'),
 (1,
  '0.050*"product" + 0.037*"program" + 0.036*"computer" + 0.034*"topic" + 0.027*"group" + 0.026*"displayed" + 0.022*"category" + 0.022*"informational" + 0.021*"topics" + 0.020*"assigned"'),
 (2,
  '0.120*"may" + 0.076*"user" + 0.030*"include" + 0.023*"message" + 0.022*"application" + 0.022*"system" + 0.017*"interpretation" + 0.016*"utterance" + 0.016*"uses" + 0.016*"used"'),
 (3,
  '0.108*"invention" + 0.071*"rules" + 0.057*"present" + 0.039*"rule" + 0.035*"domain" + 0.030*"new" + 0.021*"business" + 0.021*"inference" + 0.017*"two" + 0.017*"relates"'),
 (4,
  '0.054*"database" + 0.054*"sentence" + 0.047*"word" + 0.037*"words" + 0.026*"sentences" + 0.026*"call" + 0.025*"system" + 0.024*"web" + 0.020*"engine" + 0.018*"form"'),
 (5,
  '0.070*"user" + 0.067*"request" + 0.059*"response" + 0.058*"com

In [64]:
pprint(model_lda.show_topic(0))

[('question', 0.07506775),
 ('agent', 0.07063202),
 ('system', 0.06845748),
 ('questions', 0.06615356),
 ('automated', 0.034095865),
 ('areas', 0.023958758),
 ('customer', 0.023829246),
 ('handling', 0.020369032),
 ('highly', 0.018248532),
 ('utilizes', 0.018100064)]


In [52]:
pprint(model_lda.show_topics())

[(0,
  '0.075*"question" + 0.071*"agent" + 0.068*"system" + 0.066*"questions" + '
  '0.034*"automated" + 0.024*"areas" + 0.024*"customer" + 0.020*"handling" + '
  '0.018*"highly" + 0.018*"utilizes"'),
 (1,
  '0.050*"product" + 0.037*"program" + 0.036*"computer" + 0.034*"topic" + '
  '0.027*"group" + 0.026*"displayed" + 0.022*"category" + '
  '0.022*"informational" + 0.021*"topics" + 0.020*"assigned"'),
 (13,
  '0.084*"processor" + 0.069*"answer" + 0.056*"answers" + 0.053*"entity" + '
  '0.033*"memory" + 0.029*"abstract" + 0.028*"universal" + 0.026*"executed" + '
  '0.025*"question" + 0.022*"analyze"'),
 (11,
  '0.059*"tokens" + 0.055*"containing" + 0.041*"token" + 0.028*"intelligent" + '
  '0.027*"entry" + 0.020*"comparing" + 0.020*"validation" + 0.018*"calendar" + '
  '0.017*"finite" + 0.017*"classes"'),
 (17,
  '0.212*"first" + 0.158*"second" + 0.038*"format" + 0.035*"color" + '
  '0.023*"character" + 0.022*"description" + 0.016*"parameter" + '
  '0.015*"solution" + 0.014*"variable" 

In [61]:
# print keywords in n topics
pprint(model_lda.print_topics())

[(0,
  '0.075*"question" + 0.071*"agent" + 0.068*"system" + 0.066*"questions" + '
  '0.034*"automated" + 0.024*"areas" + 0.024*"customer" + 0.020*"handling" + '
  '0.018*"highly" + 0.018*"utilizes"'),
 (1,
  '0.050*"product" + 0.037*"program" + 0.036*"computer" + 0.034*"topic" + '
  '0.027*"group" + 0.026*"displayed" + 0.022*"category" + '
  '0.022*"informational" + 0.021*"topics" + 0.020*"assigned"'),
 (13,
  '0.084*"processor" + 0.069*"answer" + 0.056*"answers" + 0.053*"entity" + '
  '0.033*"memory" + 0.029*"abstract" + 0.028*"universal" + 0.026*"executed" + '
  '0.025*"question" + 0.022*"analyze"'),
 (11,
  '0.059*"tokens" + 0.055*"containing" + 0.041*"token" + 0.028*"intelligent" + '
  '0.027*"entry" + 0.020*"comparing" + 0.020*"validation" + 0.018*"calendar" + '
  '0.017*"finite" + 0.017*"classes"'),
 (17,
  '0.212*"first" + 0.158*"second" + 0.038*"format" + 0.035*"color" + '
  '0.023*"character" + 0.022*"description" + 0.016*"parameter" + '
  '0.015*"solution" + 0.014*"variable" 

In [None]:
# print top 10 keywords that comprise topic with index of 0
pprint(model_lda.print_topic(24))
# the most import keywords, and the respective weight, that form topic 0 are

In [None]:
# print top 10 keywords that comprise topic with index of 1
pprint(model_lda.print_topic(1))

In [None]:
# TODO (Lee) - infer topic from keywords?

### Evaluate - model #1

In [None]:
# calculate perplexity metrics
perplexity = model_lda.log_perplexity(corpus)
perplexity

In [None]:
# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1

In [None]:
model_lda.

In [None]:
# calculate coherence metric or each of the n topicss
coherence_1 = coherence.get_coherence_per_topic()
coherence_1

In [None]:
# explore topics
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus, id_to_word)
viz_topics_1
# TODO (Lee) - salient vs relevant terms in pyLDA ?

### Model 2-  Mallet model

In [None]:
# uncomment to download Mallet topic model
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# update this path
path_mallet = 'data/mallet-2.0.8/bin/mallet'

In [None]:
model_2 = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=25, id2word=id_to_word)

In [None]:
# topics
pprint(model_2.show_topics(formatted=False))

In [None]:
# calculate coherence metric
coherence_model_2 = CoherenceModel(model=model_2, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_model_2 = coherence_model_2.get_coherence()
coherence_model_2

### Model 3 - Author topic model

#### pre-process

In [None]:
tokenized_docs_at = tokenize_docs(text_data)
cleaned_data_at = clean_docs(tokenized_docs_at)
lowered_data_at = lower_words(cleaned_data_at)
filtered_data_at = filter_stopwords(lowered_data_at)

In [None]:
len(filtered_data_at)

#### build dictionary and corpus from processed text

In [None]:
# build dictionary
id_to_word_at = corpora.Dictionary(filtered_data_at)

# build corpus
texts_at = filtered_data_at

# apply term document frequency - converts docs in corpus to bag-of-words format via list of (token_id, token_count) tuples
corpus_at = [id_to_word_at.doc2bow(text) for text in texts_at]

In [None]:
(next(iter(id_to_word_at.items())))

In [None]:
type(id_to_word_at.keys()[0])

In [None]:
type(id_to_word_at.values())

#### construct inventor-doc mapping from nested inventors column in json api response

In [None]:
# extract nested inventors table from api response
df_inventors = json_normalize(results['patents'], record_path=['inventors'], meta=['patent_number', 'patent_date'])
df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
df_inventors.sort_values(by=['patent_date'])
df_inventors.pop("patent_date")
df_inventors.head(3)

In [None]:
# TODO (Lee) - resolve workaround
# df_idx = df
# df_idx['idx'] = df.index
# df_idx
# # df_idx_1 = df_idx[['patent_number', 'idx', 'inventors']]
# df_idx_2 = df_idx_1.set_index('patent_number')
# df_idx_2.pop('inventors')
# df_idx_2
# df_pat_idx = df_idx_2.T.to_dict('records')
# for i in df_pat_idx:
#     df_pat_idx = dict(i)
# df_pat_idx

# df_pat_idx = df_idx_2.T.to_dict('records')
# for i in df_pat_idx:
#     df_pat_idx = dict(i)
# df_pat_idx

In [None]:
dict_pat2inv =df_inventors.set_index('patent_number').T.to_dict('list')
# dict_pat2inv

In [None]:
# for k, v in pat2inv.items():
#     name_dict[new_key] = name_dict.pop(k)
#     time.sleep(4)

# pprint.pprint(name_dict)

# d = {'x':1, 'y':2, 'z':3}
# d1 = {'x':'a', 'y':'b', 'z':'c'}

# dict((d1[key], value) for (key, value) in d.items())
# {'a': 1, 'b': 2, 'c': 3}

# idx_pat_map = df.patent_number.to_dict()
# idx_pat_map = {str(key): value for key, value in idx_pat_map.items()}
# import itertools
# x = list(itertools.islice(idx_pat_map.items(), 0, 4))
# x[:4]

In [None]:
pat2inv_dict = {k: list(v) for k,v in df_inventors.groupby("patent_number")["inventor_id"]}

In [None]:
# {k: list(v) for k,v in df_pat2inv.groupby("patent_number")["inventor_id"]}

In [None]:
# df2 = df_inventors.groupby("patent_number")["inventor_id"]

In [None]:
# df3 = df_idx_pat_inv_map.groupby("patent_number")["inventor_id"]

In [None]:
pat2inv = {k: list(str(v)) for k,v in df_inventors.groupby("patent_number")["inventor_id"]}
len(pat2inv.items())

In [None]:
pat2inv.items()
type(next(iter(pat2inv)))

In [None]:
pat2inv_2 = {str(k): list(v) for k,v in df_inventors.groupby("patent_number")["inventor_id"]}
len(pat2inv_2)

In [None]:
patdf2inv_2 = dict((df_inventors[key], value) for (key, value) in pat2inv.items())
patdf2inv_2

In [None]:
patdf2inv = dict((df_pat_idx[key], value) for (key, value) in pat2inv.items())
patdf2inv

#### Construct author-topic model

In [None]:
# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus_at,
                         doc2author=patdf2inv,
                         id2word=id_to_word_at, 
                         num_topics=25)

In [None]:
# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

In [None]:
# inspect topic distribution for author with id# 7788103-1
# each topic has a probability of being expressed given the particular inventor, but only the ones above a certain threshold are shown.

model_at['7788103-1']

In [None]:
# def show_author(name):
#     print('\n%s' % name)
#     print('Docs:', model.author2doc[name])
#     print('Topics:')
#     pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [None]:
# calculate per-word bound, which is a measure of the model's predictive performance (reconstruction error?)

build doc2author dictionary

doc2author = atmodel.construct_doc2author(model.corpus, model.author2doc)

In [None]:

doc2author = atmodel.construct_doc2author(model.corpus, model.author2doc)

In [None]:
gensim.models.atmodel.construct_author2doc(doc2author)
# construct mapping from author IDs to document IDs

Parameters:	doc2author (dict of (int, list of str)) – Mapping of document id to authors.
Returns:	Mapping of authors to document ids.
Return type:	dict of (str, list of int)

In [None]:
gensim.models.atmodel.construct_doc2author(corpus, author2doc)
construct mapping from document IDs to author IDs

Parameters:	
corpus (iterable of list of (int, float)) – Corpus in BoW format.
author2doc (dict of (str, list of int)) – Mapping of authors to documents.
Returns:	
Document to Author mapping.

Return type:	
dict of (int, list of str)