## Techniche - Topic Modelling

In [105]:
import pandas as pd
import numpy as np

import gensim
import gensim.corpora as corpora
from gensim.corpora import mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel, atmodel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar

from topic_model import tokenize_docs#, (TODO) Lee convert_bytes

from smart_open import smart_open

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

import pyspark
import pyspark.sql.functions as F

In [3]:
%load_ext autoreload

# pd.set_option('display.max_colwidth', -1)
# pd.options.display.max_columns = 50
# pd.set_option('display.max_rows', 10)

In [2]:
np.random.seed(3)

In [5]:
# uncomment to download stop words from nltk and language package from spacy
# nltk.download('stopwords')
# nltk.download('punkt')
# !python -m spacy download en

### Import Data

#### Import data from PatentsView API

In [4]:
# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

# build list of possible fields that endpoint request will return
df = pd.read_excel("data/patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()
# pat_fields = ['appcit_app_number', 'appcit_category', 'appcit_date', 'appcit_kind', 'appcit_sequence',
#      'app_country', 'app_date', 'app_number', 'app_type', 'assignee_city', 'assignee_country',
#      'assignee_county', 'assignee_county_fips', 'assignee_first_name', 'assignee_first_seen_date',
#      'assignee_id', 'assignee_last_name', 'assignee_last_seen_date', 'assignee_lastknown_city',
#      'assignee_lastknown_country', 'assignee_lastknown_latitude', 'assignee_lastknown_location_id',
#      'assignee_lastknown_longitude', 'assignee_lastknown_state', 'assignee_latitude', 
#      'assignee_location_id', 'assignee_longitude', 'assignee_organization', 'assignee_sequence',
#      'assignee_state', 'assignee_state_fips', 'assignee_total_num_inventors', 
#      'assignee_total_num_patents', 'assignee_type', 'cited_patent_category', 'cited_patent_date',
#      'cited_patent_kind', 'cited_patent_number', 'cited_patent_sequence', 'cited_patent_title',
#      'citedby_patent_category', 'citedby_patent_date', 'citedby_patent_kind',
#      'citedby_patent_number', 'citedby_patent_title', 'cpc_category', 'cpc_first_seen_date',
#      'cpc_group_id', 'cpc_group_title', 'cpc_last_seen_date', 'cpc_section_id', 'cpc_sequence',
#      'cpc_subgroup_id', 'cpc_subgroup_title', 'cpc_subsection_id', 'cpc_subsection_title',
#      'cpc_total_num_assignees', 'cpc_total_num_inventors', 'cpc_total_num_patents',
#      'detail_desc_length', 'examiner_first_name', 'examiner_id', 'examiner_last_name',
#      'examiner_role', 'examiner_group', 'forprior_country', 'forprior_date', 'forprior_docnumber',
#      'forprior_kind', 'forprior_sequence', 'govint_contract_award_number', 'govint_org_id',
#      'govint_org_level_one', 'govint_org_level_two', 'govint_org_level_three', 'govint_org_name',
#      'govint_raw_statement', 'inventor_city', 'inventor_country', 'inventor_county',
#      'inventor_county_fips', 'inventor_first_name', 'inventor_first_seen_date', 'inventor_id',
#      'inventor_last_name', 'inventor_last_seen_date', 'inventor_lastknown_city',
#      'inventor_lastknown_country', 'inventor_lastknown_latitude', 'inventor_lastknown_location_id',
#      'inventor_lastknown_longitude', 'inventor_lastknown_state', 'inventor_latitude',
#      'inventor_location_id', 'inventor_longitude', 'inventor_sequence', 'inventor_state',
#      'inventor_state_fips', 'inventor_total_num_patents', 'ipc_action_date', 'ipc_class',
#      'ipc_classification_data_source', 'ipc_classification_value', 'ipc_first_seen_date',
#      'ipc_last_seen_date', 'ipc_main_group', 'ipc_section', 'ipc_sequence', 'ipc_subclass',
#      'ipc_subgroup', 'ipc_symbol_position', 'ipc_total_num_assignees', 'ipc_total_num_inventors',
#      'ipc_version_indicator', 'lawyer_first_name', 'lawyer_first_seen_date', 'lawyer_id',
#      'lawyer_last_name', 'lawyer_last_seen_date', 'lawyer_organization', 'lawyer_sequence',
#      'lawyer_total_num_assignees', 'lawyer_total_num_inventors', 'lawyer_total_num_patents',
#      'nber_category_id', 'nber_category_title', 'nber_first_seen_date', 'nber_last_seen_date',
#      'nber_subcategory_id', 'nber_subcategory_title', 'nber_total_num_assignees',
#      'nber_total_num_inventors', 'nber_total_num_patents', 'patent_abstract',
#      'patent_average_processing_time', 'patent_date', 'patent_firstnamed_assignee_city',
#      'patent_firstnamed_assignee_country', 'patent_firstnamed_assignee_id', 
#      'patent_firstnamed_assignee_latitude', 'patent_firstnamed_assignee_location_id',
#      'patent_firstnamed_assignee_longitude', 'patent_firstnamed_assignee_state',
#      'patent_firstnamed_inventor_city', 'patent_firstnamed_inventor_country',
#      'patent_firstnamed_inventor_id', 'patent_firstnamed_inventor_latitude',
#      'patent_firstnamed_inventor_location_id', 'patent_firstnamed_inventor_longitude',
#      'patent_firstnamed_inventor_state', 'patent_kind', 'patent_num_cited_by_us_patents',
#      'patent_num_claims', 'patent_num_combined_citations', 'patent_num_foreign_citations',
#      'patent_num_us_application_citations', 'patent_num_us_patent_citations', 'patent_number',
#      'patent_processing_time', 'patent_title', 'patent_type', 'patent_year', 'pct_102_date',
#      'pct_371_date', 'pct_date', 'pct_docnumber', 'pct_doctype', 'pct_kind',
#      'rawinventor_first_name', 'rawinventor_last_name', 'uspc_first_seen_date',
#      'uspc_last_seen_date', 'uspc_mainclass_id', 'uspc_mainclass_title', 'uspc_sequence',
#      'uspc_subclass_id', 'uspc_subclass_title', 'uspc_total_num_assignees', 
#      'uspc_total_num_inventors', 'uspc_total_num_patents', 'wipo_field_id','wipo_field_title',
#      'wipo_sector_title','wipo_sequence']

#### Import initial dataset

In [5]:
# build query - initial small dataset
query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
# uncomment to use alternate query options
# query={"cpc_subgroup_id":"G06T3/4046"}
# query = {"_and":[{"_gte":{"patent_date":"2017-01-01"}},{"_lte":{"patent_date":"2017-01-31"}}]}
# query={"_and":
#         [{"_or":
#             [{"_text_phrase":{"patent_title":"natural language"}}
#             ,{"_text_phrase":{"patent_abstract":"natural language"}}]}
#         ,{"_and":
#       [{"patent_year":2016}]}]} 
# query = {"_and":[{"_gte":{"patent_date":"2017-01-01"}},{"_lte":{"patent_date":"2017-01-31"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# request and results
response = requests.get(endpoint_url, params=params)
status = response.status_code
print("status:", status)
results = response.json()
count = results.get("count")
total_pats = results.get("total_patent_count")
print("patents on current page:",count,';', "total patents:",total_pats)

status: 200
patents on current page: 2482 ; total patents: 2482


#### Structure data

In [6]:
# extract metadata from response
print("status code:", response.status_code,';', "reason:", response.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

# extract data from response
data_resp = results['patents']
# data_resp[0]

raw_df = pd.DataFrame(data_resp)
raw_df.head(3)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


Unnamed: 0,IPCs,application_citations,applications,assignees,cited_patents,citedby_patents,cpcs,detail_desc_length,examiners,foreign_priority,...,patent_num_us_patent_citations,patent_number,patent_processing_time,patent_title,patent_type,patent_year,pct_data,rawinventors,uspcs,wipos
0,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020077823', 'ap...","[{'app_country': 'US', 'app_date': '2013-07-26...","[{'assignee_city': 'Burlington', 'assignee_cou...",[{'cited_patent_category': 'cited by examiner'...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",11570,"[{'examiner_first_name': 'Michael N', 'examine...","[{'forprior_country': None, 'forprior_date': N...",...,5,10229106,2055,Initializing a workspace for building a natura...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Jeffrey N.', 'raw...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
1,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020138265', 'ap...","[{'app_country': 'US', 'app_date': '2017-09-11...","[{'assignee_city': 'Mountain View', 'assignee_...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",28118,"[{'examiner_first_name': 'Shreyans A', 'examin...","[{'forprior_country': None, 'forprior_date': N...",...,8,10229109,547,Allowing spelling of arbitrary words,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Evgeny A.', 'rawi...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
2,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2001/20010029455', 'ap...","[{'app_country': 'US', 'app_date': '2016-09-28...","[{'assignee_city': 'Seattle', 'assignee_countr...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",119654,"[{'examiner_first_name': 'Jialong', 'examiner_...","[{'forprior_country': None, 'forprior_date': N...",...,26,10229113,895,Leveraging content dimensions during the trans...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Thibault Pierre',...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."


#### Subset dataframe

In [7]:
# subset dataframe - comment/uncomment to include fields
df = raw_df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id',
         'patent_firstnamed_assignee_location_id',
         'patent_firstnamed_assignee_latitude',
         'patent_firstnamed_assignee_longitude',
         'patent_firstnamed_assignee_city',
         'patent_firstnamed_assignee_state',
         'patent_firstnamed_assignee_country', 
         'patent_firstnamed_inventor_id',
         'patent_firstnamed_inventor_location_id',
         'patent_firstnamed_inventor_latitude',
         'patent_firstnamed_inventor_longitude',
         'patent_firstnamed_inventor_city',
         'patent_firstnamed_inventor_state',
         'patent_firstnamed_inventor_country',
         'patent_year', 
         'patent_type', 
         'patent_kind',
         'inventors'
#          'patent_processing_time', 
#          'patent_num_us_application_citations', 
#          'patent_num_us_patent_citations', 
#          'patent_num_foreign_citations', 
#          'patent_num_combined_citations', 
#          'patent_num_claims', 
#          'patent_num_cited_by_us_patents',
#          'detail_desc_length'
            ]]
df.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_city,patent_firstnamed_assignee_state,...,patent_firstnamed_inventor_location_id,patent_firstnamed_inventor_latitude,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_city,patent_firstnamed_inventor_state,patent_firstnamed_inventor_country,patent_year,patent_type,patent_kind,inventors
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,42.5047|-71.1961,42.5047,-71.1961,Burlington,MA,...,42.3369|-71.2097,42.3369,-71.2097,Newton,MA,US,2019,utility,B2,"[{'inventor_city': 'Newton', 'inventor_country..."
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,37.3861|-122.0828,37.3861,-122.083,Mountain View,CA,...,47.3119|8.5287,47.3119,8.5287,Adliswil,,CH,2019,utility,B1,"[{'inventor_city': 'Adliswil', 'inventor_count..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,47.6064|-122.3308,47.6064,-122.331,Seattle,WA,...,47.6064|-122.3308,47.6064,-122.331,Seattle,WA,US,2019,utility,B1,"[{'inventor_city': 'Seattle', 'inventor_countr..."


#### Explore data

In [8]:
# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

561

#### Create new column

In [9]:
# create new column that combines the patent title and the patent abstract columns into a single string
df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    Initializing a workspace for building a natura...
1    Allowing spelling of arbitrary words Methods, ...
2    Leveraging content dimensions during the trans...
Name: patent_title_abstract, dtype: object

In [13]:
df.sort_values(by=['patent_date'])

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_city,patent_firstnamed_assignee_state,...,patent_firstnamed_inventor_latitude,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_city,patent_firstnamed_inventor_state,patent_firstnamed_inventor_country,patent_year,patent_type,patent_kind,inventors,patent_title_abstract
2481,3980994,1976-09-14,Text editing and display system having text in...,A natural language text editing and display sy...,org_1UVZxxNbuUPJkuDfvvpa,42.4906|-71.2767,42.4906,-71.2767,Bedford,MA,...,42.6583,-71.1375,Andover,MA,US,1976,utility,A,"[{'inventor_city': 'Andover', 'inventor_countr...",Text editing and display system having text in...
2480,4057849,1977-11-08,Text editing and display system,A text-editing and display system for editing ...,org_1UVZxxNbuUPJkuDfvvpa,42.4906|-71.2767,42.4906,-71.2767,Bedford,MA,...,42.6583,-71.1375,Andover,MA,US,1977,utility,A,"[{'inventor_city': 'Andover', 'inventor_countr...",Text editing and display system A text-editing...
2479,4502128,1985-02-26,Translation between natural languages,An input sentence described by a first natural...,org_70D1lR89kQnFiCFdJ6s5,35.685|139.7514,35.685,139.751,Tokyo,,...,35.4437,139.638,Yokohama,,JP,1985,utility,A,"[{'inventor_city': 'Yokohama', 'inventor_count...",Translation between natural languages An input...
2478,4586160,1986-04-29,Method and apparatus for analyzing the syntact...,An automatic syntax analyzing method is applie...,org_hDziASDpeFilN1JsnVK0,35.5298|139.7024,35.5298,139.702,Kawasaki,,...,35.4437,139.638,Yokohama,,JP,1986,utility,A,"[{'inventor_city': 'Yokohama', 'inventor_count...",Method and apparatus for analyzing the syntact...
2477,4599612,1986-07-08,Displaying and correcting method for machine t...,In a system wherein a first text in a first na...,org_70D1lR89kQnFiCFdJ6s5,35.685|139.7514,35.685,139.751,Tokyo,,...,34.4783,133.933,Tama,,JP,1986,utility,A,"[{'inventor_city': 'Fujisawa', 'inventor_count...",Displaying and correcting method for machine t...
2476,4638445,1987-01-20,Autonomous mobile robot,A vision system for a mobile robot employs at ...,,,,,,,...,42.1875,-71.3069,Medfield,MA,US,1987,utility,A,"[{'inventor_city': 'Medfield', 'inventor_count...",Autonomous mobile robot A vision system for a ...
2475,4661924,1987-04-28,Multiple-parts-of-speech disambiguating method...,A machine translation system comprises input m...,org_70D1lR89kQnFiCFdJ6s5,35.685|139.7514,35.685,139.751,Tokyo,,...,35.685,139.751,Tokyo,,JP,1987,utility,A,"[{'inventor_city': 'Yokohama', 'inventor_count...",Multiple-parts-of-speech disambiguating method...
2474,4688195,1987-08-18,Natural-language interface generating system,A system for interactively generating a natura...,org_C6OV0HJ5yTbw8A4Wv6Ag,32.7833|-96.8,32.7833,-96.8,Dallas,TX,...,33.0197,-96.6986,Plano,TX,US,1987,utility,A,"[{'inventor_city': 'Plano', 'inventor_country'...",Natural-language interface generating system A...
2473,4689737,1987-08-25,Integrated environment computer system control...,A computer system including text input and dis...,,,,,,,...,34.1511,-118.448,Sherman Oaks,CA,US,1987,utility,A,"[{'inventor_city': 'Sherman Oaks', 'inventor_c...",Integrated environment computer system control...
2472,4695975,1987-09-22,Multi-image communications system,A device for automatic translation of natural ...,org_rjvvHMFnrkq3zHsiW0ej,40.7142|-74.0064,40.7142,-74.0064,New York,NY,...,41.7003,-73.9214,Poughkeepsie,NY,US,1987,utility,A,"[{'inventor_city': 'Poughkeepsie', 'inventor_c...",Multi-image communications system A device for...


In [11]:
text_data = df.patent_title_abstract.tolist()
text_data

['Initializing a workspace for building a natural language understanding system Designing a natural language understanding (NLU) model for an application from scratch can be difficult for non-experts. A system can simplify the design process by providing an interface allowing a designer to input example usage sentences and build an NLU model based on presented matches to those example sentences. In one embodiment, a method for initializing a workspace for building an NLU system includes parsing a sample sentence to select at least one candidate stub grammar from among multiple candidate stub grammars. The method can include presenting, to a user, respective representations of the candidate stub grammars selected by the parsing of the sample sentence. The method can include enabling the user to choose one of the respective representations of the candidate stub grammars. The method can include adding to the workspace a stub grammar corresponding to the representation of the candidate stu

In [12]:
# partition data
len(text_data)
text_train = text_data[:round(len(text_data)*.8)]
text_test = text_data[round(len(text_data)*.8):]
print(len(text_data), len(text_train), len(text_test), len(text_train)+len(text_test))

2482 1986 496 2482


### Pre-process text data

In [14]:
# uncomment to download stop words from nltk and language package from spacy
# nltk.download('stopwords')
# nltk.download('punkt')
# !python -m spacy download en

[nltk_data] Downloading package stopwords to /Users/lee/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/lee/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
/anaconda3/lib/python3.6/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
# construct pipeline using Spacy Language object and associated pipeline/components
nlp = spacy.load("en")
pprint(nlp.pipeline)

In [None]:
processed_docs = []   

# process patent documents in pipeline
for doc in nlp.pipe(text_train, n_threads=4, batch_size=100):
   
    ents = doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list.
    doc = [token for token in doc if token not in stop_words]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    
    processed_docs.append(doc)

processed_docs[0][:5]

In [None]:
nlp.

In [None]:
[token.text for token in doc]

In [None]:
labels = set([w.label_ for w in doc.ents]) 

In [None]:
for label in labels: 
    entities = [cleanup(e.string, lower=False) for e in document.ents if label==e.label_] 
    entities = list(set(entities)) 
    print(label,entities)

In [None]:
pre_processed_docs = []
for doc in nlp.pipe(docs, n_threads=4, batch_size=100):
    # Process document using Spacy NLP pipeline.
    
    ents = doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list.
    #doc = [token for token in doc if token not in STOPWORDS]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    
    pre_processed_docs.append(doc)

#### Tokenize

In [15]:
# tokenize documents

def tokenize_docs(docs):
    tokenized_docs = []
    for doc in docs:
        tokenized_docs.append(word_tokenize(doc))
    return tokenized_docs

tokenized_docs = tokenize_docs(text_train)

#### Clean punctuation

In [16]:
# clean punctuation
def clean_docs(tokenized_docs):
    clean_docs = []
    for doc in tokenized_docs:
       clean_docs.append([word for word in doc if word.isalpha()])  
    return clean_docs

In [17]:
cleaned_data = clean_docs(tokenized_docs)
cleaned_data[0]

['Initializing',
 'a',
 'workspace',
 'for',
 'building',
 'a',
 'natural',
 'language',
 'understanding',
 'system',
 'Designing',
 'a',
 'natural',
 'language',
 'understanding',
 'NLU',
 'model',
 'for',
 'an',
 'application',
 'from',
 'scratch',
 'can',
 'be',
 'difficult',
 'for',
 'A',
 'system',
 'can',
 'simplify',
 'the',
 'design',
 'process',
 'by',
 'providing',
 'an',
 'interface',
 'allowing',
 'a',
 'designer',
 'to',
 'input',
 'example',
 'usage',
 'sentences',
 'and',
 'build',
 'an',
 'NLU',
 'model',
 'based',
 'on',
 'presented',
 'matches',
 'to',
 'those',
 'example',
 'sentences',
 'In',
 'one',
 'embodiment',
 'a',
 'method',
 'for',
 'initializing',
 'a',
 'workspace',
 'for',
 'building',
 'an',
 'NLU',
 'system',
 'includes',
 'parsing',
 'a',
 'sample',
 'sentence',
 'to',
 'select',
 'at',
 'least',
 'one',
 'candidate',
 'stub',
 'grammar',
 'from',
 'among',
 'multiple',
 'candidate',
 'stub',
 'grammars',
 'The',
 'method',
 'can',
 'include',
 'presen

#### Convert to lowercase

In [18]:
# convert to lowercase
def lower_words(docs):
    lowered_words = []
    for doc in docs:
        lowered_words.append([word.lower() for word in doc])
    return lowered_words

lowered_data = lower_words(cleaned_data)
lowered_data[0]

['initializing',
 'a',
 'workspace',
 'for',
 'building',
 'a',
 'natural',
 'language',
 'understanding',
 'system',
 'designing',
 'a',
 'natural',
 'language',
 'understanding',
 'nlu',
 'model',
 'for',
 'an',
 'application',
 'from',
 'scratch',
 'can',
 'be',
 'difficult',
 'for',
 'a',
 'system',
 'can',
 'simplify',
 'the',
 'design',
 'process',
 'by',
 'providing',
 'an',
 'interface',
 'allowing',
 'a',
 'designer',
 'to',
 'input',
 'example',
 'usage',
 'sentences',
 'and',
 'build',
 'an',
 'nlu',
 'model',
 'based',
 'on',
 'presented',
 'matches',
 'to',
 'those',
 'example',
 'sentences',
 'in',
 'one',
 'embodiment',
 'a',
 'method',
 'for',
 'initializing',
 'a',
 'workspace',
 'for',
 'building',
 'an',
 'nlu',
 'system',
 'includes',
 'parsing',
 'a',
 'sample',
 'sentence',
 'to',
 'select',
 'at',
 'least',
 'one',
 'candidate',
 'stub',
 'grammar',
 'from',
 'among',
 'multiple',
 'candidate',
 'stub',
 'grammars',
 'the',
 'method',
 'can',
 'include',
 'presen

#### Clean stopwords

In [19]:
# clean stopwords

stop_words = stopwords.words('english')

In [20]:
def filter_stopwords(docs):
    filtered_docs = []
    for doc in docs:
       filtered_docs.append([word for word in doc if word not in stop_words])
    return filtered_docs

# remove stopwords
filtered_data = filter_stopwords(lowered_data)
filtered_data
# TODO (Lee) - resolve un-lowered stopwords "A" and "An", 'By', 'The'

[['initializing',
  'workspace',
  'building',
  'natural',
  'language',
  'understanding',
  'system',
  'designing',
  'natural',
  'language',
  'understanding',
  'nlu',
  'model',
  'application',
  'scratch',
  'difficult',
  'system',
  'simplify',
  'design',
  'process',
  'providing',
  'interface',
  'allowing',
  'designer',
  'input',
  'example',
  'usage',
  'sentences',
  'build',
  'nlu',
  'model',
  'based',
  'presented',
  'matches',
  'example',
  'sentences',
  'one',
  'embodiment',
  'method',
  'initializing',
  'workspace',
  'building',
  'nlu',
  'system',
  'includes',
  'parsing',
  'sample',
  'sentence',
  'select',
  'least',
  'one',
  'candidate',
  'stub',
  'grammar',
  'among',
  'multiple',
  'candidate',
  'stub',
  'grammars',
  'method',
  'include',
  'presenting',
  'user',
  'respective',
  'representations',
  'candidate',
  'stub',
  'grammars',
  'selected',
  'parsing',
  'sample',
  'sentence',
  'method',
  'include',
  'enabling',
 

#### Construct bigrams and trigrams

In [21]:
# train bigram phrases model
bigram_model = Phrases(filtered_data, min_count=1, threshold=1)

# train trigram phrases model
trigram_model = Phrases(bigram_model[filtered_data], threshold=100)  



In [22]:
# bigrams
def bigrams(docs):
    """create bigrams"""
    return [bigram_model[doc] for doc in docs]

In [23]:
# initialize bigram and trigram models
bigram_model = gensim.models.phrases.Phraser(bigram_model)
trigram_model = gensim.models.phrases.Phraser(trigram_model)

In [24]:
bigrams(filtered_data)[0]

['initializing_workspace',
 'building_natural',
 'language_understanding',
 'system_designing',
 'natural_language',
 'understanding_nlu',
 'model',
 'application',
 'scratch',
 'difficult',
 'system',
 'simplify',
 'design_process',
 'providing_interface',
 'allowing',
 'designer',
 'input',
 'example',
 'usage',
 'sentences_build',
 'nlu_model',
 'based',
 'presented',
 'matches',
 'example',
 'sentences',
 'one_embodiment',
 'method',
 'initializing_workspace',
 'building',
 'nlu',
 'system_includes',
 'parsing_sample',
 'sentence',
 'select',
 'least_one',
 'candidate_stub',
 'grammar',
 'among_multiple',
 'candidate_stub',
 'grammars',
 'method_include',
 'presenting',
 'user',
 'respective_representations',
 'candidate_stub',
 'grammars',
 'selected',
 'parsing_sample',
 'sentence',
 'method_include',
 'enabling_user',
 'choose',
 'one',
 'respective_representations',
 'candidate_stub',
 'grammars',
 'method_include',
 'adding',
 'workspace',
 'stub_grammar',
 'corresponding',
 '

In [25]:
# def trigrams(docs):
#     """create trigrams"""
#     return [trigram_model[bigram_model[doc]] for doc in docs]

In [26]:
# trigrams(filtered_data)[0]

['initializing_workspace',
 'building_natural',
 'language_understanding',
 'system_designing',
 'natural_language',
 'understanding_nlu',
 'model',
 'application',
 'scratch',
 'difficult',
 'system',
 'simplify',
 'design_process',
 'providing_interface',
 'allowing',
 'designer',
 'input',
 'example',
 'usage',
 'sentences_build',
 'nlu_model',
 'based',
 'presented',
 'matches',
 'example',
 'sentences',
 'one_embodiment',
 'method',
 'initializing_workspace',
 'building',
 'nlu',
 'system_includes',
 'parsing_sample',
 'sentence',
 'select',
 'least_one',
 'candidate_stub',
 'grammar',
 'among_multiple',
 'candidate_stub',
 'grammars',
 'method_include',
 'presenting',
 'user',
 'respective_representations',
 'candidate_stub',
 'grammars',
 'selected',
 'parsing_sample',
 'sentence',
 'method_include',
 'enabling_user',
 'choose',
 'one',
 'respective_representations',
 'candidate_stub',
 'grammars',
 'method_include',
 'adding',
 'workspace',
 'stub_grammar',
 'corresponding',
 '

#### Stem and Lemmatize

In [27]:
# def lemmatize_docs(docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """lemmatize documents"""
#     lemmatized_docs = []
#     for doc in docs: 
#         lemmatized_docs.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return lemmatized_docs

In [28]:
# # TODO (Lee)
# # TODO (Lee) - lemmatize_docs(cleaned_data)

# lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
# # for doc in cleaned_data:
# #     for token in doc:
# #         token.lemma_

# # uncomment to use
# # download english model with "python -m spacy download en"

# # for token in doc:
# #     print(token, token.lemma, token.lemma_)



#### Create corpus and dictionary

In [30]:
 # build dictionary
id_to_word = corpora.Dictionary(filtered_data)

# build corpus
texts = filtered_data

# apply term document frequency
# converts documents in corpus to bag-of-words format, a list of (token_id, token_count) tuples
corpus = [id_to_word.doc2bow(text) for text in texts]

In [31]:
# view formatted corpus (term-doc-frequency)
[[(id_to_word[id], freq) for id, freq in text] for text in corpus][:1]

[[('adding', 1),
  ('allowing', 1),
  ('among', 1),
  ('application', 1),
  ('based', 1),
  ('build', 1),
  ('building', 2),
  ('candidate', 5),
  ('choose', 1),
  ('chosen', 1),
  ('corresponding', 1),
  ('design', 1),
  ('designer', 1),
  ('designing', 1),
  ('difficult', 1),
  ('embodiment', 1),
  ('enabling', 1),
  ('example', 2),
  ('grammar', 3),
  ('grammars', 3),
  ('include', 3),
  ('includes', 1),
  ('initializing', 2),
  ('input', 1),
  ('interface', 1),
  ('language', 2),
  ('least', 1),
  ('matches', 1),
  ('method', 4),
  ('model', 2),
  ('multiple', 1),
  ('natural', 2),
  ('nlu', 3),
  ('one', 3),
  ('parsing', 2),
  ('presented', 1),
  ('presenting', 1),
  ('process', 1),
  ('providing', 1),
  ('representation', 1),
  ('representations', 2),
  ('respective', 2),
  ('sample', 2),
  ('scratch', 1),
  ('select', 1),
  ('selected', 1),
  ('sentence', 2),
  ('sentences', 2),
  ('simplify', 1),
  ('stub', 6),
  ('system', 3),
  ('understanding', 2),
  ('usage', 1),
  ('user'

### Model - model #1

In [32]:
# TODO (Lee) - deprecation warnings
# construct LDA model
model_lda = LdaModel(corpus=corpus,
                     id2word=id_to_word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [78]:
# print keywords in n topics
pprint(model_lda.print_topics())

[(0,
  '0.075*"question" + 0.071*"agent" + 0.068*"system" + 0.066*"questions" + '
  '0.034*"automated" + 0.024*"areas" + 0.024*"customer" + 0.020*"handling" + '
  '0.018*"highly" + 0.018*"utilizes"'),
 (1,
  '0.050*"product" + 0.037*"program" + 0.036*"computer" + 0.034*"topic" + '
  '0.027*"group" + 0.026*"displayed" + 0.022*"category" + '
  '0.022*"informational" + 0.021*"topics" + 0.020*"assigned"'),
 (13,
  '0.084*"processor" + 0.069*"answer" + 0.056*"answers" + 0.053*"entity" + '
  '0.033*"memory" + 0.029*"abstract" + 0.028*"universal" + 0.026*"executed" + '
  '0.025*"question" + 0.022*"analyze"'),
 (11,
  '0.059*"tokens" + 0.055*"containing" + 0.041*"token" + 0.028*"intelligent" + '
  '0.027*"entry" + 0.020*"comparing" + 0.020*"validation" + 0.018*"calendar" + '
  '0.017*"finite" + 0.017*"classes"'),
 (17,
  '0.212*"first" + 0.158*"second" + 0.038*"format" + 0.035*"color" + '
  '0.023*"character" + 0.022*"description" + 0.016*"parameter" + '
  '0.015*"solution" + 0.014*"variable" 

In [79]:
# print top 10 keywords that comprise topic with index of 0
pprint(model_lda.print_topic(24))
# the most import keywords, and the respective weight, that form topic 0 are

('0.056*"language" + 0.049*"system" + 0.045*"systems" + 0.043*"methods" + '
 '0.038*"natural" + 0.032*"computer" + 0.019*"translation" + 0.018*"disclosed" '
 '+ 0.017*"source" + 0.016*"output"')


In [80]:
# print top 10 keywords that comprise topic with index of 1
pprint(model_lda.print_topic(1))

('0.050*"product" + 0.037*"program" + 0.036*"computer" + 0.034*"topic" + '
 '0.027*"group" + 0.026*"displayed" + 0.022*"category" + 0.022*"informational" '
 '+ 0.021*"topics" + 0.020*"assigned"')


In [81]:
# TODO (Lee) - infer topic from keywords?

### Evaluate - model #1

In [82]:
# calculate perplexity metrics
perplexity = model_lda.log_perplexity(corpus)
perplexity

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

-6.84433260522691

In [83]:
# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1

0.3897699923252669

In [84]:
# calculate coherence metric or each of the n topicss
coherence_1 = coherence.get_coherence_per_topic()
coherence_1

[0.32348039739352397,
 0.35170360224714814,
 0.3950224423159941,
 0.2848252291115782,
 0.20291289135865515,
 0.3027047918616517,
 0.4265237682505485,
 0.43851529187478955,
 0.3476302468434924,
 0.4859671593015696,
 0.4664475299209598,
 0.5590420325628574,
 0.3950477611470393,
 0.4527489579742296,
 0.2517553388816768,
 0.39171188223083825,
 0.3201914839227447,
 0.546608689087153,
 0.32377826911718693,
 0.5323612347470144,
 0.41744640101842095,
 0.4466727393652505,
 0.404578560346497,
 0.355147469212505,
 0.3214256380383471]

In [85]:
# explore topics
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus, id_to_word)
viz_topics_1
# TODO (Lee) - salient vs relevant terms in pyLDA ?

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model 2-  Mallet model

In [86]:
# uncomment to download Mallet topic model
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# update this path
# path_mallet = 'data/mallet-2.0.8/bin/mallet'

In [87]:
model_2 = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=25, id2word=id_to_word)

In [88]:
# topics
pprint(model_2.show_topics(formatted=False))

[(15,
  [('model', 0.07553272450532725),
   ('features', 0.035578386605783864),
   ('learning', 0.03443683409436834),
   ('training', 0.03291476407914764),
   ('product', 0.031773211567732114),
   ('classification', 0.030821917808219176),
   ('based', 0.02910958904109589),
   ('statistical', 0.0258751902587519),
   ('values', 0.025494672754946726),
   ('attributes', 0.024733637747336376)]),
 (18,
  [('user', 0.3605196679898953),
   ('based', 0.041862143630458315),
   ('service', 0.039335979790689285),
   ('providing', 0.030313966077228437),
   ('media', 0.028148682785997834),
   ('provide', 0.025081198123421147),
   ('web', 0.020570191266690727),
   ('received', 0.01912666907253699),
   ('location', 0.01912666907253699),
   ('topic', 0.016420064958498737)]),
 (22,
  [('method', 0.0908745247148289),
   ('response', 0.06520912547528517),
   ('includes', 0.04961977186311787),
   ('apparatus', 0.04828897338403042),
   ('domain', 0.038403041825095054),
   ('entity', 0.038403041825095054),
 

In [89]:
# calculate coherence metric
coherence_model_2 = CoherenceModel(model=model_2, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_model_2 = coherence_model_2.get_coherence()
coherence_model_2

0.341201222293478

### Model 3 - Author topic model

In [90]:
# construct inventor-to-doc mapping as df from nested inventors column in json api response
df_inventors = json_normalize(results['patents'], record_path=['inventors'], meta=['patent_number', 'patent_date'])
df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
df_inventors.sort_values(by=['patent_date'])
df_inventors.head(3)

Unnamed: 0,inventor_id,patent_number,patent_date
0,7788103-1,10229106,2019-03-12
1,8352247-1,10229109,2019-03-12
2,8515750-2,10229109,2019-03-12


In [91]:
df.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_city,patent_firstnamed_assignee_state,...,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_city,patent_firstnamed_inventor_state,patent_firstnamed_inventor_country,patent_year,patent_type,patent_kind,inventors,patent_title_abstract,idx
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,42.5047|-71.1961,42.5047,-71.1961,Burlington,MA,...,-71.2097,Newton,MA,US,2019,utility,B2,"[{'inventor_city': 'Newton', 'inventor_country...",Initializing a workspace for building a natura...,0
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,37.3861|-122.0828,37.3861,-122.083,Mountain View,CA,...,8.5287,Adliswil,,CH,2019,utility,B1,"[{'inventor_city': 'Adliswil', 'inventor_count...","Allowing spelling of arbitrary words Methods, ...",1
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,47.6064|-122.3308,47.6064,-122.331,Seattle,WA,...,-122.331,Seattle,WA,US,2019,utility,B1,"[{'inventor_city': 'Seattle', 'inventor_countr...",Leveraging content dimensions during the trans...,2


In [92]:
# TODO (Lee) - resolve workaround
df_idx = df
df_idx['idx'] = df.index
df_idx
df_idx_1 = df_idx[['patent_number', 'idx', 'inventors']]
df_idx_2 = df_idx_1.set_index('patent_number')
df_idx_2.pop('inventors')
df_idx_2
df_pat_idx = df_idx_2.T.to_dict('records')
for i in df_pat_idx:
    df_pat_idx = dict(i)
df_pat_idx

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


{'10229106': 0,
 '10229109': 1,
 '10229113': 2,
 '10229156': 3,
 '10229173': 4,
 '10229187': 5,
 '10229189': 6,
 '10229673': 7,
 '10229678': 8,
 '10229679': 9,
 '10229680': 10,
 '10229687': 11,
 '10230677': 12,
 '10230680': 13,
 '10223353': 14,
 '10223355': 15,
 '10223356': 16,
 '10223440': 17,
 '10223445': 18,
 '10223934': 19,
 '10224030': 20,
 '10224035': 21,
 '10224119': 22,
 '10225227': 23,
 '10216719': 24,
 '10216725': 25,
 '10216735': 26,
 '10216736': 27,
 '10216832': 28,
 '10217059': 29,
 '10217377': 30,
 '10217462': 31,
 '10210151': 32,
 '10210156': 33,
 '10210178': 34,
 '10210244': 35,
 '10210245': 36,
 '10210249': 37,
 '10212338': 38,
 '10204032': 39,
 '10204097': 40,
 '10204098': 41,
 '10204146': 42,
 '10204225': 43,
 '10204627': 44,
 '10198069': 45,
 '10198432': 46,
 '10198433': 47,
 '10198479': 48,
 '10198698': 49,
 '10199039': 50,
 '10191721': 51,
 '10191734': 52,
 '10191946': 53,
 '10191970': 54,
 '10191999': 55,
 '10192070': 56,
 '10192425': 57,
 '10192543': 58,
 '10193

In [93]:
df_pat_idx = df_idx_2.T.to_dict('records')
for i in df_pat_idx:
    df_pat_idx = dict(i)
df_pat_idx

{'10229106': 0,
 '10229109': 1,
 '10229113': 2,
 '10229156': 3,
 '10229173': 4,
 '10229187': 5,
 '10229189': 6,
 '10229673': 7,
 '10229678': 8,
 '10229679': 9,
 '10229680': 10,
 '10229687': 11,
 '10230677': 12,
 '10230680': 13,
 '10223353': 14,
 '10223355': 15,
 '10223356': 16,
 '10223440': 17,
 '10223445': 18,
 '10223934': 19,
 '10224030': 20,
 '10224035': 21,
 '10224119': 22,
 '10225227': 23,
 '10216719': 24,
 '10216725': 25,
 '10216735': 26,
 '10216736': 27,
 '10216832': 28,
 '10217059': 29,
 '10217377': 30,
 '10217462': 31,
 '10210151': 32,
 '10210156': 33,
 '10210178': 34,
 '10210244': 35,
 '10210245': 36,
 '10210249': 37,
 '10212338': 38,
 '10204032': 39,
 '10204097': 40,
 '10204098': 41,
 '10204146': 42,
 '10204225': 43,
 '10204627': 44,
 '10198069': 45,
 '10198432': 46,
 '10198433': 47,
 '10198479': 48,
 '10198698': 49,
 '10199039': 50,
 '10191721': 51,
 '10191734': 52,
 '10191946': 53,
 '10191970': 54,
 '10191999': 55,
 '10192070': 56,
 '10192425': 57,
 '10192543': 58,
 '10193

In [94]:
df_inv_test = json_normalize(results['patents'], record_path=['inventors'], meta=['patent_number', 'patent_date'])
df_inv_test.head(3)

Unnamed: 0,inventor_city,inventor_country,inventor_county,inventor_county_fips,inventor_first_name,inventor_first_seen_date,inventor_id,inventor_key_id,inventor_last_name,inventor_last_seen_date,...,inventor_lastknown_state,inventor_latitude,inventor_location_id,inventor_longitude,inventor_sequence,inventor_state,inventor_state_fips,inventor_total_num_patents,patent_number,patent_date
0,Newton,US,Middlesex,25017,Jeffrey N.,2010-08-31,7788103-1,2490444,Marcus,2019-03-12,...,MA,42.3369,42.3369|-71.2097,-71.2097,0,MA,25,10,10229106,2019-03-12
1,Adliswil,CH,,0,Evgeny A.,2013-01-08,8352247-1,2759779,Cherepanov,2019-03-12,...,,47.3119,47.3119|8.5287,8.5287,0,,0,13,10229109,2019-03-12
2,Jersey City,US,Hudson,34017,Petar,2013-08-20,8515750-2,2837991,Aleksic,2019-03-12,...,NJ,40.7281,40.7281|-74.0781,-74.0781,3,NJ,34,21,10229109,2019-03-12


In [95]:
df_idx_pat_inv_map = df[['patent_number', 'inventors']]
df_idx_pat_inv_map.head(3)

Unnamed: 0,patent_number,inventors
0,10229106,"[{'inventor_city': 'Newton', 'inventor_country..."
1,10229109,"[{'inventor_city': 'Adliswil', 'inventor_count..."
2,10229113,"[{'inventor_city': 'Seattle', 'inventor_countr..."


In [96]:
# TODO (Lee) - find out how to get list of patents_view_field names from API - need to replicate response from api with fields

In [106]:
df.patent_title_abstract[0]

'Initializing a workspace for building a natural language understanding system Designing a natural language understanding (NLU) model for an application from scratch can be difficult for non-experts. A system can simplify the design process by providing an interface allowing a designer to input example usage sentences and build an NLU model based on presented matches to those example sentences. In one embodiment, a method for initializing a workspace for building an NLU system includes parsing a sample sentence to select at least one candidate stub grammar from among multiple candidate stub grammars. The method can include presenting, to a user, respective representations of the candidate stub grammars selected by the parsing of the sample sentence. The method can include enabling the user to choose one of the respective representations of the candidate stub grammars. The method can include adding to the workspace a stub grammar corresponding to the representation of the candidate stub

In [107]:
df[:3]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_city,patent_firstnamed_assignee_state,...,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_city,patent_firstnamed_inventor_state,patent_firstnamed_inventor_country,patent_year,patent_type,patent_kind,inventors,patent_title_abstract,idx
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,42.5047|-71.1961,42.5047,-71.1961,Burlington,MA,...,-71.2097,Newton,MA,US,2019,utility,B2,"[{'inventor_city': 'Newton', 'inventor_country...",Initializing a workspace for building a natura...,0
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,37.3861|-122.0828,37.3861,-122.083,Mountain View,CA,...,8.5287,Adliswil,,CH,2019,utility,B1,"[{'inventor_city': 'Adliswil', 'inventor_count...","Allowing spelling of arbitrary words Methods, ...",1
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,47.6064|-122.3308,47.6064,-122.331,Seattle,WA,...,-122.331,Seattle,WA,US,2019,utility,B1,"[{'inventor_city': 'Seattle', 'inventor_countr...",Leveraging content dimensions during the trans...,2


In [108]:
df_inventors.set_index('inventor_id').T.to_dict('list')

  """Entry point for launching an IPython kernel.


{'7788103-1': ['8903712', '2014-12-02'],
 '8352247-1': ['9514743', '2016-12-06'],
 '8515750-2': ['9971758', '2018-05-15'],
 '8849675-1': ['9971758', '2018-05-15'],
 '9514743-1': ['9514743', '2016-12-06'],
 '9971758-5': ['9971758', '2018-05-15'],
 '10223356-1': ['10223356', '2019-03-05'],
 '10223356-2': ['10223356', '2019-03-05'],
 '10223356-3': ['10223356', '2019-03-05'],
 '10223356-5': ['10223356', '2019-03-05'],
 '9177341-1': ['10223356', '2019-03-05'],
 '9959271-8': ['10223356', '2019-03-05'],
 '8281187-1': ['10095736', '2018-10-09'],
 '9442919-1': ['10002124', '2018-06-19'],
 '9442919-2': ['9996526', '2018-06-12'],
 '9442919-3': ['10002124', '2018-06-19'],
 '9442919-5': ['9996526', '2018-06-12'],
 '8560468-5': ['9589060', '2017-03-07'],
 '9037568-4': ['9589060', '2017-03-07'],
 '9292545-3': ['9589060', '2017-03-07'],
 '9589060-4': ['9589060', '2017-03-07'],
 '9589060-5': ['9589060', '2017-03-07'],
 '9589060-6': ['9589060', '2017-03-07'],
 '10229187-1': ['10229189', '2019-03-12'],
 

In [100]:
# for k, v in pat2inv.items():
#     name_dict[new_key] = name_dict.pop(k)
#     time.sleep(4)

# pprint.pprint(name_dict)

# d = {'x':1, 'y':2, 'z':3}
# d1 = {'x':'a', 'y':'b', 'z':'c'}

# dict((d1[key], value) for (key, value) in d.items())
# {'a': 1, 'b': 2, 'c': 3}

In [101]:
pat2inv = {k: list(v) for k,v in df_inventors.groupby("patent_number")["inventor_id"]}
pat2inv

{'10002124': ['9442919-1', '9442919-2', '9442919-3', '9442919-5'],
 '10002129': ['9919723-2'],
 '10002188': ['8117606-2', '8881104-1', '9245015-2'],
 '10002201': ['6672422-2', '7343297-3', '7401072-2', 'D529036-3'],
 '10002607': ['10002607-1',
  '10002607-3',
  '10002607-4',
  '10002607-5',
  '10002607-6',
  '10002607-7',
  '10002607-8',
  '9002798-2'],
 '10003559': ['9740579-3', '9916318-3'],
 '10003683': ['10003683-1',
  '10003683-2',
  '10003683-3',
  '10003683-4',
  '10003683-5',
  '10003683-6',
  '10003683-7',
  '10003683-8',
  '10003683-9',
  '6288759-3'],
 '10007658': ['7088873-1', '9817812-1', '9817812-2'],
 '10007659': ['4847805-1', '6823309-4', '9870768-2'],
 '10007660': ['7475015-5', '9047560-3', '9318109-3', '9690776-4'],
 '10009462': ['10009462-1', '10009462-2', '9740765-3'],
 '10013266': ['10013266-1', '10013266-2'],
 '10013404': ['7549132-1', '8214693-3', '8676787-1', '9129213-2'],
 '10013416': ['10013416-1',
  '10013416-10',
  '10013416-11',
  '10013416-12',
  '10013416

In [102]:
patdf2inv = dict((df_pat_idx[key], value) for (key, value) in pat2inv.items())
patdf2inv

{273: ['9442919-1', '9442919-2', '9442919-3', '9442919-5'],
 274: ['9919723-2'],
 275: ['8117606-2', '8881104-1', '9245015-2'],
 276: ['6672422-2', '7343297-3', '7401072-2', 'D529036-3'],
 277: ['10002607-1',
  '10002607-3',
  '10002607-4',
  '10002607-5',
  '10002607-6',
  '10002607-7',
  '10002607-8',
  '9002798-2'],
 278: ['9740579-3', '9916318-3'],
 279: ['10003683-1',
  '10003683-2',
  '10003683-3',
  '10003683-4',
  '10003683-5',
  '10003683-6',
  '10003683-7',
  '10003683-8',
  '10003683-9',
  '6288759-3'],
 269: ['7088873-1', '9817812-1', '9817812-2'],
 270: ['4847805-1', '6823309-4', '9870768-2'],
 271: ['7475015-5', '9047560-3', '9318109-3', '9690776-4'],
 272: ['10009462-1', '10009462-2', '9740765-3'],
 258: ['10013266-1', '10013266-2'],
 259: ['7549132-1', '8214693-3', '8676787-1', '9129213-2'],
 260: ['10013416-1',
  '10013416-10',
  '10013416-11',
  '10013416-12',
  '10013416-2',
  '10013416-3',
  '10013416-4',
  '10013416-6',
  '10013416-7',
  '10013416-8',
  '10013416-9

In [103]:
idx_pat_map = df.patent_number.to_dict()
idx_pat_map = {str(key): value for key, value in idx_pat_map.items()}
import itertools
x = list(itertools.islice(idx_pat_map.items(), 0, 4))
x[:4]

[('0', '10229106'), ('1', '10229109'), ('2', '10229113'), ('3', '10229156')]

#### Construct author-topic model

In [110]:
# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus,
                         doc2author=patdf2inv,
                         id2word=id_to_word, 
                         num_topics=25)

IndexError: list index out of range

In [64]:
# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

NameError: name 'model_at' is not defined

In [None]:
# inspect topic distribution for author with id# 7788103-1
# each topic has a probability of being expressed given the particular author, but only the ones above a certain threshold are shown.

model_at['7788103-1']

In [None]:
# def show_author(name):
#     print('\n%s' % name)
#     print('Docs:', model.author2doc[name])
#     print('Topics:')
#     pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [None]:
# calculate per-word bound, which is a measure of the model's predictive performance (reconstruction error?)

build doc2author dictionary

doc2author = atmodel.construct_doc2author(model.corpus, model.author2doc)

In [None]:

doc2author = atmodel.construct_doc2author(model.corpus, model.author2doc)

In [None]:
gensim.models.atmodel.construct_author2doc(doc2author)
# construct mapping from author IDs to document IDs.

Parameters:	doc2author (dict of (int, list of str)) – Mapping of document id to authors.
Returns:	Mapping of authors to document ids.
Return type:	dict of (str, list of int)

In [None]:
gensim.models.atmodel.construct_doc2author(corpus, author2doc)
construct mapping from document IDs to author IDs

Parameters:	
corpus (iterable of list of (int, float)) – Corpus in BoW format.
author2doc (dict of (str, list of int)) – Mapping of authors to documents.
Returns:	
Document to Author mapping.

Return type:	
dict of (int, list of str)