## Techniche - Topic Model

In [158]:
import pandas as pd
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
import requests
import re

from topic_model import tokenize_docs

import matplotlib.pyplot as plt
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
from pprint import pprint

In [2]:
np.random.seed(3)

In [3]:
# uncomment to download stop words from nltk
# nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### import data from PatentsView API

In [4]:
# pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 50)

# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

# build list of possible fields that endpoint request will return
df = pd.read_excel("data/patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()

# build query
query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# request and results
resp = requests.get(endpoint_url, params=params)
results = resp.json()

#### structure data

In [5]:
# extract metadata from response
print("status code:", resp.status_code,';', "reason:", resp.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

# extract data from response
data = results['patents']
# data[0]
df = pd.DataFrame(data)
df.head(3)

# df.columns

df = df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id', 
         'patent_year', 
         'patent_type', 
         'patent_kind']]

df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

df.patent_firstnamed_assignee_id.value_counts()[:10]

# list of assignees with > 20 patents in df dataset
assignees_list = ['org_q9Bn28RHhpYrQjKvraAH', 'org_JZguWDMfFOBX2wBI9pnD', 'org_ID497r4tFbCIaMBjGAST', 
                  'org_rDyHZBYWMcBEtnkHt05L', 'org_p6ofWD2xFNSnyYkj6wpA', 'org_EilEWQcC6UiqHcSGx9mb',
                  'org_ccMMcUijAIsKIxUqMTyP', 'org_Vbc6obpnxWM42d0HjlXY', 'org_9D8x1qL3IRASp6GG7Glu',
                  'org_2wAdIFKssfcLHpZq0u4H', 'org_iwO2oOJ6VIBd9fAuP7G6', 'org_70D1lR89kQnFiCFdJ6s5',
                  'org_vojVnDkT9CamDETqbqJC']

df_20pats = df[df['patent_firstnamed_assignee_id'].isin(assignees_list) ]

df_20pats.sort_values(by=['patent_date'], inplace=True)

df_20pats[['patent_number','patent_title_abstract', 'patent_firstnamed_assignee_id']].head(1)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,patent_number,patent_title_abstract,patent_firstnamed_assignee_id
2479,4502128,Translation between natural languages An input...,org_70D1lR89kQnFiCFdJ6s5


#### Partition data into train and test sets

In [6]:
# partition data
train_20pats = df_20pats[:894]
len(train_20pats)

test_20pats = df_20pats[894:]
len(test_20pats)

# convert text column to list
data = train_20pats.patent_title_abstract.values.tolist()
len(data)

224

In [14]:
# tokenize documents

def tokenize_docs(docs):
    tokenized_docs = []
    for doc in docs:
        tokenized_docs.append(word_tokenize(doc))
    return tokenized_docs

tokenized_docs = tokenize_docs(data)

In [150]:
# clean punctuation
def clean_docs(tokenized_docs):
    clean_docs = []
    for doc in tokenized_docs:
       clean_docs.append([word for word in doc if word.isalpha()])  
    return clean_docs

In [151]:
data = clean_docs(tokenized_docs)

In [154]:
data

[['Translation',
  'between',
  'natural',
  'languages',
  'An',
  'input',
  'sentence',
  'described',
  'by',
  'a',
  'first',
  'natural',
  'language',
  'is',
  'sectioned',
  'into',
  'individual',
  'words',
  'Parts',
  'of',
  'speech',
  'corresponding',
  'to',
  'the',
  'individual',
  'words',
  'are',
  'retrieved',
  'from',
  'a',
  'lexical',
  'word',
  'storage',
  'whereby',
  'the',
  'input',
  'sentence',
  'is',
  'described',
  'by',
  'a',
  'corresponding',
  'string',
  'of',
  'the',
  'as',
  'retrieved',
  'A',
  'translation',
  'pattern',
  'table',
  'is',
  'previously',
  'prepared',
  'which',
  'defines',
  'correspondence',
  'between',
  'patterns',
  'of',
  'strings',
  'of',
  'for',
  'the',
  'first',
  'natural',
  'language',
  'and',
  'those',
  'for',
  'a',
  'second',
  'natural',
  'language',
  'by',
  'which',
  'corresponding',
  'output',
  'sentence',
  'is',
  'to',
  'be',
  'described',
  'By',
  'referring',
  'to',
  '

In [152]:
# convert to lowercase
def lower_words(docs):
    lowered_words = []
    for doc in docs:
        lowered_words.append([word.lower() for word in doc])
    return lowered_words

In [153]:
lower_words(data)

[['translation',
  'between',
  'natural',
  'languages',
  'an',
  'input',
  'sentence',
  'described',
  'by',
  'a',
  'first',
  'natural',
  'language',
  'is',
  'sectioned',
  'into',
  'individual',
  'words',
  'parts',
  'of',
  'speech',
  'corresponding',
  'to',
  'the',
  'individual',
  'words',
  'are',
  'retrieved',
  'from',
  'a',
  'lexical',
  'word',
  'storage',
  'whereby',
  'the',
  'input',
  'sentence',
  'is',
  'described',
  'by',
  'a',
  'corresponding',
  'string',
  'of',
  'the',
  'as',
  'retrieved',
  'a',
  'translation',
  'pattern',
  'table',
  'is',
  'previously',
  'prepared',
  'which',
  'defines',
  'correspondence',
  'between',
  'patterns',
  'of',
  'strings',
  'of',
  'for',
  'the',
  'first',
  'natural',
  'language',
  'and',
  'those',
  'for',
  'a',
  'second',
  'natural',
  'language',
  'by',
  'which',
  'corresponding',
  'output',
  'sentence',
  'is',
  'to',
  'be',
  'described',
  'by',
  'referring',
  'to',
  '

#### Clean stopwords

In [51]:
# clean stopwords

stop_words = stopwords.words('english')

def remove_stopwords(clean_docs):
    filtered_docs = []
    for doc in clean_docs:
       filtered_docs.append([word for word in doc if word not in stop_words])
    return filtered_docs

# remove stopwords
cleaned_data = remove_stopwords(data)

#### Create bigrams and trigrams

In [34]:
# train bigram phrases model
bigram_model = Phrases(data, min_count=1, threshold=1)

# train trigram phrases model
trigram_model = Phrases(bigram_model[data], threshold=100)  

In [81]:
# bigrams
def bigrams(docs):
    """create bigrams"""
    return [bigram_model[doc] for doc in docs]

In [71]:
# initiatialize bigram and trigram models
bigram_model = gensim.models.phrases.Phraser(bigram_model)
trigram_model = gensim.models.phrases.Phraser(trigram_model)

In [63]:
bigrams(cleaned_data)



[['Translation',
  'natural_languages',
  'An_input',
  'sentence_described',
  'first_natural',
  'language',
  'sectioned',
  'individual_words',
  'Parts',
  'speech',
  'corresponding',
  'individual_words',
  'retrieved',
  'lexical',
  'word',
  'storage',
  'whereby',
  'input_sentence',
  'described',
  'corresponding_string',
  'retrieved_A',
  'translation_pattern',
  'table',
  'previously',
  'prepared',
  'defines',
  'correspondence',
  'patterns',
  'strings',
  'first_natural',
  'language',
  'second_natural',
  'language',
  'corresponding',
  'output_sentence',
  'described',
  'By',
  'referring',
  'translation_pattern',
  'table',
  'string',
  'input_sentence',
  'transformed',
  'corresponding_string',
  'second_natural',
  'language',
  'The',
  'output_sentence',
  'described',
  'second_natural',
  'language',
  'generated',
  'sequencing',
  'target_words',
  'accordance',
  'sequential',
  'order',
  'parts',
  'speech',
  'string',
  'pattern',
  'obtained

In [82]:
def trigrams(docs):
    """create trigrams"""
    return [trigram_model[bigram_model[doc]] for doc in docs]

In [74]:
trigrams(cleaned_data)

[['Translation',
  'natural_languages',
  'An_input',
  'sentence_described',
  'first_natural',
  'language',
  'sectioned',
  'individual_words',
  'Parts',
  'speech',
  'corresponding',
  'individual_words',
  'retrieved',
  'lexical',
  'word',
  'storage',
  'whereby',
  'input_sentence',
  'described',
  'corresponding_string',
  'retrieved_A',
  'translation_pattern',
  'table',
  'previously',
  'prepared',
  'defines',
  'correspondence',
  'patterns',
  'strings',
  'first_natural',
  'language',
  'second_natural',
  'language',
  'corresponding',
  'output_sentence',
  'described',
  'By',
  'referring',
  'translation_pattern',
  'table',
  'string',
  'input_sentence',
  'transformed',
  'corresponding_string',
  'second_natural',
  'language',
  'The',
  'output_sentence',
  'described',
  'second_natural',
  'language',
  'generated',
  'sequencing',
  'target_words',
  'accordance',
  'sequential',
  'order',
  'parts',
  'speech',
  'string',
  'pattern',
  'obtained

#### Lemmatize

In [84]:
def lemmatize_docs(docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """lemmatize documents"""
    lemmatized_docs = []
    for doc in docs: 
        lemmatized_docs.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return lemmatized_docs

In [186]:
# for doc in cleaned_data:
#     for token in doc:
#         token.lemma_

In [94]:
# make sure your downloaded the english model with "python -m spacy download en"

In [97]:
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

In [None]:
for token in doc:
    print(token, token.lemma, token.lemma_)

In [185]:
# lemmatize_docs(cleaned_data)

#### Create corpus and dictionary

In [121]:
 # build dictionary
id_word = corpora.Dictionary(cleaned_data)

# build corpus
texts = cleaned_data

# apply term document frequency
# converts documents in corpus to bag-of-words format, a list of (token_id, token_count) tuples
corpus = [id_word.doc2bow(text) for text in texts]

In [155]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('A', 1),
  ('An', 1),
  ('By', 1),
  ('Parts', 1),
  ('The', 1),
  ('Translation', 1),
  ('accordance', 1),
  ('correspondence', 1),
  ('corresponding', 4),
  ('defines', 1),
  ('described', 4),
  ('first', 2),
  ('generated', 1),
  ('individual', 2),
  ('input', 3),
  ('language', 5),
  ('languages', 1),
  ('lexical', 1),
  ('natural', 6),
  ('obtained', 1),
  ('order', 1),
  ('output', 2),
  ('parts', 1),
  ('pattern', 3),
  ('patterns', 1),
  ('prepared', 1),
  ('previously', 1),
  ('referring', 1),
  ('retrieved', 2),
  ('second', 3),
  ('sectioned', 1),
  ('sentence', 5),
  ('sequencing', 1),
  ('sequential', 1),
  ('speech', 2),
  ('storage', 1),
  ('string', 4),
  ('strings', 1),
  ('table', 2),
  ('target', 1),
  ('transformation', 1),
  ('transformed', 1),
  ('translation', 2),
  ('whereby', 1),
  ('word', 1),
  ('words', 3)]]

In [164]:
# TODO (Lee) - deprecation warnings
# construct LDA model
# Build LDA model
model_lda = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [165]:
# print keywords in 10 topics
pprint(model_lda.print_topics())

[(7,
  '0.128*"social" + 0.088*"network" + 0.043*"parse" + 0.039*"concept" + '
  '0.035*"nodes" + 0.031*"The" + 0.022*"kernel" + 0.020*"hashtags" + '
  '0.017*"node" + 0.016*"outputs"'),
 (5,
  '0.089*"entity" + 0.087*"token" + 0.030*"quality" + 0.029*"disambiguation" + '
  '0.028*"Disclosed" + 0.027*"herein" + 0.019*"tasks" + 0.018*"state" + '
  '0.017*"signal" + 0.015*"manually"'),
 (12,
  '0.079*"NLU" + 0.043*"model" + 0.034*"used" + 0.031*"For" + 0.023*"likely" + '
  '0.022*"understanding" + 0.021*"scanned" + 0.019*"grammatical" + '
  '0.017*"using" + 0.017*"accurate"'),
 (23,
  '0.073*"document" + 0.030*"item" + 0.029*"form" + 0.029*"type" + '
  '0.023*"list" + 0.023*"statements" + 0.021*"items" + 0.021*"The" + '
  '0.019*"defined" + 0.018*"analysis"'),
 (0,
  '0.093*"message" + 0.072*"component" + 0.047*"messages" + 0.029*"string" + '
  '0.025*"metadata" + 0.022*"mode" + 0.021*"statement" + 0.017*"processors" + '
  '0.017*"generate" + 0.016*"character"'),
 (19,
  '0.080*"word" + 

In [168]:
# print keywords in 10 topics
pprint(model_lda.print_topic(0))
# the most import keywords, and the respective weight, that form topic 0 are

('0.093*"message" + 0.072*"component" + 0.047*"messages" + 0.029*"string" + '
 '0.025*"metadata" + 0.022*"mode" + 0.021*"statement" + 0.017*"processors" + '
 '0.017*"generate" + 0.016*"character"')


In [166]:
topic 0: '0.093*"message" + 0.072*"component" + 0.047*"messages" + 0.029*"string" + '
  '0.025*"metadata" + 0.022*"mode" + 0.021*"statement" + 0.017*"processors" + '
  '0.017*"generate" + 0.016*"character"'),

doc_lda = lda_model[corpus]

('0.093*"message" + 0.072*"component" + 0.047*"messages" + 0.029*"string" + '
 '0.025*"metadata" + 0.022*"mode" + 0.021*"statement" + 0.017*"processors" + '
 '0.017*"generate" + 0.016*"character"')


#### Infer topic from keywords

#### Evaluate model

In [182]:
# calculate perplexity metrics
perplexity = model_lda.log_perplexity(corpus)
perplexity

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

-6.800301327541019

In [172]:
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=data, dictionary=id_word, coherence='c_v')

In [181]:
# produces coherence_per_topic
coherence_1 = coherence.get_coherence_per_topic()
coherence_1

[0.4106267971162076,
 0.31722446067803695,
 0.5478579066803337,
 0.4132573830140969,
 0.32067875597825285,
 0.5294790508063428,
 0.49700275912588465,
 0.4066938157686664,
 0.26671360789845366,
 0.25329437453386394,
 0.5161361938345662,
 0.5091895950376919,
 0.35429888480919225,
 0.28219213238278434,
 0.47898701912650177,
 0.36814798745852895,
 0.22946963311075047,
 0.3229132041847006,
 0.29459466613983126,
 0.40453055915491465,
 0.363053418841915,
 0.24193727209161947,
 0.6471154823395759,
 0.45047726778912206,
 0.31518705756004844]

In [183]:
# explore topics
pyLDAvis.enable_notebook()

In [184]:
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus, id_word)
viz_topics_1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Implement Mallet model

In [191]:
# uncomment to download Mallet topic model
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# update this path
path_mallet = 'data/mallet-2.0.8/bin/mallet'

In [189]:
model_lda_mallet = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=20, id2word=id_word)

NameError: name 'path_mallet' is not defined

In [None]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)