## Topic model

In [3]:
import pandas as pd
import numpy as np

from test_model import (get_patent_fields_list, get_ml_patents, 
                        create_title_abstract_col,trim_data, 
                        structure_dataframe, partition_dataframe, 
                        build_pipeline, process_docs, pat_inv_map, get_topics)
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary, mmcorpus
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamodel import LdaModel
from gensim.models import AuthorTopicModel
from gensim.test.utils import common_dictionary, datapath, temporary_file
from smart_open import smart_open

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, punkt, RegexpTokenizer, wordpunct_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

import json
from pandas.io.json import json_normalize
import requests
import re
import os
import calendar
import requests
from bs4 import BeautifulSoup
import pickle

import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint

%load_ext autoreload
%autoreload 2

In [4]:
# TODO (Lee) - resolve deprecation warnings
import warnings; warnings.simplefilter('ignore')

In [5]:
np.random.seed(3)

### Acquire data

In [6]:
%%capture
# acquire dataset of ML patents by call to PatentsView API 
raw_data_1000 = get_ml_patents(pats_per_page=1000)
raw_data_2000 = get_ml_patents(pats_per_page=2000)

#### Acquire data - Structure data - 1000 pats

In [100]:
# specify fields (key:val pairs) to retain from full api response
retained_keys = ['patent_number', 'patent_date', 'patent_title', 'patent_abstract', 'inventors']
data_1000 = trim_data(data=raw_data_1000, keys=retained_keys)

# create new key:value pair by combining values from patent_title and patent_abstract keys
data_1000 = create_title_abstract_col(data=data_1000)

# create dataframe, organize columns and sort by patent_date
df_1000 = structure_dataframe(data=data_1000)

# partition data
data_train_1000, data_test_1000 = partition_dataframe(df_1000, .8)

# convert dataframe to list
text_data_1000 = df_1000.patent_title_abstract.tolist()
text_train_1000 = data_train_1000.patent_title_abstract.tolist()
text_test_1000 = data_test_1000.patent_title_abstract.tolist()

# TODO (Lee) - this explores direct structuring from api response without df
text_list_1000 = []
for i in data_1000:
    text_list_1000.append(i['patent_title_abstract'])
# text_list

#### Acquire data - Structure data - 2000 pats

In [101]:
# specify fields (key:val pairs) to retain from full api response
retained_keys = ['patent_number', 'patent_date', 'patent_title', 'patent_abstract', 'inventors']
data_2000 = trim_data(data=raw_data_2000, keys=retained_keys)

# create new key:value pair by combining values from patent_title and patent_abstract keys
data_2000 = create_title_abstract_col(data=data_2000)

# create dataframe, organize columns and sort by patent_date
df_2000 = structure_dataframe(data=data_2000)

# partition data
data_train_2000, data_test_2000 = partition_dataframe(df_2000, .8)

# convert dataframe to list
text_data_2000 = df_2000.patent_title_abstract.tolist()
text_train_2000 = data_train_2000.patent_title_abstract.tolist()
text_test_2000 = data_test_2000.patent_title_abstract.tolist()

# TODO (Lee) - this explores direct structuring from api response without df
text_list_2000 = []
for i in data_2000:
    text_list_2000.append(i['patent_title_abstract'])
# text_list

### Pre-process text data

In [102]:
# uncomment to download stop words
# !python -m spacy download en
stop_words = stopwords.words('/Users/lee/Documents/techniche/techniche/data/stopwords/english')

# construct pipeline
nlp = build_pipeline()
print(nlp.pipe_names)

# TODO (Lee) - pre-process documents TODO (Lee) - via text_list directly from api response
# processed_docs_1 = process_docs(text_list)

# pre-process documents TODO (Lee) - via df to list
processed_docs_1000train = process_docs(text_train_1000)

### Build corpus and dictionary

# build dictionary
id_to_word_1000train = Dictionary(processed_docs_1000train)

# apply term-doc frequency (list of (token_id, token_count) tuples) to docs in corpus 
corpus_1000train = [id_to_word_1000train.doc2bow(doc) for doc in processed_docs_1000train]

# view formatted corpus
# uncomment below to view
# formatted_corpus_1000 = [[(id_to_word[id], freq) for id, freq in text] for text in corpus_1000train]
# formatted_corpus_1000
# id_to_word_1000train.token2id

['tagger', 'parser', 'ner']


## Train model #1: Genism LDA model
Model #1: implementation: Gensim LDAmodel; k_topics=5; n_docs=1000, partition = 80/20

In [103]:
# construct model #1
# TODO (Lee) - resolve deprecation warnings
model_1 = LdaModel(corpus=corpus_1000train,
                   id2word=id_to_word_1000train,
                   num_topics=5, 
                   random_state=100,
                   update_every=1,
                   chunksize=100,
                   passes=10,
                   alpha='auto',
                   per_word_topics=True)

### Model #1 - Explore and visualize

In [107]:
# explore topics visually
pyLDAvis.enable_notebook()
viz_topics_model_1 = pyLDAvis.gensim.prepare(model_1, corpus_1000train, id_to_word_1000train)
# viz_topics_model_1

In [108]:
# keywords in n topics in corpus
# uncomment below to view
# pprint(model_1.print_topics())

In [109]:
# most important keywords, and the respective weight, that form topic with index 0
# uncomment below to view
# pprint(model_1.print_topic(4))

### Model #1 - Evaluate
As unsupervised learning task, no labels with which to evaluate the "expected" prediction. There is an open research agenda on various evaluation approaches (intrinsic vs extrinsic; machine vs human-interpretable, etc., task-specific)

#### Model #1 - Evaluate - Pre-process test set

In [110]:
# pre-process documents TODO (Lee) - via df to list
processed_docs_1000test = process_docs(text_test_1000)

# build dictionary
id_to_word_1000test = Dictionary(processed_docs_1000test)

# apply term-doc frequency (list of (token_id, token_count) tuples) to docs in corpus 
corpus_1000test = [id_to_word_1000test.doc2bow(doc) for doc in processed_docs_1000test]

#### Model #1 - Evaluate - Coherence
Calculate topic coherence for topic models.
Implements 'CoherenceModel' coherence pipeline (segmentation, probability estimation, confirmation measure, aggregation) from Roeder et al., 2015. "Exploring the space of topic coherence measures", WSDM '15 Proceedings of the Eighth ACM International Conference on Web Search and Data Mining (WSDM) 2015, 399-408.

In [111]:
# calculate coherence metric for train set ((n = 800 docs/1000 docs total in dataset))
coherence_model_1train = CoherenceModel(model=model_1, 
                                        texts=processed_docs_1000train,
                                        dictionary=id_to_word_1000train,
                                        coherence='c_v')
coherence_model_1train_get = coherence_model_1train.get_coherence()
print(coherence_model_1train_get)

0.3557055926610265


In [112]:
# calculate coherence metric for test_set (n = 200 docs/100 docs total in dataset)
coherence_model_1test = CoherenceModel(model=model_1, 
                                       texts=processed_docs_1000test, 
                                       dictionary=id_to_word_1000test, 
                                       coherence='c_v')
coherence_model_1test_get = coherence_model_1test.get_coherence()
print(coherence_model_1test_get)

0.6020573135746659


In [113]:
# calculate coherence metric for each of the n topics in the test set
coherence_model_1_per_topic = coherence_model_1test.get_coherence_per_topic()
# print(coherence_model_1_per_topic)

#### Model #1 - Evaluate - Perplexity
Calculate perplexity metric. Metric calculates and returns per-word likelihood bound using a chunk of documents as evaluation corpus. Output calculated statistics, including the perplexity=2^(-bound), to log at INFO level. Returns the variational bound score calculated for each word

In [114]:
# calculate perplexity metric for model_1 train set
perplexity_model_1train = model_1.log_perplexity(corpus_1000train)
print(perplexity_model_1train)

-7.091984311693296


In [115]:
# calculate perplexity metric for model_1 test set
perplexity_model_1test = model_1.log_perplexity(corpus_1000test)
print(perplexity_model_1test)

-9.66161358304598


### Model #1 - Predict

#### Model #1 - Predict - Pickle model

In [116]:
# pickle model
pickle.dump(model_1, open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','wb'))

In [117]:
model_1 = pickle.load(open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','rb'))

#### Model #1 - inference

In [118]:
# define example text_input #1
text_input_1 = 'smart assistant transformer model translation'.split()

# define example text_input #1
text_input_2 = """At the Siri International team within Apple we bring the
Siri intelligent assistant to our customers worldwide in over 40 languages
and dialects. Join us, and tackle some of the most challenging problems in
natural language processing and large scale applied machine learning. You 
will build cutting edge natural language understanding technologies and 
deploy them on a global scale. Your work will advance and shape the future
vision of our multi-lingual, multi-cultural Siri assistant, and Search 
applications used by millions across the world Key Qualifications Extensive
track record of scientific research in NLP and Machine Learning, or similar
experience in developing language technologies for shipping products.
Strong coding and software engineering skills in a mainstream programming 
language, such as Python, Java, C/C++. Familiarity with NLP/ML tools and 
packages like Caffe, pyTorch, TensorFlow, Weka, scikit-learn, nltk, etc.
Practical experience building production quality applications related to 
natural language processing and machine learning. In-depth knowledge of 
machine learning algorithms and ability to apply them in data driven natural
language processing systems. Ability to quickly prototype ideas / solutions,
perform critical analysis, and use creative approaches for solving complex 
problems. Attention to detail and excellent communication skills. Description
We are looking for a highly motivated technologist with a strong background 
in Natural Language Processing and Machine Learning research. The ideal 
candidate will have a strong track record of taking research ideas to 
real-world applications. In this position you will apply your problem solving
skills to challenges and opportunities within Siri International, which 
involves development of large-scale language technologies for many natural
languages worldwide. The primary responsibility of this role is to conduct
research and develop innovative machine learning, artificial intelligence 
and NLP solutions for multi-lingual conversational agents. You will have 
the opportunity to investigate cutting edge research methods that will 
improve customer experience of our products and enable our engineers to 
scale these technologies across a variety of natural languages. You will 
also provide technical leadership and experiment-driven insights for 
engineering teams on their machine learning modeling and data decisions. 
You will play a central role in defining the future technical directions 
of Siri International through quick prototyping, critical analysis and 
development of core multi-lingual NLP technologies. Education & Experience
PhD in Machine Learning, Statistics, Computer Science, Mathematics or 
related field with specialization in natural language processing and/or 
machine learning, OR * Masters degree in a related field with a strong 
academic/industrial track record. * Hands-on research experience in an 
academic or industrial setting.""".split()

In [119]:
id_to_word_1000train.doc2bow(text_input_1)

[(53, 1), (166, 1), (1455, 1), (2332, 1), (3146, 1)]

In [120]:
predict_input_1 = get_topics(id_to_word_1000train.doc2bow(text_input_1), model_1, k=10)
# uncomment below to view
# predict_input_1

In [121]:
predict_input_2 = get_topics(id_to_word_1000train.doc2bow(text_input_2), model_1, k=10)
# uncomment below to view
# predict_input_2

## Train model #2: Genism LDAMallet model
Model #2: implementation: Gensim LDAMallet wrapper around LDA Mallet model; 
          k_topics=5; 
          n_docs=1000; 
          partition = 80/20

In [122]:
# uncomment to download Mallet topic model
# !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# update path
path_mallet = '/Users/lee/Documents/techniche/techniche/data/mallet-2.0.8/bin/mallet'

In [123]:
# construct model #2
# TODO (Lee) - resolve deprecation warnings
model_2 = gensim.models.wrappers.LdaMallet(path_mallet, 
                                           corpus=corpus_1000train, 
                                           num_topics=5, 
                                           id2word=id_to_word_1000train)

### Model #2 - Explore and visualize

In [125]:
# TODO (Lee) - visualization with LDA Mallet
# explore topics visually
# pyLDAvis.enable_notebook()
# viz_topics_model_2 = pyLDAvis.prepare(model_2, corpus_1000train, id_to_word_1000train)
# viz_topics_model_1

In [126]:
# keywords in n topics in corpus
# uncomment below to view
# pprint(model_2.print_topics())

In [127]:
# most important keywords, and the respective weight, that form topic with index 0
# uncomment below to view
# pprint(model_2.print_topic(4))

### Model #2- Evaluate

#### Model #2 - Evaluate - Coherence

In [128]:
# calculate coherence metric for train set ((n = 800 docs/1000 docs total in dataset))
coherence_model_2train = CoherenceModel(model=model_2, 
                                        texts=processed_docs_1000train,
                                        dictionary=id_to_word_1000train,
                                        coherence='c_v')
coherence_model_2train_get = coherence_model_2train.get_coherence()
print(coherence_model_2train_get)

0.35959575939439825


In [129]:
# calculate coherence metric for test_set (n = 200 docs/100 docs total in dataset)
coherence_model_2test = CoherenceModel(model=model_2, 
                                       texts=processed_docs_1000test, 
                                       dictionary=id_to_word_1000test, 
                                       coherence='c_v')
coherence_model_2test_get = coherence_model_2test.get_coherence()
print(coherence_model_2test_get)

0.5969635120006694


In [130]:
# calculate coherence metric for each of the n topics in the test set
coherence_model_2_per_topic = coherence_model_2test.get_coherence_per_topic()
# print(coherence_model_2_per_topic)

### Train Model #3: Gensim LDA model
Model #1: implementation: Gensim LDAmodel; k_topics=10; n_docs=1000, partition = 80/20
This model increases the k_topics from 5 to 10, relative to model #1 above

In [131]:
# construct model #3
# TODO (Lee) - resolve deprecation warnings
model_3 = LdaModel(corpus=corpus_1000train,
                   id2word=id_to_word_1000train,
                   num_topics=10, 
                   random_state=100,
                   update_every=1,
                   chunksize=100,
                   passes=10,
                   alpha='auto',
                   per_word_topics=True)

### Model #3 - Explore and visualize

In [132]:
# explore topics visually
pyLDAvis.enable_notebook()
viz_topics_model_3 = pyLDAvis.gensim.prepare(model_3, corpus_1000train, id_to_word_1000train)
# viz_topics_model_3

In [133]:
# keywords in n topics in corpus
# uncomment below to view
# pprint(model_3.print_topics())

In [134]:
# most important keywords, and the respective weight, that form topic with index 0
# uncomment below to view
# pprint(model_3.print_topic(4))

### Model #3 - Evaluate

#### Model #3 - Evaluate - Coherence

In [135]:
# calculate coherence metric for train set ((n = 800 docs/1000 docs total in dataset))
coherence_model_3train = CoherenceModel(model=model_3, 
                                        texts=processed_docs_1000train,
                                        dictionary=id_to_word_1000train,
                                        coherence='c_v')
coherence_model_3train_get = coherence_model_3train.get_coherence()
print(coherence_model_3train_get)

0.3628245805478861


In [136]:
# calculate coherence metric for test_set (n = 200 docs/100 docs total in dataset)
coherence_model_3test = CoherenceModel(model=model_3, 
                                       texts=processed_docs_1000test, 
                                       dictionary=id_to_word_1000test, 
                                       coherence='c_v')
coherence_model_3test_get = coherence_model_3test.get_coherence()
print(coherence_model_3test_get)

0.6081902191345155


In [137]:
# calculate coherence metric for each of the n topics in the test set
coherence_model_3_per_topic = coherence_model_3test.get_coherence_per_topic()
# print(coherence_model_1_per_topic)

#### Model #3 - Evaluate - Perplexity

In [138]:
# calculate perplexity metric for model_1 train set
perplexity_model_3train = model_3.log_perplexity(corpus_1000train)
print(perplexity_model_3train)

-7.0676967060500635


In [139]:
# calculate perplexity metric for model_1 test set
perplexity_model_3test = model_3.log_perplexity(corpus_1000test)
print(perplexity_model_3test)

-9.945866263106378


### Model #3 - Predict

#### Model #3 - Predict - Pickle model

In [140]:
# pickle model #3
pickle.dump(model_3, open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','wb'))

In [141]:
model_3 = pickle.load(open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','rb'))

#### Model #3 - inference

In [142]:
predict_input_1 = get_topics(id_to_word_1000train.doc2bow(text_input_1), model_3, k=10)
# uncomment below to view
# predict_input_1

In [143]:
predict_input_2 = get_topics(id_to_word_1000train.doc2bow(text_input_2), model_3, k=10)
# uncomment below to view
# predict_input_2

## Train Model #4: Gensim LDA model
Model #4: implementation: Gensim LDAmodel; k_topics=15; n_docs=1000, partition = 80/20
This model increases the k_topics to 15, relative to model #1 and model #3 above

In [152]:
# construct model #4
# TODO (Lee) - resolve deprecation warnings
model_4 = LdaModel(corpus=corpus_1000train,
                   id2word=id_to_word_1000train,
                   num_topics=15, 
                   random_state=100,
                   update_every=1,
                   chunksize=100,
                   passes=10,
                   alpha='auto',
                   per_word_topics=True)

### Model #4 - Explore and visualize

In [153]:
# explore topics visually
pyLDAvis.enable_notebook()
viz_topics_model_4 = pyLDAvis.gensim.prepare(model_4, corpus_1000train, id_to_word_1000train)
# viz_topics_model_1

In [154]:
# keywords in n topics in corpus
# uncomment below to view
# pprint(model_4.print_topics())

In [155]:
# most important keywords, and the respective weight, that form topic with index 0
# uncomment below to view
# pprint(model_4.print_topic(4))

### Model #4 - Evaluate

#### Model #4 - Evaluate - Coherence

In [156]:
# calculate coherence metric for train set ((n = 800 docs/1000 docs total in dataset))
coherence_model_4train = CoherenceModel(model=model_4, 
                                        texts=processed_docs_1000train,
                                        dictionary=id_to_word_1000train,
                                        coherence='c_v')
coherence_model_4train_get = coherence_model_4train.get_coherence()
print(coherence_model_4train_get)

0.37342906623443217


In [151]:
# calculate coherence metric for test_set (n = 200 docs/100 docs total in dataset)
coherence_model_4test = CoherenceModel(model=model_4, 
                                       texts=processed_docs_1000test, 
                                       dictionary=id_to_word_1000test, 
                                       coherence='c_v')
coherence_model_4test_get = coherence_model_4test.get_coherence()
print(coherence_model_4test_get)

KeyError: 3125

In [88]:
# calculate coherence metric for each of the n topics in the test set
coherence_model_4_per_topic = coherence_model_3test.get_coherence_per_topic()
# print(coherence_model_1_per_topic)

KeyError: 4130

#### Model #4 - Evaluate - Perplexity

In [91]:
# calculate perplexity metric for model_1 train set
perplexity_model_3train = model_3.log_perplexity(corpus_1000train)
print(perplexity_model_3train)

-7.103390272359293


In [92]:
# calculate perplexity metric for model_1 test set
perplexity_model_3test = model_3.log_perplexity(corpus_1000test)
print(perplexity_model_3test)

-10.324662145126718


### Model #4 - Predict

#### Model #3 - Predict - Pickle model

In [93]:
# pickle model
pickle.dump(model_4, open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','wb'))

In [94]:
model_4 = pickle.load(open('/Users/lee/Documents/techniche/techniche/data/model_lda_1.pkl','rb'))

#### Model #4 - inference

In [95]:
predict_input_1 = get_topics(id_to_word_1000train.doc2bow(text_input_1), model_4, k=10)
# uncomment below to view
# predict_input_1

In [96]:
predict_input_2 = get_topics(id_to_word_1000train.doc2bow(text_input_2), model_4, k=10)
# uncomment below to view
# predict_input_2

## Train Model #5: Author-topic model
Model #4: implementation: Gensim AuthorTopicModel; k_topics=15; n_docs=1000, partition = 80/20
This model increases the k_topics to 15, relative to model #1 and model #3 above

In [97]:
# construct inventor-to-doc mapping as df from nested inventors column in json api response
# df_inventors = json_normalize(data, record_path=['inventors'], meta=['patent_number', 'patent_date'])
# df_inventors = df_inventors[['inventor_id', 'patent_number', 'patent_date']]
# df_inventors.sort_values(by=['patent_date'])
# df_inventors.head(3)

In [98]:
# quick visual index to patent number mapping
# for i in data:
#     print(data.index(i), i['patent_number'])

In [99]:
# TODO (Lee) review fix to pat_inv_map, in which "patent" in mapping is idx of pat, not pat_number from api
pat2inv = pat_inv_map(data)

NameError: name 'data' is not defined

#### Construct author-topic model

In [None]:
# construct author-topic model
model_at = AuthorTopicModel(corpus=corpus,
                         doc2author=pat2inv,
                         id2word=id_to_word)

In [None]:
# construct vectors for authors
author_vecs = [model_at.get_author_topics(author) for author in model_at.id2author.values()]
author_vecs

In [None]:
# retrieve topic distribution for author using use model[name] syntax
# each topic has a probability of being expressed given the particular author, 
# but only the ones above a certain threshold are displayed

model_at['7788103-1']

In [None]:
# def show_author(name):
#     print('\n%s' % name)
#     print('Docs:', model.author2doc[name])
#     print('Topics:')
#     pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [None]:
# build mapping from inventor to patent
inv2pat = gensim.models.atmodel.construct_author2doc(pat2inv)

### Model #X - Predict

In [None]:
# prediction functions that take input of new text string, and predict topic distribution

## Appendix

#### Appendix - Model #1 - Evaluate - holdout set

In [None]:
# TODO (Lee) - evaluate on 1k documents **not** used in LDA training
doc_stream = (tokens for _, tokens in iter_wiki('./data/simplewiki-20140623-pages-articles.xml.bz2'))  # generator
test_docs = list(itertools.islice(doc_stream, 8000, 9000))

#### Appendix - Model #1 - Evaluate - Doc split

In [None]:
# TODO (Lee) - split each document into two parts, and check that 1) topics of the first half are similar to 
topics of the second 2) halves of different documents are mostly dissimilar:

In [None]:
# TODO (Lee)
def intra_inter(model, test_docs, num_pairs=10000):
    # split each test document into two halves and compute topics for each half
    part1 = [model[id2word.doc2bow(tokens[: len(tokens) / 2])] for tokens in test_docs]
    part2 = [model[id2word.doc2bow(tokens[len(tokens) / 2 :])] for tokens in test_docs]
    
    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between 10,000 random parts (lower is better):")    
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

In [None]:
# TODO (Lee)
print("LDA results:")
intra_inter(lda_model, test_docs)

#### Appendix - Model #1 - Evaluate - Log likelihood

#### Appendix - Model #1 - Evaluate - Alternate unimplemented workflow

In [None]:
# TODO (Lee)
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.wrappers.LdaMallet(path_mallet, corpus=corpus, num_topics=num_topics, id2word=id2word)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

# model_list, coherence_values = compute_coherence_values(dictionary=id_to_word, corpus=corpus, texts=data, start=2, limit=40, step=6)

#### Appendix - Model 1 - Inference - Alternate workflows

In [None]:
# `get_document_topics()` returns topic probability distribution for given document
# topic_dist_675_a = model_lda.get_document_topics(corpus[50])
# pprint(sorted(topic_dist_50_a))

In [None]:
# topicid = 3
# model_lda.get_topic_terms(topicid, topn=10)

In [None]:
# text_train[doc_id]
# doc_id = 675
# topic_dist_675_b = sorted(get_topics(corpus[doc_id], k=10)), text_train[doc_id]
# pprint(topic_dist_675_b)

In [None]:
# From Gensim example - Alternate predict workflow - Create a new corpus, made of previously unseen documents.
# other_texts = [
#      ['computer', 'time', 'graph'],
#      ['survey', 'response', 'eps'],
#      ['human', 'system', 'computer']
#  ]
# other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
# unseen_doc = other_corpus[0]
# vector = lda[unseen_doc]  # get topic probability distribution for a document
# Update the model by incrementally training on the new corpus

In [None]:
# print keywords in n topics
sorted(model_lda.show_topics(), key=lambda x: x[1])

# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[1])

# print keywords in n topics
sorted(model_l.print_topics(), key=lambda x: x[1])

# print keywords in n topics
sorted(model_1.print_topics(), key=lambda x: x[0])

# show_topic() returns n most important/relevant words, and their weights, that comprise given topic
pprint(model_1.show_topic(1, topn=10))

pprint(model_1.show_topics(num_topics=5, num_words=10))

#### Appendix - Model #2 - Evaluate - Perplexity
No implementation of log_perplexity method for LDAMallet