In [1]:
# code for loading the format for the notebook
import os

# path : store the current path to convert back to it later
path = os.getcwd()

os.chdir( os.path.join('..', 'notebook_format') )
from formats import load_style
load_style()

In [2]:
os.chdir(path)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size
plt.rcParams['font.size'] = 14 # and font size

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,gensim

Ethen 2016-10-30 21:38:03 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
matplotlib 1.5.1
gensim 0.13.1


# Topic Coherence

Topic coherence is a quantitative measure that can be used to compare difference topic models based on their human-interpretability.

## Setting up toy corpus

The toy corpus essentially has two classes of documents. First five are about human-computer interaction and the other four are about graphs. We will be using the `u_mass` and `c_v` coherence for a "good" (trained for 50 iterations) and a "bad" (trained for only 1 iteration) LDA model. Intuitively, the good LDA model should be able come up with better or more human-interpretable topics. Therefore the coherence measure for the good LDA model should be better (the higher the better) than that for the bad LDA model.

In [3]:
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]

# build the dictionary and convert each document to
# bag of words representation
dictionary = Dictionary(texts)
corpus = [ dictionary.doc2bow(text) for text in texts ]

In [4]:
# train the LDA model
lda_good = LdaModel(corpus = corpus, id2word = dictionary, iterations = 50, num_topics = 2)
lda_bad = LdaModel(corpus = corpus, id2word = dictionary, iterations = 1, num_topics = 2)

## Interpretation

After training the model, we can look at the top words associated with each topic to interpret the meaning.

In [5]:
topics = lda_good.show_topics(num_words = 10)
topics

[(0,
  '0.173*"graph" + 0.168*"trees" + 0.115*"minors" + 0.079*"interface" + 0.077*"survey" + 0.070*"computer" + 0.064*"user" + 0.062*"human" + 0.054*"system" + 0.050*"eps"'),
 (1,
  '0.177*"system" + 0.125*"user" + 0.099*"time" + 0.093*"response" + 0.092*"eps" + 0.082*"human" + 0.076*"computer" + 0.070*"survey" + 0.069*"interface" + 0.041*"trees"')]

In [6]:
topics = lda_bad.show_topics(num_words = 10)
topics

[(0,
  '0.156*"system" + 0.094*"user" + 0.093*"trees" + 0.083*"graph" + 0.080*"interface" + 0.078*"human" + 0.076*"eps" + 0.072*"survey" + 0.071*"minors" + 0.070*"time"'),
 (1,
  '0.112*"graph" + 0.103*"trees" + 0.101*"user" + 0.088*"system" + 0.086*"computer" + 0.079*"response" + 0.076*"time" + 0.075*"minors" + 0.074*"survey" + 0.070*"eps"')]

In [7]:
# we can also use pyLDAvis for interactive web visualization

# import pyLDAvis.gensim
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim.prepare(lda_good, corpus, dictionary)
# pyLDAvis.gensim.prepare(lda_bad, corpus, dictionary)

For the good LdaModel:

- Topic 1: More weight was assigned to words such as "system", "user", "eps", "interface" etc which captures the first set of documents.
- Topic 2: More weight was assigned to words such as "graph", "trees", "survey" which captures the topic in the second set of documents.

As for the badLdaModel:

- Topic 1: More weight was assigned to words such as "system", "user", "trees", "graph"
- Topic 2: More weight wa assigned to words such as "system", "trees", "graph", "user" which is similar to the first topic. 

Looking at the result, we can say the topics generated by the bad LdaMdodel are not clear enough (less human-interpretable). Next, we'll see if the topic coherence measurement's output is consistent with this result.

In [8]:
# obtain topic coherence using u_mass measure
print('u_mass')
cm_good1 = CoherenceModel(model = lda_good, corpus = corpus, 
                         dictionary = dictionary, coherence = 'u_mass')
cm_bad1 = CoherenceModel(model = lda_bad, corpus = corpus, 
                         dictionary = dictionary, coherence = 'u_mass')

print( 'good: ', cm_good1.get_coherence() )
print( 'bad: ',  cm_bad1.get_coherence() )
print()

# obtain topic coherence using c_v measure
print('c_v')
cm_good2 = CoherenceModel(model = lda_good, texts = texts, 
                          dictionary = dictionary, coherence = 'c_v')
cm_bad2 = CoherenceModel(model = lda_bad, texts = texts, 
                         dictionary = dictionary, coherence = 'c_v')

print( 'good: ', cm_good2.get_coherence() )
print( 'bad: ', cm_bad2.get_coherence() )

u_mass
good:  -13.2293293715
bad:  -14.7183268551

c_v
good:  0.372857430939
bad:  0.361089571831


As we can see, the `u_mass` and `c_v` coherence for the good LDA model is better than the bad LDA model. 

In [9]:
from gensim import interfaces
from gensim.topic_coherence import (segmentation, probability_estimation,
                                    direct_confirmation_measure, indirect_confirmation_measure,
                                    aggregation)
from gensim.matutils import argsort
from gensim.utils import is_corpus, FakeDict
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet

import numpy as np
from collections import namedtuple

boolean_document_based = ['u_mass']
make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')

coherence_dict = {
    'u_mass': make_pipeline(segmentation.s_one_pre,
                            probability_estimation.p_boolean_document,
                            direct_confirmation_measure.log_conditional_probability,
                            aggregation.arithmetic_mean)
}

In [10]:
coherence_dict['u_mass']

Coherence_Measure(seg=<function s_one_pre at 0x118237ae8>, prob=<function p_boolean_document at 0x118237d08>, conf=<function log_conditional_probability at 0x118237f28>, aggr=<function arithmetic_mean at 0x11823f378>)

In [11]:
topn = 10
model = lda_good
coherence = 'u_mass'

if coherence in boolean_document_based:
    print(1)

1


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [16]:
texts = ['human interface computer',
         'survey user computer system response time',
         'eps user interface system',
         'system human system eps',
         'user response time',
         'trees',
         'graph trees',
         'graph minors trees',
         'graph minors survey']

In [17]:
count_vec = CountVectorizer(ngram_range = (1, 2), stop_words = 'english', 
                            min_df = 2, max_df = 0.9, max_features = 10000)

X_dtm = count_vec.fit_transform(texts)
print(X_dtm.shape)

(9, 13)


In [18]:
lda = LatentDirichletAllocation(n_topics = 2, max_iter = 10, n_jobs = 1)
doc_topic_distr = lda.fit_transform(X_dtm)

In [20]:
TOPWORDS = 15

def print_top_words(lda_model, count_vec, n_top_words):
    """top words associated with each topic for the sklearn LDA model"""
    features = count_vec.get_feature_names()
    for topic_idx, topic in enumerate(lda_model.components_):
        print( 'Topic #{}:'.format(topic_idx + 1) )
        print( ', '.join([ features[i] for i in np.argsort(topic)[-n_top_words:] ]) )
        print()
    
    print()

In [21]:
print_top_words(lda_model = lda, count_vec = count_vec, n_top_words = TOPWORDS)

Topic #1:
survey, time, graph minors, minors, response time, response, user, computer, graph, interface, eps, human, trees

Topic #2:
human, eps, interface, trees, computer, minors, graph minors, response time, response, survey, time, graph, user




## Reference

- [Notebook: Gensim's topic coherence tutorial](http://nbviewer.jupyter.org/github/RaRe-Technologies/gensim/blob/develop/docs/notebooks/topic_coherence_tutorial.ipynb#topic=2&lambda=1&term=)