A test run of the fantastic pyLDAvis library for visualizing topic models

In [1]:
import re
import sys
import string
import itertools
from collections import Counter

import numpy as np
import pandas as pd
from nltk import tokenize
from nltk.corpus import brown
from nltk.corpus import stopwords, wordnet
from nltk.tag.perceptron import PerceptronTagger
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

#import pyLDAvis.gensim
from gensim import corpora, models

PRINTABLE = set(string.printable)
STOPLIST = set(stopwords.words("english")) | {"'ve", "'m", "'s", "'re", "'d", "'ll"}

In [2]:
def process_stemming_pos(text,
                        pos_filter=set("VNJ"),
                        stemming=True,
                        stoplist=STOPLIST,
                        sent_tokenize=sent_tokenize,
                        word_tokenize=word_tokenize,
                        lemmantize=WordNetLemmatizer):

    """
    Performs work stemming, lemmatization, stopword removal, 
    and optionally keeps verbs, nouns, and adjectives only
    """
    lemmatize_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ}
    wl = WordNetLemmatizer()

    def stem_word_pos_pair(word, pos):
        v = pos[0].upper()
        if v != "V" and v != "N" and v != "J":
            return word, pos
        else:
            return wl.lemmatize(word, lemmatize_map[v]), pos

    try:
        sents_tokens_pos = [pos_tag(word_tokenize(sent))
                            for sent in sent_tokenize(text)]
        tokens_pos = list(itertools.chain(*sents_tokens_pos))

        if stoplist is not None:
            tokens_pos = filter(lambda pair: pair[0] not in stoplist, tokens_pos)
        if stemming:
            tokens_pos = [stem_word_pos_pair(*pair) for pair in tokens_pos]
        if pos_filter is not None:
            tokens_pos = filter(lambda pair: pair[1][0] in pos_filter, tokens_pos)

        if len(tokens_pos) == 0:
            return ""
        text = " ".join(zip(*tokens_pos)[0])
    except Exception as inst:
        print "semantic_processing: %s\ninput: %r" % (inst, text)
        sys.exit(1)

    return text

def process_corpus(corpus):
    """
    1. Concatenate sentences (list of words) in paragraphs into a single list of words
    2. Join the list of words separated by space
    3. Process the text of paragraph by keeping only verbs, nouns and adjectives
    4. Remove words that only occur once in the corpus
    5. Return each paragraph as a list of words
    """
    corpus = [" ".join(list(itertools.chain(*para))) for para in corpus]
    corpus = [process_stemming_pos(para, pos_filter=set("VNJ"),
                                   stemming=True, stoplist=STOPLIST) for para in corpus]
    corpus = [para.split() for para in corpus]

    freq = Counter()
    for para in corpus:
        freq.update(para)
    
    processed_corpus = [[token for token in para if freq[token] > 1] for para in corpus]
    return processed_corpus

In [3]:
print brown.categories()

[u'adventure', u'belles_lettres', u'editorial', u'fiction', u'government', u'hobbies', u'humor', u'learned', u'lore', u'mystery', u'news', u'religion', u'reviews', u'romance', u'science_fiction']


Concatenate sentences within paragraph:

In [4]:
corpus = brown.paras(categories=["science_fiction", "government"])

processed_corpus = process_corpus(corpus)

dictionary = corpora.Dictionary(processed_corpus)
corpus_count = [dictionary.doc2bow(text) for text in processed_corpus]
tfidf = models.TfidfModel(corpus_count)
corpus_tfidf = tfidf[corpus_count]
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=20, alpha=0.0001, passes=100)

In [5]:
lda.show_topics()

[(5,
  u'0.005*"stay" + 0.005*"shelter" + 0.005*"long" + 0.004*"blockade" + 0.003*"Digby" + 0.003*"request" + 0.003*"calm" + 0.003*"Hays" + 0.002*"legislative" + 0.002*"admit"'),
 (3,
  u'0.008*"speak" + 0.008*"close" + 0.006*"fiscal" + 0.004*"pause" + 0.004*"roof" + 0.004*"Hesperus" + 0.004*"get" + 0.004*"city" + 0.004*"temperature" + 0.003*"block"'),
 (9,
  u'0.005*"govern" + 0.004*"fine" + 0.004*"temporary" + 0.003*"thing" + 0.003*"Court" + 0.003*"tray" + 0.003*"Source" + 0.003*"Data" + 0.003*"state" + 0.003*"fact"'),
 (17,
  u'0.006*"Jr." + 0.006*"Notte" + 0.006*"A." + 0.005*"John" + 0.005*"Day" + 0.004*"teach" + 0.004*"Do" + 0.004*"allowance" + 0.004*"Rhode" + 0.003*"Governor"'),
 (2,
  u'0.019*"Governor" + 0.011*"shelter" + 0.007*"entrance" + 0.005*"basement" + 0.004*"fallout" + 0.004*"ground" + 0.004*"SBA" + 0.004*"family" + 0.004*"concrete" + 0.003*"case"'),
 (7,
  u'0.011*"Rayburn" + 0.010*"Sam" + 0.008*"American" + 0.006*"rate" + 0.006*"great" + 0.005*"Speaker" + 0.004*"today

In [7]:
import pyLDAvis
from pyLDAvis import gensim

data = gensim.prepare(lda, corpus_tfidf, dictionary)
pyLDAvis.display(data)

In pyLDAvis, the distance between topics (represented by probability distribution of words) is computed as Jensen-Shannon Divergence, and then MDS is used to project topics into 2-D space for visualization.

There appear to be three clusters 

1. topic #1
2. topic #2
3. topic #3 through #20

Note that we only included paragraphs of articles in **science fiction** and **government**, and the resulting plot shows that topic Num. 1 and 2 are well separated from all the other topics. Taking a closer look at the word distributions of each topics, we can see that topic Num. 1 and 2 correponds to discussions about politics related issues, while topic Num. 3 through 20 appear to be more diverse. 