In [1]:
from microscopes.common.rng import rng
from microscopes.lda.definition import model_definition
from microscopes.lda.model import initialize
from microscopes.lda.testutil import toy_dataset
from microscopes.lda import model, runner
from collections import Counter

import itertools
import numpy as np
import pyLDAvis
import scipy as sp
import numpy as np
import re
import simplejson

In [3]:
N, V = 10, 3
defn = model_definition(N, V)
data = toy_dataset(defn)
prng = rng()

latent = initialize(defn, data, prng)
r = runner.runner(defn, data, latent)
r.run(prng, 1000)

print "Topics Found:", latent.ntopics()
print "Assignments:"
for doc in latent.assignments():
    print "\t", doc
del latent
del r
del data
del defn

Topics Found: 2
Assignments:
	[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
	[2, 2, 2, 2, 2]
	[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
	[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
	[2, 2, 2, 2, 2, 2, 2, 2]
	[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
	[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
	[2, 2, 2]
	[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
	[2, 2, 2, 2, 2, 2, 2, 2, 2]


### Visualize Daily Kos Topics

In [4]:
def get_vis_data(latent, num_docs, num_to_word):
    sorted_num_vocab = sorted(num_to_word.keys())
    
    topic_term_distribution = []
    for topic in latent.word_distribution(prng):
        topic_term_distribution.append([topic[word_id] for word_id in sorted_num_vocab])
        
    doc_topic_distribution = latent.document_distribution()
    
    doc_lengths = [len(doc) for doc in num_docs]
    
    vocab = [num_to_word[k] for k in sorted_num_vocab]
    assert all(map(len, vocab))
    
    ctr = Counter(list(itertools.chain.from_iterable(num_docs)))
    term_frequency = [ctr[num] for num in sorted_num_vocab]
    
    return {'topic_term_dists': topic_term_distribution, 
            'doc_topic_dists': doc_topic_distribution,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_frequency}

In [5]:
with open("docword.kos.txt", "r") as f:
    kos_raw = [map(int, _.strip().split()) for _ in f.readlines()][3:]

    docs = []
for _, grp in itertools.groupby(kos_raw, lambda x: x[0]):
    doc = []
    for _, word_id, word_cnt in grp:
        doc += word_cnt * [word_id - 1]
    docs.append(doc)

In [6]:
with open("vocab.kos.txt", "r") as f:
    kos_vocab = [word.strip() for word in f.readlines()]
id_to_word = {i: word for i, word in enumerate(kos_vocab)}
word_to_id = {word: i for i, word in enumerate(kos_vocab)}

In [7]:
N, V = len(docs), len(id_to_word)
defn = model_definition(N, V)
prng = rng()
kos_latent = initialize(defn, docs, prng, 
                        vocab_hp=0.5, 
                        dish_hps={"alpha": 0.1, "gamma": 0.1})
r = runner.runner(defn, docs, kos_latent)

print "number of docs:", N, "vocabulary size:", V

number of docs: 3430 vocabulary size: 6906


In [8]:
step_size = 10
steps = 100

for _ in range(steps):
    r.run(prng, step_size)
    with open("daily-kos-summary.json", "w") as fp:
        simplejson.dump(get_vis_data(kos_latent, docs, id_to_word), fp=fp)
    print "iteration:", _ * step_size, "perplexity:", kos_latent.perplexity(), "num topics:", kos_latent.ntopics()

iteration: 0 perplexity: 1626.38094644 num topics: 13
iteration: 10 perplexity: 1596.59802725 num topics: 14
iteration: 20 perplexity: 1584.65051021 num topics: 15
iteration: 30 perplexity: 1578.50104727 num topics: 14
iteration: 40 perplexity: 1572.62513901 num topics: 15
iteration: 50 perplexity: 1568.60276275 num topics: 14
iteration: 60 perplexity: 1565.20036529 num topics: 16
iteration: 70 perplexity: 1561.41166493 num topics: 17
iteration: 80 perplexity: 1559.39319032 num topics: 17
iteration: 90 perplexity: 1557.5257607 num topics: 17
iteration: 100 perplexity: 1555.64344902 num topics: 17
iteration: 110 perplexity: 1553.91984539 num topics: 17
iteration: 120 perplexity: 1552.30543595 num topics: 16
iteration: 130 perplexity: 1551.04745724 num topics: 16
iteration: 140 perplexity: 1550.62808228 num topics: 16
iteration: 150 perplexity: 1548.31268875 num topics: 16
iteration: 160 perplexity: 1547.74869168 num topics: 16
iteration: 170 perplexity: 1545.87557144 num topics: 17
iter

In [11]:
data = get_vis_data(kos_latent, docs, id_to_word)
prepared = pyLDAvis.prepare(**data)
pyLDAvis.display(prepared)