In [13]:
from microscopes.common.rng import rng
from microscopes.lda.definition import model_definition
from microscopes.lda.model import initialize
from microscopes.lda.testutil import toy_dataset
from microscopes.lda import model, runner
from collections import Counter

import itertools
import numpy as np
import pyLDAvis
import scipy as sp
import numpy as np
import re
import simplejson

In [20]:
N, V = 10, 3
defn = model_definition(N, V)
data = toy_dataset(defn)
prng = rng()

latent = initialize(defn, data, prng)
r = runner.runner(defn, data, latent)
r.run(prng, 1000)

print "Topics Found:", latent.ntopics()
print "Assignments:"
for doc in latent.assignments():
    print "\t", doc

Topics Found: 3
Assignments:
	[1, 1, 1, 1, 1, 1, 1, 1, 1]
	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
	[2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2]
	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
	[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
	[2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2]
	[3, 1, 1, 1, 3, 1, 1, 3, 1, 3, 1, 1]
	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
	[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### Visualize Daily Kos Topics

In [11]:
def get_vis_data(latent, num_docs, num_to_word, topic_rng=None):
    """Assemble the keyword arguments passed to ``pyLDAvis.prepare``.

    Parameters
    ----------
    latent
        Model state exposing ``word_distribution(rng)`` (an iterable of
        per-topic containers indexable by word id) and
        ``document_distribution()``.
    num_docs
        Re-iterable collection of documents, each a sequence of word ids.
        It is traversed twice (lengths, then term counts), so a one-shot
        generator will not work.
    num_to_word
        Mapping from word id to the word string.
    topic_rng
        Random generator handed to ``latent.word_distribution``. When left
        as ``None`` the notebook-level ``prng`` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    dict
        Keys ``topic_term_dists``, ``doc_topic_dists``, ``doc_lengths``,
        ``vocab`` and ``term_frequency``; vocabulary-aligned entries are
        ordered by ascending word id.

    Raises
    ------
    AssertionError
        If any vocabulary entry is an empty string.
    """
    if topic_rng is None:
        # Backward-compatible fallback to the kernel-global generator so
        # existing three-argument calls keep working unchanged.
        topic_rng = prng

    # One fixed ordering of word ids keeps every vocabulary-aligned list
    # (topic rows, vocab, term_frequency) in step with the others.
    word_ids = sorted(num_to_word)

    topic_term_dists = [
        [topic[word_id] for word_id in word_ids]
        for topic in latent.word_distribution(topic_rng)
    ]

    doc_topic_dists = latent.document_distribution()

    doc_lengths = [len(doc) for doc in num_docs]

    vocab = [num_to_word[word_id] for word_id in word_ids]
    assert all(len(word) > 0 for word in vocab), "vocabulary contains an empty word"

    # Corpus-wide occurrence count of each word id; ids that never occur
    # count as 0 because Counter returns 0 for missing keys.
    term_counts = Counter(itertools.chain.from_iterable(num_docs))
    term_frequency = [term_counts[word_id] for word_id in word_ids]

    return {'topic_term_dists': topic_term_dists,
            'doc_topic_dists': doc_topic_dists,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_frequency}

In [6]:
# Parse the bag-of-words file into one flat list of word ids per document.
# The first three lines are skipped as a header; every remaining non-blank
# line is read as three integers (document id, word id, count).
# Blank lines are ignored so that a trailing newline at the end of the file
# does not produce an empty row and crash the grouping below.
with open("docword.kos.txt", "r") as f:
    kos_raw = [[int(tok) for tok in line.split()]
               for line in itertools.islice(f, 3, None)
               if line.strip()]

# groupby only merges *consecutive* rows with the same document id, so this
# relies on the file listing each document's rows contiguously.
docs = []
for _doc_id, rows in itertools.groupby(kos_raw, lambda row: row[0]):
    doc = []
    for _row_doc_id, word_id, word_cnt in rows:
        # Repeat the word id once per occurrence; ids are shifted down by one
        # so they line up with the 0-based ids built from the vocabulary file.
        doc.extend([word_id - 1] * word_cnt)
    docs.append(doc)

In [7]:
# Read the vocabulary, one stripped entry per line of the file, and build
# lookups in both directions. A word's id is its zero-based line index.
with open("vocab.kos.txt", "r") as f:
    kos_vocab = [line.strip() for line in f]
id_to_word = dict(enumerate(kos_vocab))
# If a word appeared twice, the later index would win.
word_to_id = dict((word, idx) for idx, word in enumerate(kos_vocab))

In [8]:
N, V = len(docs), len(id_to_word)
defn = model_definition(N, V)
prng = rng()
latent = initialize(defn, docs, prng, 
                    vocab_hp=0.5, 
                    dish_hps={"alpha": 0.1, "gamma": 0.1})
r = runner.runner(defn, docs, latent)

print "number of docs:", N, "vocabulary size:", V

number of docs: 3430 vocabulary size: 6906


In [16]:
step_size = 10
steps = 20

for _ in range(steps):
    r.run(prng, step_size)
    with open("daily-kos-summary.json", "w") as fp:
        simplejson.dump(get_vis_data(latent, docs, id_to_word), fp=fp)
    print "iteration:", _ * step_size, "perplexity:", latent.perplexity(), "num topics:", latent.ntopics()

iteration: 0 perplexity: 1660.04514575 num topics: 13
iteration: 10 perplexity: 1642.88840381 num topics: 14
iteration: 20 perplexity: 1628.16751916 num topics: 14
iteration: 30 perplexity: 1621.08824537 num topics: 13
iteration: 40 perplexity: 1613.7293193 num topics: 14
iteration: 50 perplexity: 1607.56607523 num topics: 14
iteration: 60 perplexity: 1598.09713799 num topics: 14
iteration: 70 perplexity: 1584.36558045 num topics: 14
iteration: 80 perplexity: 1580.50486935 num topics: 14
iteration: 90 perplexity: 1577.8944629 num topics: 15
iteration: 100 perplexity: 1574.40470209 num topics: 15
iteration: 110 perplexity: 1571.62380585 num topics: 15
iteration: 120 perplexity: 1570.25337662 num topics: 15
iteration: 130 perplexity: 1560.67636844 num topics: 16
iteration: 140 perplexity: 1556.59002989 num topics: 17
iteration: 150 perplexity: 1555.3368302 num topics: 16
iteration: 160 perplexity: 1553.96941592 num topics: 16
iteration: 170 perplexity: 1552.24021544 num topics: 16
iterat

In [18]:
# Build the pyLDAvis inputs from the current Daily Kos model state and render
# the visualisation inline. A dedicated name is used instead of `data`,
# which the toy-example cell binds to the toy corpus, so the two different
# kinds of object never share a name in the kernel.
vis_data = get_vis_data(latent, docs, id_to_word)
prepared = pyLDAvis.prepare(**vis_data)
pyLDAvis.display(prepared)