In [47]:
from microscopes.common.rng import rng
from microscopes.lda.definition import model_definition
from microscopes.lda.model import initialize
from microscopes.lda.testutil import toy_dataset
from microscopes.lda import model, runner
from microscopes.lda.biology_data import get_docs
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

import itertools
import numpy as np
import pyLDAvis
import scipy as sp
import numpy as np
import re

In [4]:
# Smoke test on a small synthetic corpus: N documents over a vocabulary of V words.
N = 10
V = 3
defn = model_definition(N, V)
data = toy_dataset(defn)
prng = rng()

# Build the initial latent state, wrap it in a runner, and run the sampler.
latent = initialize(defn, data, prng)
r = runner.runner(defn, data, latent)
r.run(prng, 1000)  # presumably 1000 sampler iterations -- confirm against runner.run

# Report the number of topics and the per-document assignments.
print("Topics Found: %s" % (latent.ntopics(),))
print("Assignments:")
for doc_assignments in latent.assignments():
    print("\t%s" % (doc_assignments,))

Topics Found: 1
Assignments:
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3]


### Compare with Teh on Biology Abstract Data

Parse biology abstract data

In [5]:
def load_raw(raw_data_file):
    """Return the full contents of ``raw_data_file`` as a single string.

    The file is opened in text read mode and is closed before returning.
    """
    with open(raw_data_file, "r") as source:
        return source.read()
    

def vectorize(docs, *args, **qsargs):
    """Turn each document into a list of its frequent, non-stop-word terms.

    ``docs`` is tokenized with ``CountVectorizer(stop_words='english')``; any
    extra positional or keyword arguments are forwarded to ``CountVectorizer``.
    Terms whose total count over all documents is 9 or less are dropped.
    The total count of the surviving terms is printed as ``total words``.

    ``docs`` is iterated twice (once to fit, once to size the result), so it
    must be re-iterable.

    Returns a list with one entry per document. Each entry is a list of term
    strings in which every kept term appears as many times as it was counted
    in that document. The lists are rebuilt from the count matrix, so the
    original word order of the text is not preserved.
    """
    vectorizer = CountVectorizer(stop_words='english', *args, **qsargs)
    counts = vectorizer.fit_transform(docs)

    # Boolean mask over term columns: True where the corpus-wide count exceeds 9.
    kept_columns = np.asarray(counts.sum(axis=0) > 9)[0]
    print("total words %s" % (counts[:, kept_columns].sum(),))

    # vocabulary_ maps term -> column index; ordering the terms by that index
    # gives a list that can be looked up by column.
    term_to_column = vectorizer.vocabulary_
    vocabulary = sorted(term_to_column, key=term_to_column.get)

    vectorized_docs = [[] for _ in docs]
    rows, cols, values = sp.sparse.find(counts)
    for row, col, count in zip(rows, cols, values):
        if kept_columns[col]:
            # Repeat the term once per occurrence in this document.
            vectorized_docs[row].extend(vocabulary[col] for _ in range(count))
    return vectorized_docs


def preprocess(raw_abstracts):
    """Split a raw export into a ``{citation: abstract text}`` dictionary.

    A record is the text between ``"Citation: "`` and the next ``"----"``.
    Its key is the first line of the record. Its value is everything after
    the first ``"Abstract"`` in the record, with the first two characters
    removed (presumably the separator after the label -- confirm against the
    data file), newlines turned into spaces, runs of spaces collapsed to one,
    and surrounding whitespace stripped.

    If two records share the same first line, the later one wins.

    Raises:
        IndexError: if a record has no ``"Abstract"`` followed by text.
    """
    record_pattern = re.compile("Citation: (.+?)----", re.DOTALL)
    abstract_pattern = re.compile("Abstract(.+)", re.DOTALL)

    processed_abstracts = {}
    for record in record_pattern.findall(raw_abstracts):
        citation = record.split("\n")[0]
        # [0] picks the text after the first "Abstract"; [2:] drops two leading characters.
        body = abstract_pattern.findall(record)[0][2:]
        single_line = body.replace("\n", " ")
        processed_abstracts[citation] = re.sub("[ ]{2,}", " ", single_line).strip()
    return processed_abstracts

# Read the raw export and split it into {citation: abstract text}.
processed_abstracts = preprocess(
    load_raw("nematode biology abstracts.txt")
)
print("number of abstracts %s" % (len(processed_abstracts),))

# Only the abstract texts are vectorized; the citation keys are not carried along.
word_docs = vectorize(processed_abstracts.values())

number of abstracts 5828
total words 488903


Build the vocabulary, the word/id lookup tables, and the id-encoded documents for the biology abstracts

In [6]:
# Distinct words across all vectorized documents.
vocab = set(itertools.chain.from_iterable(word_docs))

# Hand out integer ids in sorted word order. Iterating the bare set would give
# an arbitrary order, so the word -> id encoding could differ between runs;
# sorting makes it depend only on the vocabulary itself.
word_to_num = {word: i for i, word in enumerate(sorted(vocab))}
num_to_word = {i: word for word, i in word_to_num.items()}

# Re-express every document as a list of word ids.
num_docs = [[word_to_num[word] for word in doc] for doc in word_docs]

Create model

In [10]:
# Corpus dimensions: N documents, V distinct word ids.
N = len(num_docs)
V = len(set(itertools.chain.from_iterable(num_docs)))

defn = model_definition(N, V)
prng = rng()
# NOTE(review): vocab_hp / dish_hps look like prior hyperparameters for the
# model -- confirm their meaning against the microscopes.lda documentation.
latent = initialize(
    defn,
    num_docs,
    prng,
    vocab_hp=0.5,
    dish_hps={"alpha": 0.1, "gamma": 0.1},
)
r = runner.runner(defn, num_docs, latent)

print("number of docs: %s vocabulary size: %s" % (N, V))

number of docs: 5828 vocabulary size: 5800


Run model

In [11]:
# Five rounds of r.run(prng, 10), reporting perplexity and topic count after each.
for sweep in range(5):
    r.run(prng, 10)
    print("perplexity: %s num topics: %s" % (latent.perplexity(), latent.ntopics()))

perplexity: 1231.16064334 num topics: 12
perplexity: 1199.50043136 num topics: 13
perplexity: 1181.43752104 num topics: 15
perplexity: 1171.83230063 num topics: 15
perplexity: 1166.16724273 num topics: 15


Prepare the inputs for the pyLDAvis visualization

In [48]:
# Word ids in ascending order; every per-term list below follows this order.
sorted_num_vocab = sorted(num_to_word)

# One row per entry of latent.word_distribution(prng), indexed by word id.
topic_term_distribution = [
    [topic_words[word_id] for word_id in sorted_num_vocab]
    for topic_words in latent.word_distribution(prng)
]

# Drop the first entry of every document's topic vector.
# NOTE(review): presumably index 0 is a placeholder topic slot -- confirm that
# the remaining entries still form the distribution pyLDAvis expects.
doc_topic_distribution = [
    doc_topics[1:] for doc_topics in latent.document_distribution()
]

doc_lengths = list(map(len, num_docs))
vocab = [num_to_word[word_id] for word_id in sorted_num_vocab]

# Corpus-wide occurrence count of each word id.
ctr = Counter(itertools.chain.from_iterable(num_docs))
term_frequency = [ctr[word_id] for word_id in sorted_num_vocab]

In [45]:
# Bundle the topic/term, document/topic and frequency tables for pyLDAvis.
vis = pyLDAvis.prepare(
    topic_term_distribution,
    doc_topic_distribution,
    doc_lengths,
    vocab,
    term_frequency
)

In [46]:
# Show the prepared visualization as this cell's output.
pyLDAvis.display(vis)