In [1]:
from microscopes.common.rng import rng
from microscopes.lda.definition import model_definition
from microscopes.lda.model import initialize
from microscopes.lda.testutil import toy_dataset
import numpy as np
from microscopes.lda import model, runner

import itertools

In [14]:
# Small end-to-end run of the LDA sampler on a toy dataset.
# N and V are the two arguments of model_definition; the biology-abstracts
# cell further down passes the document count and the vocabulary size in the
# same positions.
N, V = 10, 3
defn = model_definition(N, V)
# toy_dataset builds its data from the model definition alone
# (presumably synthetic documents sized to match defn -- TODO confirm).
data = toy_dataset(defn)
# NOTE(review): rng() is called without arguments, so no seed is fixed here;
# confirm whether rng accepts a seed if reproducible output is required.
prng = rng()

# The latent state and the runner are both built from the same (defn, data) pair.
latent = initialize(defn, data, prng)
r = runner.runner(defn, data, latent)
# Second argument is presumably the number of sampler iterations -- TODO confirm.
r.run(prng, 1000)

# Report the topic count and then one tab-prefixed line per element of
# latent.assignments() (the recorded output shows one list of integer topic
# ids per document).
print "Topics Found:", latent.ntopics()
print "Assignments:"
for doc in latent.assignments():
    print "\t", doc

Topics Found: 2
Assignments:
	[3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
	[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


### Compare with Teh et al. on the Nematode Biology Abstracts Data

In [16]:
# Output path: the tokenized abstracts are written here, one document per
# line with tokens separated by spaces.
processed_data_file = "preprocessed_bio_abstracts.txt"
# Input path: the raw abstracts dump that load_raw reads and preprocess parses.
# Both paths are relative, so they resolve against the current working directory.
raw_data_file = "nematode biology abstracts.txt"

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp
import numpy as np
import re
from collections import Counter


def load_raw(raw_data_file):
    """Read the file at ``raw_data_file`` and return its full contents.

    The file is opened in text read mode and closed again before returning.
    """
    with open(raw_data_file, "r") as source:
        return source.read()
    

def vectorize(docs, *args, **qsargs):
    vectorizer = CountVectorizer(stop_words='english', *args, **qsargs)
    data = vectorizer.fit_transform(docs)
    col_freq_filter = np.asarray(data.sum(axis=0) > 9)[0]
    print "total words", data[:, col_freq_filter].sum()
    vocabulary = [t for _, t in sorted([(v, k) for k, v in vectorizer.vocabulary_.iteritems()])]
    vectorized_docs = [[] for _ in docs]
    for row, col, count in zip(*sp.sparse.find(data)):
        if not col_freq_filter[col]:
            continue
        for word in range(count):
            vectorized_docs[row].append(vocabulary[col])
    return vectorized_docs


def preprocess(raw_abstracts):
    """Extract ``{citation: abstract text}`` from the raw abstracts dump.

    A record is everything between ``"Citation: "`` and the next ``"----"``.
    The first line of the record is used as the dictionary key.  The value is
    the text following the first occurrence of ``"Abstract"`` in the record,
    with its first two characters dropped, newlines turned into spaces, runs
    of spaces collapsed to one, and surrounding whitespace stripped.

    If two records share the same citation line, the later one wins.
    Raises ``IndexError`` when a record contains no ``"Abstract"`` marker.
    """
    record_pattern = re.compile("Citation: (.+?)----", re.DOTALL)
    abstract_pattern = re.compile("Abstract(.+)", re.DOTALL)

    abstracts_by_citation = {}
    for found in record_pattern.finditer(raw_abstracts):
        record = found.group(1)
        citation = record.split("\n")[0]

        # Skip the two characters that directly follow the "Abstract" marker.
        text = abstract_pattern.findall(record)[0][2:]
        text = text.replace("\n", " ")
        text = re.sub("[ ]{2,}", " ", text)
        abstracts_by_citation[citation] = text.strip()
    return abstracts_by_citation

processed_abstracts = preprocess(load_raw(raw_data_file))
print "number of abstracts", len(processed_abstracts)
docs = vectorize(processed_abstracts.values())

with open(processed_data_file, "w") as f:
    f.write('\n'.join([' '.join(doc) for doc in docs]))
    
vocab = set(list(itertools.chain.from_iterable(docs)))
word_to_num = { word: i for i, word in enumerate(vocab)}
num_to_word = { i: word for word, i in word_to_num.iteritems()}
docs = [[word_to_num[word] for word in doc] for doc in docs]

number of abstracts 5828
total words 488903


In [18]:
# Size the model from the corpus: number of documents and number of distinct
# word ids.
N, V = len(docs), len(word_to_num)
defn = model_definition(N, V)
prng = rng()
latent = initialize(defn, docs, prng)
# The runner must receive the same corpus the latent state was initialized
# from. Passing `data` here would hand it the 10-document toy dataset left
# over from the first cell, which does not match `defn` or `latent`.
r = runner.runner(defn, docs, latent)

In [None]:
# Call the runner ten times with a count of 1 (presumably one sampler
# iteration per call -- TODO confirm) so that perplexity can be printed after
# every call; `i` is only a loop counter.
for i in range(10):
    r.run(prng, 1)
    print "perplexity:", latent.perplexity()

perplexity: 1265.18644827
perplexity: