This is a tutorial for learning how LDA (Latent Dirichlet Allocation) works under the hood, based on the tutorial by Andrew Brooks: http://brooksandrew.github.io/simpleblog/articles/latent-dirichlet-allocation-under-the-hood/

In [2]:
## PARAMETERS
# Number of topics.
K = 2

# Hyperparameter. A single value indicates a symmetric Dirichlet prior;
# higher values scatter document clusters.
alpha = 1

# Hyperparameter.
eta = 0.001

# Iterations for collapsed Gibbs sampling.
# This should be a lot higher than 3 in practice.
iterations = 3

In [4]:
import numpy as np
import operator
from functools import reduce

Generate a corpus

In [1]:
# Toy corpus: each string is one document, with words separated by single spaces.
rawdocs = [
    'eat turkey on turkey day holiday',
    'i like to eat cake on holiday',
    'turkey trot race on thanksgiving holiday',
    'snail race the turtle',
    'time travel space race',
    'movie on thanksgiving',
    'movie at air and space museum is cool movie',
    'aspiring movie star',
]

Convert raw docs into bag of words docs

In [5]:
# Split every raw document on single spaces, producing one list of word
# strings per document (word order and repeats are kept).
docs_bow = []
for raw_text in rawdocs:
    docs_bow.append(raw_text.split(' '))
print(docs_bow)

[['eat', 'turkey', 'on', 'turkey', 'day', 'holiday'], ['i', 'like', 'to', 'eat', 'cake', 'on', 'holiday'], ['turkey', 'trot', 'race', 'on', 'thanksgiving', 'holiday'], ['snail', 'race', 'the', 'turtle'], ['time', 'travel', 'space', 'race'], ['movie', 'on', 'thanksgiving'], ['movie', 'at', 'air', 'and', 'space', 'museum', 'is', 'cool', 'movie'], ['aspiring', 'movie', 'star']]


Construct the vocabulary by collecting the unique words across all documents

In [6]:

# Sorted list of the distinct words that occur anywhere in the corpus; a word's
# position in this list serves as its integer id.
# A set comprehension gathers the words in a single pass: it avoids the repeated
# list copying of reduce(operator.concat, ...) and yields [] for an empty corpus,
# where reduce() without an initial value would raise TypeError.
vocabulary = sorted({word for doc in docs_bow for word in doc})
print(vocabulary)

['air', 'and', 'aspiring', 'at', 'cake', 'cool', 'day', 'eat', 'holiday', 'i', 'is', 'like', 'movie', 'museum', 'on', 'race', 'snail', 'space', 'star', 'thanksgiving', 'the', 'time', 'to', 'travel', 'trot', 'turkey', 'turtle']


Replace each word in the documents with its index in the vocabulary

In [7]:
# Build the word -> vocabulary-index lookup once so that encoding each token is
# a constant-time dict access rather than a linear vocabulary.index() scan.
# (vocabulary holds unique words, so each word maps to exactly one index.)
word_to_index = {word: index for index, word in enumerate(vocabulary)}

# docs[d][i] is the vocabulary index of the i-th word of document d.
docs = [[word_to_index[w] for w in d] for d in docs_bow]
print(docs)

[[7, 25, 14, 25, 6, 8], [9, 11, 22, 7, 4, 14, 8], [25, 24, 15, 14, 19, 8], [16, 15, 20, 26], [21, 23, 17, 15], [12, 14, 19], [12, 3, 0, 1, 17, 13, 10, 5, 12], [2, 12, 18]]


In [32]:
import pprint
import random

# Fix the RNG seed so the random initialisation below gives the same
# assignments on every Restart & Run All. The value is arbitrary; change or
# remove it to explore other starting points.
random.seed(0)

# tw[t][v]: how many times vocabulary word v is currently assigned to topic t
# (K rows, one column per vocabulary word, plain Python ints).
tw = [[0] * len(vocabulary) for _ in range(K)]
# dt[d][w]: topic id (1..K) assigned to the w-th word of document d.
# Zero is only a placeholder; every slot is overwritten in the loop below.
dt = [[0] * len(doc) for doc in docs]

for d in range(len(docs)):
    for w in range(len(docs[d])):
        # Randomly assign a topic to this word; randint is inclusive on both ends.
        dt[d][w] = random.randint(1, K)
        ti = dt[d][w] - 1  # topic index (0-based row of tw)
        wi = docs[d][w]  # word index (column of tw)
        tw[ti][wi] += 1

pprint.pprint(np.array(tw))
pprint.pprint(dt)


array([[0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 3, 1, 2, 2, 1, 0, 1, 1, 1, 0,
        0, 0, 0, 2, 1],
       [1, 1, 0, 1, 1, 0, 0, 1, 2, 1, 0, 1, 1, 0, 2, 1, 0, 2, 0, 1, 0, 1,
        1, 1, 1, 1, 0]])
[[2, 2, 2, 1, 1, 2],
 [2, 2, 2, 1, 2, 1, 2],
 [1, 2, 1, 2, 2, 1],
 [1, 1, 1, 1],
 [2, 2, 2, 2],
 [1, 1, 1],
 [1, 2, 2, 2, 2, 1, 1, 1, 1],
 [1, 2, 1]]
