This is a tutorial to learn how LDA (latent Dirichlet allocation) works under the hood, based on the tutorial by Andrew Brooks: http://brooksandrew.github.io/simpleblog/articles/latent-dirichlet-allocation-under-the-hood/

In [None]:
## PARAMETERS
# Number of topics.
K = 2
# Hyperparameter: a single value indicates a symmetric Dirichlet prior;
# higher => scatters document clusters.
alpha = 1
# Hyperparameter.
eta = 0.001
# Iterations for collapsed Gibbs sampling. This should be a lot higher than 3 in practice.
iterations = 3

In [None]:
import numpy as np
import operator
from functools import reduce

Generate a corpus

In [None]:
# Toy corpus: each string is one document.
rawdocs = [
    "eat turkey on turkey day holiday",
    "i like to eat cake on holiday",
    "turkey trot race on thanksgiving holiday",
    "snail race the turtle",
    "time travel space race",
    "movie on thanksgiving",
    "movie at air and space museum is cool movie",
    "aspiring movie star",
]

Convert raw docs into bag of words docs

In [None]:
# Tokenise every raw document on single spaces: one list of words per document.
docs_bow = list(map(lambda text: text.split(' '), rawdocs))
print(docs_bow)

Construct the vocabulary by collecting the unique words across all documents

In [None]:

# Sorted list of the distinct words across all documents; a word's position in
# this list is its integer id. Flattening into a set avoids the repeated list
# copying of reduce(operator.concat, ...) and also works for an empty corpus.
vocabulary = sorted({word for doc in docs_bow for word in doc})
print(vocabulary)

Replace the words in documents with the index from the vocabulary

In [None]:
# Build the word -> vocabulary-index lookup once, so encoding each token is a
# dict lookup instead of a linear vocabulary.index() scan.
word_to_index = {word: index for index, word in enumerate(vocabulary)}

# docs[d][w] is the vocabulary index of the w-th token of document d.
docs = [[word_to_index[word] for word in doc] for doc in docs_bow]
print(docs)

Randomly assign an initial topic to each word in each document, and count how often each vocabulary word is assigned to each topic

In [None]:
import pprint
import random

# topic_word_count[t][v]: how many tokens of vocabulary word v are currently
# assigned to topic t (0-based topic index). Starts at all zeros.
topic_word_count = [[0] * len(vocabulary) for _ in range(K)]

# dword_topic_assign[d][w]: 1-based topic of the w-th token of document d.
# The initial values are placeholders; every entry is overwritten below.
dword_topic_assign = [list(range(len(doc))) for doc in docs]

for doc_pos, doc in enumerate(docs):
    for token_pos, word_index in enumerate(doc):
        # Randomly assign a topic (1..K) to this token of the document.
        topic = random.randint(1, K)
        dword_topic_assign[doc_pos][token_pos] = topic
        # Count tables are 0-based, hence the shift by one.
        topic_word_count[topic - 1][word_index] += 1




In [None]:
# doc_topic_count[d][t]: number of tokens in document d assigned to topic t+1
# (topics are stored 1-based in dword_topic_assign, the table is 0-based).
doc_topic_count = [
    [assignments.count(topic) for topic in range(1, K + 1)]
    for assignments in dword_topic_assign
]

print(doc_topic_count)

In [None]:
# Collapsed Gibbs sampling: sweep over every token `iterations` times, each time
# resampling its topic conditioned on all other current assignments.
for i in range(iterations):
    for d in range(len(docs)):
        for w in range(len(docs[d])):
            current_topic_assignment = dword_topic_assign[d][w]
            tid = current_topic_assignment - 1  # topics are stored 1-based; count tables are 0-based
            wid = docs[d][w]

            # Take this token out of both count tables so that it does not
            # influence its own resampling.
            doc_topic_count[d][tid] -= 1
            topic_word_count[tid][wid] -= 1

            # Remaining tokens in this document (current one excluded) plus smoothing.
            denom_a_doc_word_count_based = sum(doc_topic_count[d]) + K * alpha
            # Tokens assigned to each topic across the corpus plus smoothing.
            denom_b_per_topic_word_count_based = [sum(t) + len(vocabulary) * eta for t in topic_word_count]

            # Per-topic count of the current word; convert the table once per token.
            word_count_per_topic = np.array(topic_word_count)[:, wid]
            # Smoothed share of each topic's tokens that are this word.
            p_word_given_topic = (word_count_per_topic + eta) / denom_b_per_topic_word_count_based
            # Smoothed share of this document's tokens assigned to each topic.
            p_topic_given_doc = (np.array(doc_topic_count[d]) + alpha) / denom_a_doc_word_count_based

            print(word_count_per_topic)
            print(p_word_given_topic)
            print(p_topic_given_doc)

            # Unnormalised, then normalised, probability of each topic for this token.
            p_z = p_word_given_topic * p_topic_given_doc
            p_z_normalised = p_z / sum(p_z)

            print(p_z)
            print(p_z_normalised)

            # The probabilities must be passed as the keyword `p`: the third
            # positional parameter of np.random.choice is `replace`, so passing
            # them positionally silently draws topics uniformly.
            new_topic_assignment = int(np.random.choice(range(1, K + 1), p=p_z_normalised))

            # Assign the new topic and put the token back into both count tables.
            ntid = new_topic_assignment - 1
            dword_topic_assign[d][w] = new_topic_assignment

            doc_topic_count[d][ntid] += 1
            topic_word_count[ntid][wid] += 1

            if current_topic_assignment != new_topic_assignment:
                print("doc: {}  token: {}  topic from {} to topic {}".format(d, vocabulary[wid], current_topic_assignment, new_topic_assignment )) # examine when topic assignments change
 
    
    