Latent Dirichlet Allocation - Gibbs Sampling
====

Based on the paper "Latent Dirichlet Allocation" by David M. Blei, Andrew Y. Ng, and Michael I. Jordan

In [82]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# Word-level tokenizer: every run of alphanumeric/underscore characters is a token.
tokenizer = RegexpTokenizer(r'\w+')

# English stop-word list used to filter the tokens.
en_stop = get_stop_words('en')

# Porter stemmer applied to each remaining token.
p_stemmer = PorterStemmer()

# Sample documents.
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# All sample documents, in order.
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# texts[m] is the list of stemmed, stop-word-free tokens of document m.
texts = []
for document in doc_set:
    # lower-case, then split into word tokens
    words = tokenizer.tokenize(document.lower())

    # drop stop words
    kept_words = [word for word in words if word not in en_stop]

    # reduce each remaining word to its stem and store the result
    texts.append([p_stemmer.stem(word) for word in kept_words])

# id <-> term mapping built from the tokenized documents
dictionary = corpora.Dictionary(texts)

# bag-of-words representation: one list of (token id, count) pairs per document
corpus = [dictionary.doc2bow(text) for text in texts]

In [83]:
import numpy as np
import numpy.linalg as la

In [84]:
# Display the bag-of-words corpus: one list of (token id, count) pairs per document.
corpus

[[(0, 2), (1, 1), (2, 2), (3, 2), (4, 1), (5, 1)],
 [(1, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(11, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(1, 1),
  (5, 1),
  (11, 1),
  (18, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(2, 1), (3, 1), (17, 2), (30, 1), (31, 1)]]

## Parameters

document:    $m = 1,...,M$

topic assigned to word:       $z = 1,...,K$

word:        $w = 1,...,N_V$

vocabulary : $v = 1,...,V$

Z: topic assigned to word w

$\theta: M \times K$ 

$\varphi: K \times V$ 

$Multinomial(\theta)$: distribution over topics for a given document

$Multinomial(\varphi)$: distribution over words for a given topic

According to the posteriors of $\theta$ and $\varphi$, their estimates depend only on the topic assignments Z. Therefore, we can focus on inferring the variable Z alone; the other variables can be computed directly from Z.

$Z_{mw}$: topic of word w in document m

$n_{mz}$: word count of document m with topic z

$n_{zw}$: count of word w with topic z

$n_{z}$: word count with topic z


In [85]:
# n_m[m]: total word count of document m.
# Summing the (token id, count) pairs column-wise yields
# [sum of ids, sum of counts]; entry 1 is the word count.
n_m = np.array([np.sum(bow, axis=0)[1] for bow in corpus])

In [86]:
# Confirm that n_m has been converted from a Python list to a NumPy array.
type(n_m)

numpy.ndarray

In [87]:
# corpus : bag-of-words corpus (one list of (word id, count) pairs per document)
# K : number of topics
# V : vocabulary size
# max_iter : number of Gibbs sampling sweeps
# alpha : Dirichlet smoothing added to the document-topic counts
# beta : Dirichlet smoothing added to the topic-word counts

K = 3
# derive the vocabulary size from the dictionary instead of hard-coding it,
# so the count matrices stay correct if the documents change
V = len(dictionary)
max_iter = 50

alpha = 0.5
beta = 0.5

# fix the random seed so the sampler gives the same result on every run
SEED = 0
np.random.seed(SEED)

# topic of words in documents
z_mw = []
# word count of document m with topic z
n_mz = np.zeros((len(corpus), K))
# count of word w with topic z
n_zw = np.zeros((K, V))
# word count of each topic
n_z = np.zeros(K)

In [88]:
# Random initialisation of the sampler state.
# m: index of document
# w: word id (first element of each bag-of-words pair)
# t: number of times word w occurs in document m
# z: index of topic, drawn uniformly from 0..K-1

# Start from a clean state so that re-running this cell does not append
# duplicate assignments or double-count into the count arrays.
z_mw = []
n_mz.fill(0)
n_zw.fill(0)
n_z.fill(0)

for m, doc in enumerate(corpus):
    z_n = []
    for w, t in doc:
        z = np.random.randint(0, K)
        z_n.append(z)
        # all t occurrences of word w in this document share the topic z
        n_mz[m, z] += t
        n_zw[z, w] += t
        n_z[z] += t
    # one topic per bag-of-words pair, in the same order as the pairs of doc
    z_mw.append(np.array(z_n))

In [89]:
# Print (position, word id, count) for every bag-of-words pair of the last
# document. Reading corpus[-1] directly avoids depending on a loop variable
# left over from an earlier cell.
for n, (w, t) in enumerate(corpus[-1]):
    print(n,',',w,',',t,'\n')

0 , 2 , 1 

1 , 3 , 1 

2 , 17 , 2 

3 , 30 , 1 

4 , 31 , 1 



In [90]:
# Display the current topic assignments: one array per document,
# one topic index per bag-of-words pair.
z_mw

[array([1, 0, 2, 1, 1, 0]),
 array([2, 2, 0, 2, 1, 2, 1, 1, 0]),
 array([2, 1, 2, 2, 2, 0, 0, 2, 0, 2]),
 array([2, 1, 1, 0, 2, 0, 2, 0, 2, 0, 0, 2]),
 array([2, 2, 1, 0, 0])]

In [91]:
# Print the topic-word count matrix (rows: topics, columns: word ids).
print(n_zw)

[[ 0.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.
   2.  0.  1.  0.  0.  1.  0.  1.  0.  1.  1.  0.  1.  1.]
 [ 2.  0.  0.  2.  1.  1.  0.  0.  1.  0.  1.  2.  0.  1.  0.  0.  0.  2.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  2.  3.  1.  0.  1.  0.  1.  0.  1.  0.  1.  0.  0.  1.  1.  1.  0.
   0.  1.  0.  1.  1.  0.  1.  0.  1.  0.  0.  1.  0.  0.]]


In [92]:
# Display the total word count currently assigned to each topic.
n_z

array([ 14.,  13.,  19.])

In [93]:
 # One Gibbs sweep over every (word id, count) pair of every document.
 for doc_idx, bow in enumerate(corpus):
     for pos, (word_id, count) in enumerate(bow):
         # take the current assignment of this pair out of all counts
         old_topic = z_mw[doc_idx][pos]
         n_z[old_topic] -= count
         n_zw[old_topic, word_id] -= count
         n_mz[doc_idx, old_topic] -= count
         n_m[doc_idx] -= count

         # unnormalised topic weights for all K topics at once:
         # (smoothed topic-word share) * (smoothed document-topic share)
         word_share = (n_zw[:, word_id] + beta) / (n_z + V * beta)
         doc_share = (n_mz[doc_idx] + alpha) / (n_m[doc_idx] + K * alpha)
         p_z = word_share * doc_share

         # draw the new topic from the normalised weights
         new_z = np.random.multinomial(1, p_z / p_z.sum()).argmax()

         # record the new assignment and add the pair back to the counts
         z_mw[doc_idx][pos] = new_z
         n_m[doc_idx] += count
         n_mz[doc_idx, new_z] += count
         n_zw[new_z, word_id] += count
         n_z[new_z] += count

In [94]:
# Recompute the per-document word counts from the corpus.
# Column 1 of each bag-of-words pair is the count, so the column-wise
# sum at index 1 is the document length.
doc_lengths = [np.sum(corpus[doc_idx], axis=0)[1] for doc_idx in range(len(corpus))]
n_m = np.array(doc_lengths)
    

In [95]:
# Gibbs sampling
# Run max_iter full sweeps. In each sweep every (word id, count) pair of
# every document is removed from the counts, a new topic is drawn for it
# from its conditional distribution, and the pair is added back.

for _ in range(max_iter):
    for m, doc in enumerate(corpus):
        for n, (w, t) in enumerate(doc):
            # remove the current assignment of this pair from all counts
            z = z_mw[m][n]
            n_mz[m, z] -= t
            n_m[m] -= t
            n_zw[z, w] -= t
            n_z[z] -= t

            # sample new topic: weights for all K topics at once,
            # (smoothed topic-word share) * (smoothed document-topic share)
            p_z = ((n_zw[:, w] + beta) / (n_z + V * beta)) * ((n_mz[m] + alpha) / (n_m[m] + K * alpha))
            new_z = np.random.multinomial(1, p_z / p_z.sum()).argmax()

            # record the new assignment and restore the counts
            z_mw[m][n] = new_z
            n_mz[m, new_z] += t
            n_zw[new_z, w] += t
            n_z[new_z] += t
            n_m[m] += t

In [108]:
# Point estimates computed from the final counts.
# The smoothing constants must match the ones used in the sampler:
# beta smooths the topic-word counts, alpha the document-topic counts.

# update phi: K x V, row z is the word distribution of topic z
phi = (n_zw + beta)/(n_z[:, None] + V * beta)

# update theta: M x K, row m is the topic distribution of document m
theta = (n_mz + alpha)/(n_m[:, None] + K * alpha)

In [114]:
# Sanity check: the estimated topic proportions of document 2 should sum to 1.
sum(theta[2])

1.0