Latent Dirichlet Allocation - Gibbs Sampling
====

Based on the paper "Latent Dirchlet Allocation" by David M. Blei, Andrew Y. Ng, Michael I. Jordan

In [2]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [11]:
corpus

[[(0, 1), (1, 1), (2, 2), (3, 2), (4, 2), (5, 1)],
 [(0, 1), (1, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(0, 1),
  (1, 1),
  (12, 1),
  (18, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(2, 1), (4, 1), (19, 2), (30, 1), (31, 1)]]

In [8]:
import numpy as np
import numpy.linalg as la

## Parameters

document:    $m = 1,...,M$

topic asigned to word:       $z = 1,...,K$

word:        $w = 1,...,N_V$

vocabulary : $v = 1,...,V$

Z: topic assigned to word w

$\theta: K \times N$ 

$\varphi: M \times K$ 

$Multinomial(\theta)$: distribution over words for a given topic

$Multinomial(\varphi)$: distribution over topics for a given document

According to posterior of $\theta$ and $\varphi$, their etimates only depend on the topic assignments Z. Therefore, we are able to only focus on inferring the variable Z and the other variables can be computed directly from Z.

$Z_{mw}$: topic of word w in document m

$n_{mz}$: word count of document m with topic z

$n_{zw}$: count of word w with topic z

$n_{z}$: word count with topic z


In [37]:
#corpus : corpus contains bag-of-words
#K : number of topics
#V : vocaburary size

K = 3
V = 32

alpha = 0.5
beta = 0.5

#topic of words in documents
z_mw = []
#word count of document m with topic z
n_mz = np.zeros((len(corpus), K))
#count of word w with topic z
n_zw = np.zeros((K, V))
#word count of each topic
n_z = np.zeros(K)

In [38]:
for m, doc in enumerate(corpus):
    z_n = []
    for w in doc:
        z = np.random.randint(0, K)
        z_n.append(z)
        n_mz[m, z] += 1
        n_zw[z, w] += 1
        n_z[z] += 1
    z_mw.append(np.array(z_n))

In [39]:
z_mw

[array([2, 0, 0, 2, 2, 1]),
 array([1, 1, 2, 2, 2, 0, 1, 0, 0]),
 array([2, 0, 0, 1, 1, 0, 2, 1, 2, 2]),
 array([2, 1, 0, 2, 1, 2, 0, 1, 2, 2, 1, 2]),
 array([1, 1, 0, 0, 0])]

In [61]:
for m, doc in enumerate(corpus):
    for n, w in enumerate(doc):
        z = z_mw[m][n]
        n_mz[m, z] -= 1
        n_zw[z, w] -= 1
        n_z[z] -= 1

    #sample new topic
    p_z = (n_zw[:, w] + beta) * (n_mz[m] + alpha)/(n_z + V * beta)
    new_z = np.random.multinomial(1, p_z/p_z.sum()).argmax()

    z_mw[m][w] = new_z
    n_mz[m, new_z] += 1
    n_zw[new_z, w] += 1
    n_z[new_z] += 1

ValueError: operands could not be broadcast together with shapes (3,2) (5,3) 

In [60]:
for m, doc in enumerate(corpus):
    for n, w in enumerate(doc):
        z = z_mw[m][n]
        n_mz[m, z] -= 1
        n_zw[z, w] -= 1
        n_z[z] -= 1
        print((n_mz[m] + alpha))

[-11.5  -2.5 -11.5]
[-12.5  -2.5 -11.5]
[-13.5  -2.5 -11.5]
[-13.5  -2.5 -12.5]
[-13.5  -2.5 -13.5]
[-13.5  -3.5 -13.5]
[-2.5 -3.5 -2.5]
[-2.5 -4.5 -2.5]
[-2.5 -4.5 -3.5]
[-2.5 -4.5 -4.5]
[-2.5 -4.5 -5.5]
[-3.5 -4.5 -5.5]
[-3.5 -5.5 -5.5]
[-4.5 -5.5 -5.5]
[-5.5 -5.5 -5.5]
[-2.5 -2.5 -4.5]
[-3.5 -2.5 -4.5]
[-4.5 -2.5 -4.5]
[-4.5 -3.5 -4.5]
[-4.5 -4.5 -4.5]
[-5.5 -4.5 -4.5]
[-5.5 -4.5 -5.5]
[-5.5 -5.5 -5.5]
[-5.5 -5.5 -6.5]
[-5.5 -5.5 -7.5]
[-1.5 -3.5 -6.5]
[-1.5 -4.5 -6.5]
[-2.5 -4.5 -6.5]
[-2.5 -4.5 -7.5]
[-2.5 -5.5 -7.5]
[-2.5 -5.5 -8.5]
[-3.5 -5.5 -8.5]
[-3.5 -6.5 -8.5]
[-3.5 -6.5 -9.5]
[ -3.5  -6.5 -10.5]
[ -3.5  -7.5 -10.5]
[ -3.5  -7.5 -11.5]
[-2.5 -2.5  0.5]
[-2.5 -3.5  0.5]
[-3.5 -3.5  0.5]
[-4.5 -3.5  0.5]
[-5.5 -3.5  0.5]
