Latent Dirichlet Allocation - Gibbs Sampling
====

Based on the paper "Latent Dirichlet Allocation" by David M. Blei, Andrew Y. Ng, Michael I. Jordan

In [56]:
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')
# set copy of the stop words: membership is tested once per token below,
# and a set lookup avoids scanning the whole list every time
en_stop_set = set(en_stop)

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list; the document and the per-token variables use
# distinct names so the comprehensions can never clobber the loop variable
for document in doc_set:

    # clean and tokenize document string
    raw = document.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in en_stop_set]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
# (one list of (token id, count) pairs per document)
corpus0 = [dictionary.doc2bow(text) for text in texts]

In [57]:
# Inspect the token -> integer id mapping held by the dictionary
print(dictionary.token2id)

{'lot': 6, 'basebal': 9, 'mother': 2, 'suggest': 15, 'health': 16, 'feel': 26, 'pressur': 17, 'perform': 27, 'better': 28, 'around': 10, 'brother': 0, 'often': 25, 'brocolli': 3, 'like': 5, 'well': 22, 'practic': 8, 'eat': 1, 'school': 29, 'blood': 14, 'good': 4, 'say': 30, 'tension': 19, 'increas': 20, 'caus': 21, 'spend': 7, 'expert': 13, 'seem': 24, 'may': 18, 'never': 23, 'profession': 31, 'time': 11, 'drive': 12}


In [40]:
import numpy as np
import numpy.linalg as la

## Parameters

document:    $m = 1,...,M$

topic assigned to word:       $z = 1,...,K$

word:        $w = 1,...,N_V$

vocabulary : $v = 1,...,V$

Z: topic assigned to word w

$\theta: M \times K$ 

$\beta: K \times V$ 

$Multinomial(\theta_m)$: distribution over topics for a given document m

$Multinomial(\beta_z)$: distribution over words for a given topic z

According to the posteriors of $\theta$ and $\beta$, their estimates depend only on the topic assignments Z. Therefore, we only need to infer the variable Z; the other variables can be computed directly from Z.

$Z_{mw}$: the topic of word w in document m

$n_{mz}$: the number of words from document m assigned to topic z

$n_{zw}$: the number of times vocabulary word w is assigned to topic z

$n_{z}$: the total number of words assigned to topic z

$n_{m}$: the total number of words in document m


In [41]:
def words_count_doc(corpus):
    """
    Count the total number of words in each document in corpus.

    Parameters
    ----------
    corpus : a list-like, contains bag-of-words of each document;
             every document is a sequence of (word_id, count) pairs

    Returns
    -------
    n_m : a np.array, shape(M)
         the total number of words in each document
         (0 for a document that contains no pairs)
    """
    # Add up only the count entry of every (word_id, count) pair. Summing the
    # pairs one by one also covers a document with no pairs, where
    # np.sum(doc, axis=0) collapses to a scalar that cannot be indexed with [1].
    return np.array([sum(count for _, count in doc) for doc in corpus])


In [42]:
def empty_parameters(corpus, K, V):
    """
    Build the zero-valued count containers used by the sampler.

    Parameters:
    -----------
    corpus : a list-like of documents; only its length is used here
    K : int, the number of topics
    V : int, the size of the vocabulary

    Returns:
    --------
    z_mw : empty list, placeholder for the per-document topic assignments
    n_mz : np.array of zeros, shape (len(corpus), K)
    n_zw : np.array of zeros, shape (K, V)
    n_z : np.array of zeros, shape (K,)
    """
    n_docs = len(corpus)
    doc_topic_counts = np.zeros(shape=(n_docs, K))
    topic_word_counts = np.zeros(shape=(K, V))
    topic_totals = np.zeros(shape=K)
    assignments = list()
    return assignments, doc_topic_counts, topic_word_counts, topic_totals

In [43]:
def initial_parameters(corpus, K, V):
    """
    Give every bag-of-words entry a random starting topic and tally the counts.

    Parameters:
    -----------
    corpus: a list-like, contains bag-of-words of each document as
            (word_id, count) pairs
    K : int, the number of topics
    V : int, the size of the vocabulary

    Returns:
    --------
    z_mw : list with one np.array per document holding the topic drawn for
           each (word_id, count) entry of that document
    n_mz : np.array, shape (len(corpus), K), count mass of document m on topic z
    n_zw : np.array, shape (K, V), count mass of word w on topic z
    n_z : np.array, shape (K,), total count mass on topic z

    """
    # Only the zeroed count arrays are taken from the helper; the assignment
    # list is built from scratch below.
    _, n_mz, n_zw, n_z = empty_parameters(corpus, K, V)
    z_mw = []
    for doc_idx, bow in enumerate(corpus):
        doc_topics = []
        for word_id, count in bow:
            # one topic per (word_id, count) entry: all `count` occurrences of
            # the word in this document share the drawn topic
            topic = np.random.randint(0, K)
            doc_topics.append(topic)
            n_mz[doc_idx, topic] += count
            n_zw[topic, word_id] += count
            n_z[topic] += count
        z_mw.append(np.array(doc_topics))
    return z_mw, n_mz, n_zw, n_z

In [44]:
def sample_topic(K, n_zw, n_z, n_mz, n_m, alpha, phi, w, m):
    """
    Sample new topic for current word

    Parameters:
    -----------
    K : int, the number of topics
    n_zw : np.array, shape (K, V), topic-word counts
    n_z : np.array, shape (K,), total counts per topic
    n_mz : np.array, shape (M, K), document-topic counts
    n_m : np.array, shape (M,), total counts per document
    alpha : float, smoothing added to the document-topic counts
    phi : float, smoothing added to the topic-word counts
    w : int, vocabulary id of the current word
    m : int, index of the current document

    Returns:
    --------
    new_z : index of the sampled topic, in range(K)
    """
    # The vocabulary size is read off the count matrix instead of a
    # notebook-level V, so the result does not depend on hidden global state.
    V = n_zw.shape[1]
    # Per-topic weight of word w, and per-topic weight inside document m,
    # computed for all K topics at once.
    word_term = (n_zw[:, w] + phi) / (n_z + V * phi)
    doc_term = (n_mz[m] + alpha) / (n_m[m] + K * alpha)
    p_z = word_term * doc_term
    # Normalise to a probability vector and draw a single topic from it.
    new_z = np.random.multinomial(1, p_z / p_z.sum()).argmax()
    return new_z

In [45]:
def update_beta(V, n_zw, n_z, alpha, beta_gibbs):
    """
    Compute the smoothed topic-word estimate from the current counts.

    Every entry of n_zw has `alpha` added to it and is divided by the
    matching entry of n_z plus V * alpha. `beta_gibbs` is accepted but
    not used.

    Returns an array with the same shape as n_zw.
    """
    smoothed_counts = n_zw + alpha
    # n_z becomes a column so that each row of n_zw is divided by its own total
    row_totals = n_z[:, None] + V * alpha
    return smoothed_counts / row_totals

In [46]:
def update_theta(K, n_mz, n_m, phi, theta_gibbs):
    """
    Compute the smoothed document-topic estimate from the current counts.

    Every entry of n_mz has `phi` added to it and is divided by the
    matching entry of n_m plus K * phi. `theta_gibbs` is accepted but
    not used.

    Returns an array with the same shape as n_mz.
    """
    smoothed_counts = n_mz + phi
    # n_m becomes a column so that each row of n_mz is divided by its own total
    row_totals = n_m[:, None] + K * phi
    return smoothed_counts / row_totals

In [47]:
def gibbs_sampling(corpus, max_iter, K, V, n_zw, n_z, n_mz, n_m, alpha, phi, z_mw=None):
    """
    Run max_iter Gibbs sampling sweeps over the corpus.

    For every (word_id, count) entry of every document the entry is removed
    from the counts, a new topic is drawn with sample_topic, and the entry is
    added back under the new topic. After each full sweep the current beta
    and theta estimates are recorded.

    The count arrays (n_zw, n_z, n_mz, n_m) and z_mw are modified in place.

    Parameters:
    -----------
    corpus : a list-like, contains bag-of-words of each document as
             (word_id, count) pairs
    max_iter : int, the number of sweeps over the corpus
    K : int, the number of topics
    V : int, the size of the vocabulary
    n_zw : np.array, shape (K, V), topic-word counts
    n_z : np.array, shape (K,), total counts per topic
    n_mz : np.array, shape (M, K), document-topic counts
    n_m : np.array, shape (M,), total counts per document
    alpha : float, smoothing on the document-topic counts
    phi : float, smoothing on the topic-word counts
    z_mw : optional, list of per-document arrays with the current topic of
           each bag-of-words entry. When omitted, the notebook-level `z_mw`
           is used so that existing calls keep working.

    Returns:
    --------
    beta_gibbs : list of length max_iter, one topic-word estimate per sweep
    theta_gibbs : list of length max_iter, one document-topic estimate per sweep
    """
    if z_mw is None:
        # Backward compatibility: earlier calls did not pass the assignments
        # and relied on a notebook-level variable of the same name.
        if "z_mw" not in globals():
            raise NameError("z_mw was not passed and no notebook-level z_mw exists; "
                            "run initial_parameters first")
        z_mw = globals()["z_mw"]

    beta_gibbs = []
    theta_gibbs = []

    for _ in range(max_iter):
        for m, doc in enumerate(corpus):
            for n, (w, t) in enumerate(doc):
                #exclude the current word
                z = z_mw[m][n]
                n_mz[m, z] -= t
                n_m[m] -= t
                n_zw[z, w] -= t
                n_z[z] -= t

                new_z = sample_topic(K, n_zw, n_z, n_mz, n_m, alpha, phi, w, m)

                #include the current word
                z_mw[m][n] = new_z
                n_mz[m, new_z] += t
                n_zw[new_z, w] += t
                n_z[new_z] += t
                n_m[m] += t

        # sample_topic smooths the topic-word counts with phi and the
        # document-topic counts with alpha, so the estimates must use the
        # same pairing: phi for beta, alpha for theta.
        #update beta
        beta_gibbs.append(update_beta(V, n_zw, n_z, phi, beta_gibbs))
        #update theta
        theta_gibbs.append(update_theta(K, n_mz, n_m, alpha, theta_gibbs))
    return beta_gibbs, theta_gibbs

In [58]:
#corpus : corpus contains bag-of-words
#K : number of topics
#V : vocabulary size

K = 3
# vocabulary size is taken from the dictionary built for the sample documents
# (the printed token2id mapping has 32 entries) rather than typed in by hand,
# so it stays correct if the sample documents change
V = len(dictionary)
alpha = 0.5
phi = 0.5
max_iter = 500

#initialize parameters
n_m = words_count_doc(corpus0)
z_mw, n_mz, n_zw, n_z = initial_parameters(corpus0, K, V)

In [59]:
# Run the sampler on the sample-document corpus; beta0 / theta0 are lists with
# one estimate appended per iteration. This call relies on the notebook-level
# z_mw created by the initialisation cell, so that cell must be run first.
beta0, theta0 = gibbs_sampling(corpus0, max_iter, K, V, n_zw, n_z, n_mz, n_m, alpha, phi)

### Randomly generated test data

In [50]:
# Settings for the randomly generated test corpus
M = 200   # number of documents
k = 10    # number of topics
V = 100   # number of distinct words in the vocabulary

# Seed NumPy's global generator, then draw the length of each document:
# N[m] is an integer in [50, 100)
np.random.seed(123)
N = np.random.randint(low=50, high=100, size=M)

In [51]:
# Dirichlet concentration of the per-document topic proportions, one entry per topic
alpha_gen = np.array((1,1,10,1,1,7,1,20,1,1))
assert alpha_gen.shape == (k,), "alpha_gen needs one concentration value per topic"
# Topic-word distributions, shape (V, k): column z is the word distribution of topic z
beta_gen = np.random.dirichlet(0.1*np.ones(V),k).T

w_struct = []

# index ranges reused by every draw below
topic_ids = np.arange(k)
word_ids = np.arange(V)

for m in range(M):
    # topic proportions of document m
    theta = np.random.dirichlet(alpha_gen,1)[0]
    # words are collected in a plain list and converted once per document;
    # growing a numpy array with np.append would copy it on every word
    words = []
    for n in range(N[m]):
        z_n = np.random.choice(topic_ids,p=theta)
        w_n = np.random.choice(word_ids,p=beta_gen[:,z_n])
        # each word is stored as the decimal string of its index, the token
        # form expected by corpora.Dictionary
        words.append(str(w_n))
    doc = np.array(words)
    w_struct.append(doc)

In [52]:
# Build the token <-> integer id mapping from the generated documents.
# NOTE(review): the dictionary assigns its own integer ids, so column j of a
# fitted beta presumably does not line up with generated word j - confirm
# before comparing against beta_gen.
dictionary = corpora.Dictionary(w_struct)
# Encode every generated document as a bag-of-words list of (token id, count) pairs
corpus = list(map(dictionary.doc2bow, w_struct))

In [53]:
#corpus : corpus contains bag-of-words
#K : number of topics
#V : vocabulary size (not set in this cell; the value assigned earlier is reused)

K = k
alpha = 0.5
phi = 0.5
max_iter = 50

#initialize parameters
# NOTE: these assignments rebind K, alpha, phi, max_iter, n_m, z_mw and the
# count arrays that were used for the sample-document run.
n_m = words_count_doc(corpus)
z_mw, n_mz, n_zw, n_z = initial_parameters(corpus, K, V)

In [54]:
# Run the sampler on the generated corpus. This call relies on the notebook-level
# z_mw created by the initialisation cell above.
# NOTE: this rebinds beta0 / theta0, replacing the results of the earlier run
# on the sample documents.
beta0, theta0 = gibbs_sampling(corpus, max_iter, K, V, n_zw, n_z, n_mz, n_m, alpha, phi)

In [55]:
# gibbs_sampling appends one beta estimate per iteration, so this equals max_iter
len(beta0)

50