Latent Dirichlet Allocation - Variational Inference
====

Based on the paper "Latent Dirchlet Allocation" by David M. Blei, Andrew Y. Ng, Michael I. Jordan

In [1]:
import numpy as np
import numpy.linalg as la
from scipy.special import digamma

## Parameters

document:    $m = 1,...,M$

topic:       $z = 1,...,k$

word:        $w = 1,...,N_m$

vocabulary : $v = 1,...,V$

$\alpha: 1 \times k$ vector of topic distribution probabilities

$\beta: k \times v$ matrix of word probabilities for each topic

In [2]:
np.random.seed(1337)

### Test data and pre-processing

Run the following first:
```
pip install -U nltk
pip install stop-words
pip install -U gensim
```

In [4]:
!pip install -U nltk
!pip install stop-words
!pip install -U gensim

Requirement already up-to-date: nltk in /opt/conda/lib/python3.4/site-packages
Collecting gensim
  Using cached gensim-0.12.4.tar.gz
Collecting numpy>=1.3 (from gensim)
  Using cached numpy-1.11.0-cp34-cp34m-manylinux1_x86_64.whl
Requirement already up-to-date: scipy>=0.7.0 in /opt/conda/lib/python3.4/site-packages (from gensim)
Requirement already up-to-date: six>=1.5.0 in /opt/conda/lib/python3.4/site-packages/six-1.10.0-py3.4.egg (from gensim)
Collecting smart-open>=1.2.1 (from gensim)
  Using cached smart_open-1.3.2.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
  Using cached boto-2.39.0-py2.py3-none-any.whl
Collecting httpretty==0.8.10 (from smart-open>=1.2.1->gensim)
  Using cached httpretty-0.8.10.tar.gz
    Complete output from command python setup.py egg_info:
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "/tmp/pip-build-em60bd2t/httpretty/setup.py", line 86, in <module>
        version=read_version(),
      File "

In [3]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

ImportError: No module named 'gensim'

In [3]:
M = 3
k = 10
N = np.random.randint(50,size=M)
V = 100

print('N: {0}'.format(N))

N: [23 28 40]


### Initialize parameters $\alpha, \beta, \phi$ and $\gamma$

In [8]:
alpha = np.random.dirichlet(np.ones(k),1)
beta = np.random.dirichlet(np.ones(k),V)

phi = np.array([1/k*np.ones([N[m],k]) for m in range(M)])
gamma = np.tile(alpha,(M,1)) + np.tile(N/k,(k,1)).T

In [5]:
alpha

array([[ 0.03614628,  0.0682012 ,  0.04293727,  0.08103403,  0.03368725,
         0.41405727,  0.14638083,  0.01358403,  0.05414736,  0.10982448]])

In [7]:
beta.shape

(100, 10)

In [8]:
phi.shape

(3,)

In [9]:
gamma.shape

(3, 10)

### Optimize variational parameters $\phi$ and $\gamma$

In [None]:
# TODO: Split phi and gamma optimization apart for parallelization purposes
# TODO: See if some sort of vectorization is possible for speed-up
def optVarParams(alpha,beta,phi,gamma,words):
    # Optimize phi
    for m in range(M):
        for n in range(N[m]):
            for i in range(k):
                phi[m][n,i] = beta[i,words[m][n]] * np.exp(digamma(gamma[m,i]) - digamma(np.sum(gamma[m,:])))
    
    # Optimize gamma
    gamma = np.tile(alpha,(M,1)) + np.array(list(map(lambda x: np.sum(x,axis=0),phi))).T
    
    return phi,gamma

### Estimate model parameters $\alpha$ and $\beta$

In [None]:
def estModParams(alpha,beta,phi,gamma,words):
    # Optimize beta
    for i in range(k):
        for j in range (V):
            beta[i,j] = np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi)),axis=0)


    return alpha,beta

### Expectation Maximization

In [None]:
convergence = 10**(-3)

'''
Pseudocode

while(!converged):
    phi,gamma  = optVarParams(alpha,beta,phi,gamma)
    alpha,beta = estModParams(alpha,beta,phi,gamma)
    if converged(alpha,beta):
        break
'''        

### Tests 
Testing out syntax and array dimensions

In [11]:
np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi))),axis=0)

array([ 9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1])