Latent Dirichlet Allocation - Variational Inference
====

Based on the paper "Latent Dirchlet Allocation" by David M. Blei, Andrew Y. Ng, Michael I. Jordan

In [22]:
import numpy as np
import numpy.linalg as la
from scipy.special import digamma, polygamma

## Parameters

document:    $m = 1,...,M$

topic:       $z = 1,...,k$

word:        $w = 1,...,N_m$

vocabulary : $v = 1,...,V$

$\alpha: 1 \times k$ vector of topic distribution probabilities

$\beta: k \times v$ matrix of word probabilities for each topic

$\phi: M \times N_m \times k$ matrix of topic probabilities for each word in each document

$\gamma: M \times k$ matrix of topic probabilities for each document

In [8]:
np.random.seed(1337)

### Test data and pre-processing

Run the following first:
```
pip install -U nltk
pip install stop-words
pip install -U gensim
```

In [52]:
!pip install -U nltk
!pip install stop-words
!conda install -y gensim

Requirement already up-to-date: nltk in /opt/conda/lib/python3.4/site-packages
Fetching package metadata: ......
Solving package specifications: .........
  - r-irkernel-0.5-r3.2.2_1a.tar.bz2
  - r-irkernel-0.5-r3.2.2_2.tar.bz2

Package plan for installation in environment /opt/conda:

The following packages will be UPDATED:

    r-irkernel: 0.5-r3.2.2_2 --> 0.5-r3.2.2_1a

Unlinking packages ...
[      COMPLETE      ]|###################################################| 100%
Linking packages ...
[      COMPLETE      ]|###################################################| 100%


In [53]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [54]:
corpus

[[(0, 1), (1, 1), (2, 2), (3, 2), (4, 1), (5, 2)],
 [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(9, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(0, 1),
  (4, 1),
  (9, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(3, 1), (5, 1), (19, 2), (30, 1), (31, 1)]]

In [9]:
M = 3
k = 10
N = np.random.randint(50,size=M)
V = 20

print('N: {0}'.format(N))

N: [23 28 40]


In [70]:
# Generate random "documents"
doc1 = np.random.randint(V,size=N[0])
doc2 = np.random.randint(V,size=N[1])
doc3 = np.random.randint(V,size=N[2])

w = np.array((doc1,doc2,doc3))
w

array([ array([18,  8,  2,  4,  2,  2,  5, 19, 15,  2,  6, 11, 12,  8, 11, 19, 12,
        5,  5, 19,  2, 16,  7]),
       array([ 5, 10, 10, 16,  8, 12, 14, 14,  6,  7, 13,  6, 16, 18, 18,  2, 10,
        1,  1, 18, 16,  2,  1,  1, 18,  3,  0, 18]),
       array([ 6,  9,  7,  0,  8, 11,  3, 17,  3, 10, 17,  8, 14,  4, 18, 17, 13,
       11, 19,  2, 16, 16,  8,  1,  1,  4,  5,  8,  0, 14,  0,  4,  8,  1,
        6, 19,  6, 10,  2, 11])], dtype=object)

In [11]:
# Word #11 in document 2 (w_dn)
w[1][10]

3

In [28]:
[doc == 3 for doc in w]

[array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False], dtype=bool),
 array([False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
         True, False, False, False, False,  True, False, False, False, False], dtype=bool),
 array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False,  True,
        False, False, False, False], dtype=bool)]

In [29]:
[doc == 3 for doc in w]*w

array([ array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0]),
       array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       3, 0, 0, 0, 0]),
       array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0])], dtype=object)

### Initialize parameters $\alpha, \beta, \phi$ and $\gamma$

In [73]:
alpha = np.random.dirichlet(np.ones(k),1)
beta = np.random.dirichlet(np.ones(k),V)

phi = np.array([1/k*np.ones([N[m],k]) for m in range(M)])
gamma = np.tile(alpha,(M,1)) + np.tile(N/k,(k,1)).T

In [5]:
alpha

array([[ 0.03614628,  0.0682012 ,  0.04293727,  0.08103403,  0.03368725,
         0.41405727,  0.14638083,  0.01358403,  0.05414736,  0.10982448]])

In [36]:
beta.shape

(20, 10)

In [42]:
phi.shape

(3,)

In [39]:
gamma.shape

(3, 10)

### Optimize variational parameters $\phi$ and $\gamma$

In [75]:
# TODO: Split phi and gamma optimization apart for parallelization purposes
# TODO: See if some sort of vectorization is possible for speed-up
def optVarParams(alpha,beta,phi,gamma,words):
    ## Optimize phi
    for m in range(M):
        for n in range(N[m]):
            for i in range(k):
                phi[m][n,i] = beta[words[m][n],i] * np.exp(digamma(gamma[m,i]) - digamma(np.sum(gamma[m,:])))
            # Normalize across states so phi represents probability over states for each word
            phi[m][n,:] = phi[m][n,:]/sum(phi[m][n,:])
    
    ## Optimize gamma
    gamma = np.tile(alpha,(M,1)) + np.array(list(map(lambda x: np.sum(x,axis=0),phi)))
    
    return phi,gamma

In [45]:
optVarParams(alpha,beta,phi,gamma,w)

(array([ array([[  6.93716419e-03,   2.23254209e-02,   7.08126000e-03,
           4.22454898e-03,   1.42928522e-03,   1.64223228e-02,
           9.12799952e-04,   4.47430769e-03,   1.54304709e-02,
           2.57855533e-03],
        [  6.93716419e-03,   2.23254209e-02,   7.08126000e-03,
           4.22454898e-03,   1.42928522e-03,   1.64223228e-02,
           9.12799952e-04,   4.47430769e-03,   1.54304709e-02,
           2.57855533e-03],
        [  3.36505257e-03,   4.16752373e-03,   3.96037316e-03,
           1.53020948e-03,   4.36411771e-03,   1.68136177e-03,
           1.48836428e-02,   1.63684362e-02,   1.08669884e-02,
           1.94380683e-02],
        [  3.36505257e-03,   4.16752373e-03,   3.96037316e-03,
           1.53020948e-03,   4.36411771e-03,   1.68136177e-03,
           1.48836428e-02,   1.63684362e-02,   1.08669884e-02,
           1.94380683e-02],
        [  6.00425674e-03,   4.73821292e-03,   6.77860440e-03,
           8.90829178e-03,   1.70636597e-03,   5.25602330e-04

### Estimate model parameters $\alpha$ and $\beta$

In [71]:
def estModParams(alpha,beta,phi,gamma,words):
    ## Optimize beta
    for j in range (V):
        w_dnj = [np.tile((word == j),(k,1)).T for word in w]
        beta[j,:] = np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi*w_dnj))),axis=0)
        # Normalize across states so beta represents probability over states for each word
        beta[j,:] = beta[j,:]/sum(beta[j,:])
    
    ## Optimize alpha
    nr_max_iters = 1000
    tol = 10^-3
    for it in range(nr_max_iters):
        alpha_old = alpha
        
        #  Calculate gradient 
        g = M*(digamma(np.sum(alpha))-digamma(alpha)) + np.sum(digamma(gamma)-np.tile(digamma(np.sum(gamma,axis=1)),(k,1)).T,axis=0)
        #  Calculate Hessian diagonal component
        h = M*polygamma(1,alpha) 
        #  Calculate Hessian constant component
        z = -polygamma(1,np.sum(alpha))
        #  Calculate constant
        c = np.sum(g/h)/(z**(-1)+np.sum(h**(-1)))

        #  Update alpha
        alpha = alpha - (g-c)/h
        
        #  Check convergence
        if la.norm(alpha-alpha_old)<tol:
            break
    
    return alpha,beta

In [74]:
estModParams(alpha,beta,phi,gamma,w)



(array([[ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan]]),
 array([[ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1],
        [ 0.1,  0.

### Expectation Maximization

#### Convergence Criterion
The variational inference parameter $\gamma$ contains the topic likelihoods of every document and is thus what is of interest here.

Calculate root-mean-square of the change in $\gamma$

In [47]:
from numpy import sqrt,mean,square

def converged(gamma,gamma_old,convergence):
    return sqrt(mean(square(gamma-gamma_old))) < convergence

#### Inference by iterative EM
Continue until convergence criterion above met

In [48]:
convergence = 10**(-3)
successfully_Converged = False
max_iters = 10**4

'''
Pseudocode

for iters in range(max_iters):
    phi,gamma  = optVarParams(alpha,beta,phi,gamma)
    alpha,beta = estModParams(alpha,beta,phi,gamma)
    if converged(gamma,gamma_old,convergence):
        successfully_Converged = True
        break
'''        

'\nPseudocode\n\nfor iters in range(max_iters):\n    phi,gamma  = optVarParams(alpha,beta,phi,gamma)\n    alpha,beta = estModParams(alpha,beta,phi,gamma)\n    if converged(gamma,gamma_old,convergence):\n        successfully_Converged = True\n        break\n'

### Tests 
Testing out syntax and array dimensions

In [11]:
np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi))),axis=0)

array([ 9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1])

In [49]:
[doc == 3 for doc in w]*w

array([ array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0]),
       array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       3, 0, 0, 0, 0]),
       array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0])], dtype=object)