Latent Dirichlet Allocation - Variational Inference
====

Based on the paper "Latent Dirichlet Allocation" by David M. Blei, Andrew Y. Ng, and Michael I. Jordan

In [152]:
import numpy as np
from numpy import sqrt,mean,square
import numpy.linalg as la
from scipy.special import digamma, polygamma
import numba
from numba import jit

In [None]:
!git config --global user.email "kevinjliang2011@gmail.com"
!git config --global user.name "Kevin Liang"

## Parameters

document:    $m = 1,...,M$

topic:       $z = 1,...,k$

word:        $w = 1,...,N_m$

vocabulary : $v = 1,...,V$

$\alpha: 1 \times k$ Model parameter - vector of Dirichlet prior parameters governing the topic distribution of each document

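$\beta: V \times k$ Model parameter - matrix of word probabilities for each topic (stored in the code as `beta[word, topic]`, so each column is one topic's distribution over the vocabulary)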

$\phi: M \times N_m \times k$ Variational parameter - matrix of topic probabilities for each word in each document

$\gamma: M \times k$ Variational parameter - matrix of Dirichlet parameters of the approximate posterior over topics for each document (normalize each row to obtain topic proportions)

In [None]:
# Seed NumPy's global RNG so the random draws in the cells below are repeatable
np.random.seed(1337)

In [None]:
# Fixed corpus dimensions
M = 300    # number of documents
k = 10     # number of topics
V = 30     # vocabulary size

# Length of each document, drawn uniformly from the integers 150..199
N = np.random.randint(150, 200, size=M)

print('N: {0}'.format(N))

### Test data and pre-processing
Randomly generate test data

#### Completely random structure

In [None]:
# Unstructured corpus: document m is N[m] word ids drawn uniformly from 0..V-1
w_rand = [np.random.randint(V, size=N[m]) for m in range(M)]

In [None]:
w_rand

#### Add some structure
Generate data according to the LDA model

In [None]:
# Dirichlet parameters used to draw each document's topic mixture.
# Topics 2, 5 and 7 (zero-indexed) are arbitrarily given the largest weights.
alpha_gen = np.array((1,1,10,1,1,20,1,15,1,1))

# Dirichlet parameters over the vocabulary for each topic, shape (V, k):
# word v gets weight 20 under topic v % k and weight 1 under every other topic,
# so each topic has a handful of very common words (3 when V = 30 and k = 10).
beta_probs = np.ones((V,k)) + np.array([np.arange(V)%k==i for i in range(k)]).T*19
# One Dirichlet draw per topic; column z of beta_gen is p(word | topic z)
beta_gen = np.array([np.random.dirichlet(conc) for conc in beta_probs.T]).T

w_struct = list()
theta = np.empty((M,k))

# Generate each document
for m in range(M):
    # Draw topic distribution for the document
    theta[m,:] = np.random.dirichlet(alpha_gen,1)[0]
    # Preallocate an integer buffer: word ids are later used as array indices,
    # which float ids (the result of appending to np.array([])) cannot be.
    doc = np.empty(N[m], dtype=int)

    for n in range(N[m]):
        # Draw topic according to document's topic distribution
        z_n = np.random.choice(np.arange(k),p=theta[m,:])
        # Draw word according to topic
        w_n = np.random.choice(np.arange(V),p=beta_gen[:,z_n])
        doc[n] = w_n
    w_struct.append(doc)

In [None]:
w_struct

### Initialize parameters $\alpha, \beta, \phi$ and $\gamma$
Randomly initialize parameters to reasonable values

In [None]:
# alpha: length-k positive vector (a symmetric Dirichlet draw scaled to sum to 100)
alpha = 100*np.random.dirichlet(np.ones(k),1)[0]
# beta: shape (V, k); each column is a random distribution over the vocabulary
beta = np.random.dirichlet(np.ones(V),k).T

# phi: one (N[m], k) matrix per document, every entry initialised to 1/k.
# Document lengths differ, so store the matrices in an explicit object array;
# calling np.array() on the ragged list raises ValueError on NumPy >= 1.24.
phi = np.empty(M, dtype=object)
for m in range(M):
    phi[m] = np.full((N[m], k), 1.0/k)

# gamma: shape (M, k), gamma[m, i] = alpha[i] + N[m]/k
gamma = alpha + (np.asarray(N)/k)[:, None]

In [None]:
alpha

In [None]:
beta.shape

In [None]:
phi.shape

In [None]:
gamma.shape

### Optimize variational parameters $\phi$ and $\gamma$

In [162]:
## Optimize variational parameter phi
def opt_phi(beta,gamma,words,M,N,k):
    """Update the variational parameter phi for every document.

    For document m and word position n the update is
    phi[m][n, i] proportional to beta[words[m][n], i] * exp(digamma(gamma[m, i])),
    normalized so that each row sums to one over the k topics.

    Parameters
    ----------
    beta : array indexed as beta[word, topic]
    gamma : array indexed as gamma[document, topic]
    words : sequence of M 1-D arrays of word ids (float ids are cast to int)
    M : number of documents
    N : sequence giving the number of words in each document
    k : number of topics

    Returns
    -------
    Object array of length M whose m-th entry is the (N[m], k) matrix phi[m].
    The result is freshly allocated; no module-level state is read or modified.
    """
    phi = np.empty(M, dtype=object)
    for m in range(M):
        # Word ids are used as row indices into beta, so they must be integers
        word_ids = np.asarray(words[m][:N[m]]).astype(int)
        # exp(E_q[log theta_i]) for this document; identical for all of its words
        topic_weight = np.exp(digamma(gamma[m,:k]) - digamma(np.sum(gamma[m,:])))
        unnormalized = beta[word_ids,:k] * topic_weight
        # Normalize across topics so each row is a distribution over topics
        phi[m] = unnormalized / unnormalized.sum(axis=1, keepdims=True)
    return phi


## Optimize variational parameter gamma
def opt_gamma(alpha,phi,M):
    """Update the variational parameter gamma.

    Row m of the result is alpha plus the column sums of phi[m], i.e. the
    prior parameters plus the expected number of words assigned to each topic.
    """
    # Sum phi over word positions within each document -> one row per document
    expected_counts = np.array([np.sum(doc_phi, axis=0) for doc_phi in phi])
    prior = np.tile(alpha, (M, 1))
    return prior + expected_counts

### Estimate model parameters $\alpha$ and $\beta$

In [160]:
## Optimize beta
def est_beta(phi,words,k,V):
    """Estimate the topic-word matrix beta from the variational parameter phi.

    beta[j, i] is proportional to the sum, over all documents and word
    positions whose word id equals j, of phi[d][n, i]. Each column is then
    normalized so it is a distribution over the V vocabulary words.

    Parameters
    ----------
    phi : sequence of per-document (N_d, k) arrays
    words : sequence of per-document 1-D arrays of word ids
    k : number of topics
    V : vocabulary size

    Returns
    -------
    New (V, k) array; no module-level state is read or modified.
    """
    beta = np.zeros((V, k))
    for doc_phi, doc_words in zip(phi, words):
        doc_phi = np.asarray(doc_phi, dtype=float)
        word_ids = np.asarray(doc_words).astype(int)
        # Ids outside the vocabulary contribute nothing
        in_vocab = (word_ids >= 0) & (word_ids < V)
        # Unbuffered scatter-add so repeated word ids accumulate correctly
        np.add.at(beta, word_ids[in_vocab], doc_phi[in_vocab, :k])

    # Normalize across words so each column is p(word | topic)
    beta /= beta.sum(axis=0, keepdims=True)
    return beta


## Optimize alpha
#  (Newton-Raphson method, for a Hessian with special structure)
def est_alpha(alpha,gamma,M,k,nr_max_iters = 1000,tol = 10**-2.0):
    """Estimate the Dirichlet parameter alpha by Newton-Raphson.

    The Hessian of the objective has the form diag(h) + z * ones((k, k)),
    so the Newton step H^{-1} g is computed in linear time as (g - c) / h.

    Parameters
    ----------
    alpha : starting point, length-k vector of positive values
    gamma : (M, k) array of variational Dirichlet parameters
    M : number of documents
    k : number of topics (kept for interface compatibility; not needed here)
    nr_max_iters : maximum number of Newton iterations
    tol : stop when the root-mean-square change in alpha falls below this

    Returns
    -------
    Length-k array with the updated alpha.
    """
    alpha = np.asarray(alpha, dtype=float)
    gamma = np.asarray(gamma, dtype=float)

    # Sum over documents of digamma(gamma_di) - digamma(sum_j gamma_dj);
    # it does not depend on alpha, so compute it once outside the loop.
    suff_stats = np.sum(digamma(gamma) - digamma(np.sum(gamma,axis=1))[:,None],axis=0)

    for it in range(nr_max_iters):
        alpha_old = alpha

        #  Calculate gradient
        g = M*(digamma(np.sum(alpha))-digamma(alpha)) + suff_stats
        #  Calculate Hessian diagonal component
        h = -M*polygamma(1,alpha)
        #  Calculate Hessian constant component (summed over all M documents,
        #  so it carries the same factor M as the diagonal component)
        z = M*polygamma(1,np.sum(alpha))
        #  Calculate constant
        c = np.sum(g/h)/(1.0/z+np.sum(1.0/h))

        #  Update alpha, halving the step while it would leave the positive
        #  orthant (digamma/polygamma are only meaningful here for alpha > 0)
        step = (g-c)/h
        alpha = alpha_old - step
        halvings = 0
        while np.any(alpha <= 0) and halvings < 50:
            step = step/2
            alpha = alpha_old - step
            halvings += 1

        #  Check convergence
        if sqrt(mean(square(alpha-alpha_old)))<tol:
            break

    return alpha

### Expectation Maximization (EM)

#### Convergence Criterion
The variational inference parameter $\gamma$ contains the topic likelihoods of every document and is thus what is of interest here.

Calculate root-mean-square of the change in $\gamma$

In [158]:
def converged(gamma,gamma_old,convergence):
    """Print the RMS change between gamma and gamma_old and test it.

    Returns a truthy value when the root-mean-square difference is strictly
    below the `convergence` threshold.
    """
    rms_change = sqrt(mean(square(gamma-gamma_old)))
    print(rms_change)
    return rms_change < convergence

#### Inference by iterative EM
Continue until the convergence criterion above is met

In [None]:
# Variational EM driver: alternate E- and M-steps on the structured corpus
# (w_struct) until gamma stops changing or the iteration budget runs out.
convergence = 10**(-2.0)  # threshold on the RMS change in gamma between iterations
successfully_Converged = False  # stays False if max_iters is exhausted
max_iters = 10**3

for iters in range(max_iters):
    print(iters)
    # Keep the previous gamma for the convergence test; opt_gamma below builds
    # a new array, so this reference is not overwritten.
    gamma_old = gamma
    
    ## Expectation step: Update variational parameters
    phi   = opt_phi(beta,gamma,w_struct,M,N,k)
    gamma = opt_gamma(alpha,phi,M)
    
    ## Maximization step: Update model parameters
    beta  = est_beta(phi,w_struct,k,V)
    alpha = est_alpha(alpha,gamma,M,k)
    
    # converged() also prints the RMS change, giving a per-iteration trace
    if converged(gamma,gamma_old,convergence):
        successfully_Converged = True
        break

In [None]:
alpha

In [None]:
alpha_gen

In [None]:
beta

In [None]:
beta_gen

### Tests 
Testing out syntax and array dimensions

In [None]:
theta

In [None]:
gamma/np.sum(gamma,axis=1)[:,None]

In [None]:
# Word #11 in document 2 (w_dn)
w_rand[1][10]

In [None]:
[doc == 3 for doc in w_rand]

In [None]:
# Zero out every word id except 3 in each document.
# Two Python lists cannot be multiplied together (TypeError), so the boolean
# mask is applied to each document array individually.
[(doc == 3)*doc for doc in w_rand]

In [None]:
np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi))),axis=0)