Latent Dirichlet Allocation - Variational Inference
====

Based on the paper "Latent Dirichlet Allocation" by David M. Blei, Andrew Y. Ng, Michael I. Jordan

In [1]:
import numpy as np
from numpy import sqrt,mean,square
import numpy.linalg as la
from scipy.special import digamma, polygamma
import numba
from numba import jit

In [None]:
# NOTE(review): these shell commands change the *global* git identity of whoever runs
# the notebook and embed a personal e-mail address in the file. They are unrelated to
# the LDA analysis below - consider removing this cell before sharing.
!git config --global user.email "kevinjliang2011@gmail.com"
!git config --global user.name "Kevin Liang"

## Parameters

document:    $m = 1,...,M$

topic:       $z = 1,...,k$

word:        $w = 1,...,N_m$

vocabulary : $v = 1,...,V$

$\alpha: 1 \times k$ Model parameter - vector of Dirichlet parameters governing the topic distribution of each document

$\beta: V \times k$ Model parameter - matrix of word probabilities for each topic (each column is one topic's distribution over the vocabulary)

$\phi: M \times N_m \times k$ Variational parameter - matrix of topic probabilities for each word in each document

$\gamma: M \times k$ Variational parameter - matrix of Dirichlet parameters of the variational topic distribution for each document (normalize each row to obtain topic proportions)

In [2]:
np.random.seed(1337)

In [3]:
# Corpus dimensions: number of documents, number of topics, vocabulary size
M, k, V = 300, 10, 30

# Length of every document, drawn uniformly from [150, 200)
N = np.random.randint(150,200,size=M)

print('N: {0}'.format(N))

N: [173 178 190 189 175 189 176 168 170 158 159 156 176 173 174 151 177 179
 156 172 152 191 190 161 151 173 169 196 167 197 177 153 170 158 158 157
 177 159 154 181 183 162 156 154 196 170 168 176 153 191 184 192 154 158
 164 188 153 174 179 158 197 157 154 185 168 159 199 178 163 162 198 195
 183 195 150 199 159 170 195 184 198 198 177 157 170 171 188 194 150 166
 168 155 191 175 198 179 173 169 156 160 195 160 195 166 177 177 153 191
 162 195 165 150 162 157 161 151 188 183 190 178 159 154 157 183 157 181
 160 157 172 153 161 155 192 165 180 191 170 167 150 173 173 152 154 154
 191 156 199 188 181 179 162 164 173 159 178 150 187 167 168 177 155 184
 167 196 193 167 151 169 157 154 157 194 172 194 156 191 194 180 186 186
 152 197 156 151 163 180 166 174 166 158 179 169 176 195 177 188 151 169
 153 187 191 184 189 181 194 172 171 188 151 164 188 180 151 177 187 197
 150 164 167 152 182 186 163 191 155 151 183 197 173 165 187 154 154 172
 181 198 194 181 180 192 193 155 159 184 151 193

### Test data and pre-processing
Randomly generate test data

#### Completely random structure

In [None]:
# Unstructured corpus: every word of every document is drawn uniformly
# from the vocabulary, one integer array of length N[m] per document
w_rand = [np.random.randint(V,size=N[m]) for m in range(M)]

In [None]:
w_rand

#### Add some structure
Generate data according to the LDA model

In [36]:
# Three groups of documents, each favouring a different set of topics (zero-indexed):
# group 1 -> topics 0-2, group 2 -> topics 3-5, group 3 -> topics 6-9
alpha_gen1 = np.array((20,15,10,1,1,1,1,1,1,1))
alpha_gen2 = np.array((1,1,1,10,15,20,1,1,1,1))
alpha_gen3 = np.array((1,1,1,1,1,1,10,12,15,18))

# Arbitrarily choose each topic to have 3 very common words:
# word v gets Dirichlet weight 20 under topic (v mod k) and weight 1 under every other topic
beta_probs = np.ones((V,k)) + np.array([np.arange(V)%k==i for i in range(k)]).T*19
# beta_gen is V x k; each column is one topic's distribution over the vocabulary
beta_gen = np.array(list(map(lambda x: np.random.dirichlet(x),beta_probs.T))).T

w_struct = list()
theta = np.empty((M,k))

# Generate each document
for m in range(M):
    # Draw topic distribution for the document (first, second or last third of the corpus)
    if m<M/3:
        theta[m,:] = np.random.dirichlet(alpha_gen1,1)[0]
    elif m<2*M/3:
        theta[m,:] = np.random.dirichlet(alpha_gen2,1)[0]
    else:
        theta[m,:] = np.random.dirichlet(alpha_gen3,1)[0]

    # Word ids are stored as integers so they can be used directly as row indices
    # into beta; pre-allocating also avoids re-copying the array on every append.
    doc = np.empty(N[m],dtype=int)

    for n in range(N[m]):
        # Draw topic according to document's topic distribution
        z_n = np.random.choice(np.arange(k),p=theta[m,:])
        # Draw word according to topic
        w_n = np.random.choice(np.arange(V),p=beta_gen[:,z_n])
        doc[n] = w_n
    w_struct.append(doc)

In [35]:
# Persist the ground-truth generating parameters.
# beta_gen goes to 'betagenTest.txt' (the name the final save cell also uses for it);
# 'betaTest.txt' is reserved for the *estimated* beta so the two are never confused.
np.savetxt('betagenTest.txt',beta_gen,delimiter=',')
np.savetxt('thetaTest.txt',theta,delimiter=',')

In [None]:
w_struct

### Initialize parameters $\alpha, \beta, \phi$ and $\gamma$
Randomly Initialize parameters to reasonable values

In [37]:
# Model parameters: alpha (length k) and beta (V x k, each column sums to 1)
alpha = 100*np.random.dirichlet(10*np.ones(k),1)[0]
beta = np.random.dirichlet(np.ones(V),k).T

# Variational parameters.
# phi is ragged (documents have different lengths), so it is kept as a 1-D object
# array holding one uniform (N[m], k) matrix per document. Building it explicitly
# avoids the ValueError newer NumPy raises for np.array() on ragged input, and keeps
# the layout even if every document happens to have the same length.
phi = np.empty(M,dtype=object)
for m in range(M):
    phi[m] = np.ones([N[m],k])/k
# gamma starts at alpha + N_m/k for every topic of document m
gamma = np.tile(alpha,(M,1)) + np.tile(N/k,(k,1)).T

In [38]:
alpha

array([ 19.26106287,   5.65205293,  12.31688782,  14.81206391,
         9.35305185,   6.51975435,   5.37469515,   8.99318584,
         6.14538664,  11.57185863])

In [13]:
beta.shape

(30, 10)

In [14]:
phi.shape

(300,)

In [15]:
gamma.shape

(300, 10)

### Optimize variational parameters $\phi$ and $\gamma$

In [39]:
## Optimize variational parameter phi
def opt_phi(beta,gamma,words,M,N,k):
    """Update the variational parameter phi for every word of every document.

    For word n of document m and topic i:
        phi[m][n, i]  is proportional to
        beta[w_mn, i] * exp(digamma(gamma[m, i]) - digamma(sum(gamma[m, :])))
    and each row is then normalized to sum to 1 over the topics.

    Parameters
    ----------
    beta  : (V, k) array, word probabilities per topic (rows indexed by word id)
    gamma : (M, k) array, current variational Dirichlet parameters
    words : sequence of M 1-D arrays of word ids (integer-valued; floats are cast)
    M     : number of documents
    N     : sequence of M document lengths
    k     : number of topics

    Returns
    -------
    1-D object array of length M whose m-th entry is an (N[m], k) array.
    A fresh array is returned; no module-level state is read or modified.
    """
    phi = np.empty(M,dtype=object)
    for m in range(M):
        # Word ids may arrive as floats (e.g. built with np.append); indices must be ints
        word_ids = np.asarray(words[m][:N[m]]).astype(int)
        # Topic weight shared by every word of the document
        topic_weight = np.exp(digamma(gamma[m,:k]) - digamma(np.sum(gamma[m,:])))
        unnormalized = beta[word_ids,:k] * topic_weight
        # Normalize across states so phi represents probability over states for each word
        phi[m] = unnormalized / unnormalized.sum(axis=1,keepdims=True)
    return phi


## Optimize variational parameter gamma
def opt_gamma(alpha,phi,M):
    """Update the variational parameter gamma.

    gamma[m, i] = alpha[i] + sum over the words n of document m of phi[m][n, i]

    alpha : length-k vector, phi : sequence of per-document (N_m, k) arrays,
    M : number of documents. Returns an (M, k) array.
    """
    # Total topic responsibility accumulated over the words of each document
    topic_mass = np.array([np.sum(doc_phi,axis=0) for doc_phi in phi])
    prior = np.tile(alpha,(M,1))
    return prior + topic_mass

### Estimate model parameters $\alpha$ and $\beta$

In [40]:
## Optimize beta
def est_beta(phi,words,k,V):
    """Re-estimate beta, the word probabilities of every topic.

    beta[j, i] is proportional to the sum, over every occurrence of word j in the
    corpus, of that occurrence's phi value for topic i. Each column is then
    normalized so it is a probability distribution over the vocabulary.

    Parameters
    ----------
    phi   : sequence of M arrays, the m-th of shape (N_m, k)
    words : sequence of M 1-D arrays of word ids (integer-valued; floats are cast)
    k     : number of topics
    V     : vocabulary size

    Returns
    -------
    (V, k) array. A fresh array is returned; no module-level state is modified.
    """
    beta = np.zeros((V,k))
    for doc_phi, doc_words in zip(phi,words):
        word_ids = np.asarray(doc_words).astype(int)
        # Unbuffered scatter-add: a word id repeated within the document must
        # accumulate every one of its rows, which plain fancy-index += would not do
        np.add.at(beta,word_ids,doc_phi)

    # Normalize across states so beta represents probability of each word given the state
    return beta / beta.sum(axis=0)


## Optimize alpha
#  (Newton-Raphson method, for a Hessian with special structure)
def est_alpha(alpha,gamma,M,k,nr_max_iters = 1000,tol = 10**-2.0):
    """Re-estimate the Dirichlet parameter alpha by Newton-Raphson.

    The Hessian has the form diag(h) + z * 1 1^T, so the Newton direction is
    obtained in linear time as (g - c) / h instead of inverting a k x k matrix.

    Parameters
    ----------
    alpha        : length-k starting value (entries are expected to be positive)
    gamma        : (M, k) variational Dirichlet parameters
    M            : number of documents
    k            : number of topics (kept for interface compatibility)
    nr_max_iters : maximum number of Newton iterations
    tol          : stop when the RMS change in alpha falls below this value

    Returns
    -------
    length-k array with the updated alpha.

    A full Newton step can push entries of alpha to zero or below, where digamma
    and polygamma are not valid for a Dirichlet parameter. When that happens the
    step is halved until alpha is strictly positive again; whenever the full step
    stays positive the update is exactly the plain Newton update.
    """
    alpha = np.asarray(alpha,dtype=float)
    gamma = np.asarray(gamma,dtype=float)

    # Data-dependent part of the gradient; it does not change between iterations
    gamma_term = np.sum(digamma(gamma) - digamma(np.sum(gamma,axis=1))[:,None],axis=0)

    for it in range(nr_max_iters):
        alpha_old = alpha

        #  Calculate gradient
        g = M*(digamma(np.sum(alpha_old))-digamma(alpha_old)) + gamma_term
        #  Calculate Hessian diagonal component
        h = -M*polygamma(1,alpha_old)
        #  Calculate Hessian constant component
        z = M*polygamma(1,np.sum(alpha_old))
        #  Calculate constant
        c = np.sum(g/h)/(z**(-1.0)+np.sum(h**(-1.0)))

        #  Update alpha, backing off if the step leaves the positive domain
        delta = (g-c)/h
        step = 1.0
        alpha = alpha_old - delta
        while np.any(alpha <= 0) and step > 1e-10:
            step *= 0.5
            alpha = alpha_old - step*delta

        #  Check convergence
        if np.sqrt(np.mean(np.square(alpha-alpha_old)))<tol:
            break

    return alpha

### Expectation Maximization (EM)

#### Convergence Criterion
The variational inference parameter $\gamma$ contains the topic likelihoods of every document and is thus what is of interest here.

Calculate root-mean-square of the change in $\gamma$

In [41]:
def converged(gamma,gamma_old,convergence):
    """Print the RMS change between two gamma estimates and report convergence.

    Returns True when the root-mean-square of (gamma - gamma_old) is strictly
    below the threshold `convergence`.
    """
    rms_change = sqrt(mean(square(gamma-gamma_old)))
    print(rms_change)
    return rms_change < convergence

#### Inference by iterative EM
Continue until convergence criterion above met

In [42]:
# Stopping rule: RMS change in gamma below `convergence`, or at most `max_iters` sweeps
max_iters = 10**3
convergence = 10**(-2.0)
successfully_Converged = False

for iters in range(max_iters):
    print(iters)
    # Keep the previous estimate to measure how far gamma moved in this sweep
    gamma_old = gamma

    ## E-step: refresh the variational parameters given the current model
    phi = opt_phi(beta,gamma,w_struct,M,N,k)
    gamma = opt_gamma(alpha,phi,M)

    ## M-step: refresh the model parameters given the variational parameters
    beta = est_beta(phi,w_struct,k,V)
    alpha = est_alpha(alpha,gamma,M,k)

    if not converged(gamma,gamma_old,convergence):
        continue
    successfully_Converged = True
    break

0




3.4379047275
1
11.2602500949
2
4.61629583073
3
2.24574696154
4
0.7600085649
5
1.13102478184
6
2.58191284139
7
3.87080653526
8
4.72188189671
9
4.90524498381
10
4.39268634455
11
3.52224965916
12
2.77822756912
13
2.27260159679
14
1.79908874414
15
1.30506139594
16
0.904413034419
17
0.647897344679
18
0.499094024025
19
0.409252302522
20
0.348716985068
21
0.304011539672
22
0.269373988056
23
0.241966752743
24
0.220085107103
25
0.202576032338
26
0.188557696442
27
0.177369213968
28
0.168470683386
29
0.161401022943
30
0.155765645682
31
0.151230493201
32
0.147518509471
33
0.144405712774
34
0.141715958878
35
0.139314585263
36
0.137101516394
37
0.135004452328
38
0.132972662135
39
0.13097173338
40
0.12897942617
41
0.126982591923
42
0.124974990621
43
0.122955793272
44
0.120928564287
45
0.118900533411
46
0.116881951073
47
0.114885277058
48
0.11292393119
49
0.111010417838
50
0.109153889453
51
0.107357624382
52
0.105617309656
53
0.103921158474
54
0.10225248153
55
0.100594365139
56
0.098935023507
57
0.097

KeyboardInterrupt: 

In [43]:
alpha

array([ 1.40771518,  0.67984295,  1.16526413,  0.91928545,  1.0853886 ,
        1.94082134,  0.84966669,  0.72647172,  0.7981282 ,  1.49462569])

In [28]:
alpha_gen1

array([ 1,  1, 10,  1,  1, 20,  1, 15,  1,  1])

In [29]:
alpha_gen2

array([20,  1,  1,  1, 15,  2,  1,  1,  1, 10])

In [None]:
beta

In [None]:
beta_gen

In [45]:
gamma

array([[ 65.84418849,   6.71799072,   2.99258713, ...,  26.6700617 ,
          1.10891826,  10.338058  ],
       [ 46.17470251,   7.77396843,   6.26149442, ...,  14.85642314,
          3.0511823 ,  20.65051345],
       [ 58.42898293,   1.78242891,  15.6243198 , ...,  31.84009456,
          2.10838161,  13.57102881],
       ..., 
       [  4.13160447,  11.33137087,  58.3315144 , ...,   3.93008105,
         25.51560979,  64.94855055],
       [  4.93836965,  16.12444379,  40.3306256 , ...,   1.8939937 ,
         42.65153321,  48.99622875],
       [  3.4281343 ,  30.49377288,  46.68805879, ...,   4.88721628,
         13.07539904,  45.33236409]])

In [48]:
np.savetxt('gammaTest.txt',gamma,delimiter=',')
np.savetxt('betaTest.txt',beta,delimiter=',')
np.savetxt('betagenTest.txt',beta_gen,delimiter=',')

In [46]:
theta

array([[ 0.43911811,  0.18764257,  0.21984055, ...,  0.00584336,
         0.00446856,  0.0040175 ],
       [ 0.29015734,  0.43383524,  0.18477215, ...,  0.01731905,
         0.02167948,  0.0112847 ],
       [ 0.30419096,  0.34069983,  0.25099713, ...,  0.00160195,
         0.04501012,  0.00193221],
       ..., 
       [ 0.0066659 ,  0.0280762 ,  0.02455572, ...,  0.16873897,
         0.24125024,  0.37762248],
       [ 0.01045834,  0.0077218 ,  0.02022639, ...,  0.23870335,
         0.20384342,  0.29737286],
       [ 0.01611858,  0.01505298,  0.0041712 , ...,  0.1937288 ,
         0.30519794,  0.136846  ]])

### Tests 
Testing out syntax and array dimensions

In [None]:
theta

In [None]:
gamma/np.sum(gamma,axis=1)[:,None]

In [None]:
# Word #11 in document 2 (w_dn)
w_rand[1][10]

In [None]:
[doc == 3 for doc in w_rand]

In [None]:
[doc == 3 for doc in w_rand]*w_rand

In [None]:
np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi))),axis=0)