Latent Dirichlet Allocation - Variational Inference
====

Based on the paper "Latent Dirchlet Allocation" by David M. Blei, Andrew Y. Ng, Michael I. Jordan

In [5]:
import numpy as np
from numpy import sqrt,mean,square
import numpy.linalg as la
from scipy.special import digamma, polygamma

## Parameters

document:    $m = 1,...,M$

topic:       $z = 1,...,k$

word:        $w = 1,...,N_m$

vocabulary : $v = 1,...,V$

$\alpha: 1 \times k$ vector of topic distribution probabilities

$\beta: k \times v$ matrix of word probabilities for each topic

$\phi: M \times N_m \times k$ matrix of topic probabilities for each word in each document

$\gamma: M \times k$ matrix of topic probabilities for each document

In [6]:
np.random.seed(1337)

### Test data and pre-processing

Run the following first:
```
pip install -U nltk
pip install stop-words
pip install -U gensim
```

In [52]:
!pip install -U nltk
!pip install stop-words
!conda install -y gensim

Requirement already up-to-date: nltk in /opt/conda/lib/python3.4/site-packages
Fetching package metadata: ......
Solving package specifications: .........
  - r-irkernel-0.5-r3.2.2_1a.tar.bz2
  - r-irkernel-0.5-r3.2.2_2.tar.bz2

Package plan for installation in environment /opt/conda:

The following packages will be UPDATED:

    r-irkernel: 0.5-r3.2.2_2 --> 0.5-r3.2.2_1a

Unlinking packages ...
[      COMPLETE      ]|###################################################| 100%
Linking packages ...
[      COMPLETE      ]|###################################################| 100%


In [53]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [54]:
corpus

[[(0, 1), (1, 1), (2, 2), (3, 2), (4, 1), (5, 2)],
 [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(9, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)],
 [(0, 1),
  (4, 1),
  (9, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(3, 1), (5, 1), (19, 2), (30, 1), (31, 1)]]

### Randomly generated test data

In [7]:
M = 3
k = 10
N = np.random.randint(50,size=M)
V = 20

print('N: {0}'.format(N))

N: [23 28 40]


#### Completely random structure

In [8]:
# Generate random "documents"
doc1 = np.random.randint(V,size=N[0])
doc2 = np.random.randint(V,size=N[1])
doc3 = np.random.randint(V,size=N[2])

w = np.array((doc1,doc2,doc3))
w

array([ array([ 7,  7, 18, 18,  8,  9,  6,  1,  6, 18,  2,  9,  8, 11,  1, 19, 14,
       17, 15, 19,  3,  8,  8]),
       array([ 7,  9,  4,  1, 12,  6,  4, 18, 14, 18,  3,  9,  2, 10,  4,  8, 14,
        6,  3,  8, 15,  7,  4,  3, 18,  9, 17, 13]),
       array([18, 12, 16, 13, 19, 18,  1, 13,  0, 17,  9, 13,  2, 16, 16,  7,  6,
       12, 19,  0, 16, 18,  5,  9, 16, 18, 18, 19,  6, 10, 13, 10, 19, 13,
       16,  3,  9, 12, 13, 15])], dtype=object)

#### Add some structure

In [57]:
alpha_gen = np.array((1,1,10,1,1,7,1,20,1,1))
beta_gen = np.random.dirichlet(0.1*np.ones(V),k).T

w_struct = list();

for m in range(M):
    theta = np.random.dirichlet(alpha_gen,1)[0]
    doc = np.array([])
    for n in range(N[m]):
        z_n = np.random.choice(np.arange(k),p=theta)
        w_n = np.random.choice(np.arange(V),p=beta_gen[:,z_n])
        doc = np.append(doc,w_n)
    w_struct.append(doc)

In [58]:
w_struct

[array([  1.,  11.,   8.,  10.,  10.,   8.,  10.,   4.,   4.,  19.,   8.,
          0.,  12.,   8.,  12.,  11.,  19.,   0.,  19.,  12.,  19.,   9.,
         10.]),
 array([  8.,   3.,   8.,  10.,   7.,  10.,  10.,   7.,  10.,   7.,   7.,
         13.,  19.,   0.,  14.,   1.,  19.,  11.,  11.,   7.,  10.,   4.,
         11.,   4.,  10.,   7.,  10.,   7.]),
 array([ 19.,   7.,  19.,  10.,   7.,   5.,  19.,  19.,   1.,  11.,   0.,
         10.,  12.,  12.,   8.,  13.,   7.,   7.,  19.,   7.,   0.,   8.,
         19.,  10.,  12.,  13.,  19.,   8.,  12.,   3.,   8.,   7.,  13.,
          7.,  12.,  12.,   2.,   7.,   8.,  10.])]

### Initialize parameters $\alpha, \beta, \phi$ and $\gamma$

In [77]:
alpha = np.random.dirichlet(np.ones(k),1)[0]
beta = np.random.dirichlet(np.ones(V),k).T

phi = np.array([1/k*np.ones([N[m],k]) for m in range(M)])
gamma = np.tile(alpha,(M,1)) + np.tile(N/k,(k,1)).T

In [5]:
alpha

array([[ 0.03614628,  0.0682012 ,  0.04293727,  0.08103403,  0.03368725,
         0.41405727,  0.14638083,  0.01358403,  0.05414736,  0.10982448]])

In [36]:
beta.shape

(20, 10)

In [42]:
phi.shape

(3,)

In [39]:
gamma.shape

(3, 10)

### Optimize variational parameters $\phi$ and $\gamma$

In [100]:
# TODO: Split phi and gamma optimization apart for parallelization purposes
# TODO: See if some sort of vectorization is possible for speed-up
def optVarParams(alpha,beta,phi,gamma,words):
    ## Optimize phi
    for m in range(M):
        for n in range(N[m]):
            for i in range(k):
                phi[m][n,i] = beta[words[m][n],i] * np.exp(digamma(gamma[m,i]) - digamma(np.sum(gamma[m,:])))
            # Normalize across states so phi represents probability over states for each word
            phi[m][n,:] = phi[m][n,:]/sum(phi[m][n,:])
    
    ## Optimize gamma
    gamma = np.tile(alpha,(M,1)) + np.array(list(map(lambda x: np.sum(x,axis=0),phi)))
    
    return phi,gamma

In [96]:
phi,gamma = optVarParams(alpha,beta,phi,gamma,w_struct)



In [97]:
phi

array([ array([[ 0.07243487,  0.11005002,  0.10910023,  0.07741408,  0.0930058 ,
         0.09731828,  0.08083785,  0.1848577 ,  0.08452286,  0.09045831],
       [ 0.07064096,  0.110401  ,  0.10938119,  0.07582152,  0.09221637,
         0.09679324,  0.07939991,  0.19255947,  0.08326543,  0.08952091],
       [ 0.07137439,  0.11025825,  0.10926707,  0.07647285,  0.0925398 ,
         0.09700864,  0.07998815,  0.18940608,  0.08378   ,  0.08990478],
       [ 0.07470869,  0.1096044 ,  0.10874335,  0.07943248,  0.09400574,
         0.09798304,  0.08266013,  0.17510009,  0.08611622,  0.09164586],
       [ 0.07470869,  0.1096044 ,  0.10874335,  0.07943248,  0.09400574,
         0.09798304,  0.08266013,  0.17510009,  0.08611622,  0.09164586],
       [ 0.07137439,  0.11025825,  0.10926707,  0.07647285,  0.0925398 ,
         0.09700864,  0.07998815,  0.18940608,  0.08378   ,  0.08990478],
       [ 0.07470869,  0.1096044 ,  0.10874335,  0.07943248,  0.09400574,
         0.09798304,  0.08266013,  0.

### Estimate model parameters $\alpha$ and $\beta$

In [99]:
def estModParams(alpha,beta,phi,gamma,words):
    ## Optimize beta
    for j in range (V):
        w_dnj = [np.tile((word == j),(k,1)).T for word in w]
        beta[j,:] = np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi*w_dnj))),axis=0)
        
    # Normalize across states so beta represents probability of each word given the state
    for i in range(k):
        beta[:,i] = beta[:,i]/sum(beta[:,i])
    
    ## Optimize alpha
    nr_max_iters = 1000
    tol = 10**-4
    for it in range(nr_max_iters):
        alpha_old = alpha
        
        #  Calculate gradient 
        g = M*(digamma(np.sum(alpha))-digamma(alpha)) + np.sum(digamma(gamma)-np.tile(digamma(np.sum(gamma,axis=1)),(k,1)).T,axis=0)
        #  Calculate Hessian diagonal component
        h = -M*polygamma(1,alpha) 
        #  Calculate Hessian constant component
        z = polygamma(1,np.sum(alpha))
        #  Calculate constant
        c = np.sum(g/h)/(z**(-1)+np.sum(h**(-1)))

        #  Update alpha
        alpha = alpha - (g-c)/h
        
        #  Check convergence
        if sqrt(mean(square(alpha-alpha_old)))<tol:
            break

    return alpha,beta

In [98]:
estModParams(alpha,beta,phi,gamma,w_struct)

(array([ 2.25707885,  3.18005535,  3.15743801,  2.38338091,  2.76962137,
         2.87445828,  2.46930408,  4.90355497,  2.56103432,  2.7073288 ]),
 array([[ 0.0233352 ,  0.0215924 ,  0.02162572,  0.02304118,  0.02226148,
          0.02207672,  0.02285293,  0.01984268,  0.02266176,  0.02237629],
        [ 0.04272096,  0.04433923,  0.04430993,  0.04300445,  0.04373715,
          0.04390614,  0.04318402,  0.0457432 ,  0.04336471,  0.04363119],
        [ 0.03244107,  0.03309246,  0.03307874,  0.0325427 ,  0.03282741,
          0.03289851,  0.03260933,  0.03391304,  0.03267832,  0.03278398],
        [ 0.05354856,  0.05530391,  0.05526844,  0.05383252,  0.05460764,
          0.05479673,  0.05401657,  0.05732513,  0.0542054 ,  0.05449128],
        [ 0.04217531,  0.04439841,  0.04435275,  0.04253031,  0.04350796,
          0.04374855,  0.04276127,  0.04706261,  0.04299897,  0.04336032],
        [ 0.01187964,  0.01074244,  0.01076451,  0.01168992,  0.01118292,
          0.01106183,  0.01156805

### Expectation Maximization

#### Convergence Criterion
The variational inference parameter $\gamma$ contains the topic likelihoods of every document and is thus what is of interest here.

Calculate root-mean-square of the change in $\gamma$

In [101]:
def converged(gamma,gamma_old,convergence):
    print(sqrt(mean(square(gamma-gamma_old))))
    return sqrt(mean(square(gamma-gamma_old))) < convergence

#### Inference by iterative EM
Continue until convergence criterion above met

In [102]:
convergence = 10**(-2)
successfully_Converged = False
max_iters = 10**2

for iters in range(max_iters):
    print(iters)
    gamma_old = gamma
    phi,gamma  = optVarParams(alpha,beta,phi,gamma,w_struct)
    alpha,beta = estModParams(alpha,beta,phi,gamma,w_struct)
    if converged(gamma,gamma_old,convergence):
        successfully_Converged = True
        break

0
0.247705244483
1
3.01249373862
2
3.06190546629
3
3.18881537998
4
3.24078008219
5
3.24886560352
6




3.25166207208
7
3.25584855393
8
3.25898677158
9
3.26065825344
10
3.26222634823
11
3.26357732667
12
3.26453089327
13
3.26525158872
14
3.26584908579
15
3.26625393827
16
3.26657199405
17
3.26682294727
18
3.26702481661
19
3.26708908644
20
3.26712175047
21
3.26712978605
22
3.26711878969
23
3.2669942076
24
3.26685831193
25
3.26671417239
26
3.26656423165
27
3.26641040869
28
3.2661550345
29
3.2659982441
30
3.26574243268
31
3.26548791609
32
3.26523553301
33
3.26498594735
34
3.2646403721
35
3.26439819191
36
3.264060836
37
3.26382755816
38
3.26349986085
39
3.26317735298
40
3.26286024195
41
3.26244931058
42
3.2621436274
43
3.26184370213
44
3.26104827389
45
3.25874551124
46
3.25627538444
47
3.25363770769
48
3.25083279802
49
3.24786144389
50
3.24472487245
51
3.24142471661
52
3.237962981
53
3.23434200924
54
3.23056445101
55
3.22663323032
56
3.22255151485
57
3.21832268658
58
3.21395031383
59
3.2094381247
60
3.20478998212
61
3.20000986048
62
3.19510182402
63
3.19007000627
64
3.18491859161
65
3.17965179

In [103]:
alpha

array([ 183.04369631,  334.72173094,  330.92564562,  203.31313769,
        266.36569597,  283.71081609,  217.20980025,  630.34667355,
        232.13288762,  256.10185082])

In [104]:
beta

array([[ 0.02197802,  0.02197802,  0.02197802,  0.02197802,  0.02197802,
         0.02197802,  0.02197802,  0.02197802,  0.02197802,  0.02197802],
       [ 0.04395604,  0.04395604,  0.04395604,  0.04395604,  0.04395604,
         0.04395604,  0.04395604,  0.04395604,  0.04395604,  0.04395604],
       [ 0.03296703,  0.03296703,  0.03296703,  0.03296703,  0.03296703,
         0.03296703,  0.03296703,  0.03296703,  0.03296703,  0.03296703],
       [ 0.05494506,  0.05494505,  0.05494505,  0.05494505,  0.05494505,
         0.05494505,  0.05494505,  0.05494505,  0.05494505,  0.05494505],
       [ 0.04395604,  0.04395604,  0.04395604,  0.04395604,  0.04395604,
         0.04395604,  0.04395604,  0.04395604,  0.04395604,  0.04395604],
       [ 0.01098901,  0.01098901,  0.01098901,  0.01098901,  0.01098901,
         0.01098901,  0.01098901,  0.01098901,  0.01098901,  0.01098901],
       [ 0.06593407,  0.06593407,  0.06593407,  0.06593407,  0.06593407,
         0.06593407,  0.06593407,  0.06593407

### Tests 
Testing out syntax and array dimensions

In [11]:
# Word #11 in document 2 (w_dn)
w[1][10]

3

In [28]:
[doc == 3 for doc in w]

[array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False], dtype=bool),
 array([False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
         True, False, False, False, False,  True, False, False, False, False], dtype=bool),
 array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False,  True,
        False, False, False, False], dtype=bool)]

In [29]:
[doc == 3 for doc in w]*w

array([ array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0]),
       array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       3, 0, 0, 0, 0]),
       array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0])], dtype=object)

In [11]:
np.sum(np.array(list(map(lambda x: np.sum(x,axis=0),phi))),axis=0)

array([ 9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1,  9.1])