In [1]:
import os
import time
import pandas as pd
import numpy as np
import numpy.random as npr
import copy
import re
import nltk
import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from tqdm import tqdm
from scipy.special import digamma, loggamma
from scipy.sparse import csr_matrix

In [2]:
def log_sum_exp(vec):
    vec_max = np.max(vec, axis=0)
    exp_vec = np.exp(vec - vec_max)
    sum_exp_vec = np.sum(exp_vec)
    log_sum_exp = np.log(sum_exp_vec) + vec_max
    return log_sum_exp

def init_variational_params(documents, K, rs_int=npr.randint(low=0, high=100)):
    rs = npr.RandomState(rs_int)
    N, V = documents.shape
    LAMBDA = rs.uniform(low=0.1, high=1.0, size=(K, V))
    GAMMA = rs.uniform(low=0.1, high=1.0, size=(N, K))
    PHI = []
    for document in documents:
        M = np.sum((document > 0).astype("int32"))
        document_PHI = np.ones((M, K))
        document_PHI = document_PHI / K
        PHI.append(document_PHI)
        
    return LAMBDA, GAMMA, PHI

def compute_ELBO(LAMBDA, GAMMA, PHI, documents, nonzero_idxs, K):
    ELBO = 0
    N, _ = documents.shape

    E_log_p_BETA = np.sum((ETA-1) * (digamma(LAMBDA) - digamma(np.sum(LAMBDA, axis=1, keepdims=True))))
    ELBO += E_log_p_BETA

    E_log_p_THETA = np.sum((ALPHA-1) * (digamma(GAMMA) - digamma(np.sum(GAMMA, axis=1, keepdims=True))))
    ELBO += E_log_p_THETA

    E_log_p_x_z = 0
    for i in range(N):
        document = documents[i]
        nonzero_idx = nonzero_idxs[i]
        word_idx = 0
        for idx in nonzero_idx:
            E_log_p_x_z += np.sum(PHI[i][word_idx] * (digamma(GAMMA[i])-digamma(np.sum(GAMMA[i])))) \
                + np.sum(PHI[i][word_idx] * (digamma(LAMBDA[:, idx])-digamma(np.sum(LAMBDA, axis=1))))
            word_idx += 1
    ELBO += E_log_p_x_z

    E_log_q_BETA = np.sum(-loggamma(np.sum(LAMBDA, axis=1)) + np.sum(loggamma(LAMBDA), axis=1) \
        - np.sum((LAMBDA - 1) * (digamma(LAMBDA) - digamma(np.sum(LAMBDA, axis=1, keepdims=True))), axis=1))
    ELBO += E_log_q_BETA

    E_log_q_THETA = np.sum(-loggamma(np.sum(GAMMA, axis=1)) + np.sum(loggamma(GAMMA), axis=1) \
        - np.sum((GAMMA - 1) * (digamma(GAMMA) - digamma(np.sum(GAMMA, axis=1, keepdims=True))), axis=1))
    ELBO += E_log_q_THETA

    E_log_q_z = 0
    for i in range(N):
        document = documents[i]
        nonzero_idx = nonzero_idxs[i]
        word_idx = 0
        for idx in nonzero_idx:
            E_log_q_z += -np.sum(PHI[i][word_idx] * np.log(PHI[i][word_idx]))
            word_idx += 1
    ELBO += E_log_q_z

    return ELBO

In [None]:
def simulate_LDA(N, Ms, K, V, ETA, ALPHA, rs_int=np.random.randint(low=0, high=100)):
    rs = npr.RandomState(rs_int) 
    BETA = rs.dirichlet(np.full(V, ETA), size=K)
    THETA = rs.dirichlet(np.full(K, ALPHA), size=N)
    
    row_idxs = []
    col_idxs = []
    values = []
    nonzero_idxs = []

    for i in range(N):
        doc_word_counts = np.zeros(V)
        for _ in range(Ms[i]):
            z_ij = rs.choice(K, p=THETA[i])
            x_ij = rs.choice(V, p=BETA[z_ij])
            doc_word_counts[x_ij] += 1
        doc_nonzero = np.nonzero(doc_word_counts)[0]
        doc_nonzero = np.array(sorted(doc_nonzero))
        nonzero_idxs.append(doc_nonzero)

        row_idxs.extend([i] * len(doc_nonzero))
        col_idxs.extend(doc_nonzero)
        values.extend(doc_word_counts[doc_nonzero])
    documents = csr_matrix((values, (row_idxs, col_idxs)), shape=(N, V)).toarray()
    
    return documents, nonzero_idxs

In [7]:
N = 100
Ms = npr.poisson(200, size=N)
K = 10
V = 1000
ETA = 100 / V
ALPHA = 1 / K
documents, nonzero_idxs, BETA, THETA = simulate_LDA(N, Ms, K, V, ETA, ALPHA)
LAMBDA, GAMMA, PHI = init_variational_params(documents, K)

In [4]:
N = 200
Ms = npr.poisson(70, size=N)
K = 10
V = 500
ETA = 0.01
ALPHA = 0.1
documents, nonzero_idxs, BETA, THETA = simulate_LDA(N, Ms, K, V, ETA, ALPHA)
LAMBDA, GAMMA, PHI = init_variational_params(documents, K)

In [5]:
start = time.time()
ELBOs = []
prev_ELBO = -np.inf
curr_ELBO = compute_ELBO(LAMBDA, GAMMA, PHI, documents, nonzero_idxs, K)
ELBOs.append(curr_ELBO)
print(f"Initial ELBO: {ELBOs[0]}\n")

max_iterations = 200
tol = 10e-4
LAMBDA_t = copy.deepcopy(LAMBDA)
GAMMA_t = copy.deepcopy(GAMMA)
PHI_t = copy.deepcopy(PHI)

for t in range(max_iterations):
    print(f"Iteration {t+1}")
    for i in tqdm(range(N), desc="Updating PHI and GAMMA"):
        document = documents[i]
        nonzero_idx = nonzero_idxs[i]
        GAMMA_i_t = copy.deepcopy(GAMMA_t[i])
        word_idx = 0
        for idx in nonzero_idx:
            log_PHI_ij = np.zeros((K,))
            for k in range(K):
                LAMBDA_k_t = copy.deepcopy(LAMBDA_t[k])
                exp_propto = digamma(GAMMA_i_t[k]) - digamma(np.sum(GAMMA_i_t)) + digamma(LAMBDA_k_t[idx]) - digamma(np.sum(LAMBDA_k_t))
                log_PHI_ij[k] = exp_propto
            PHI_ij = np.exp(log_PHI_ij - log_sum_exp(log_PHI_ij))
            PHI_t[i][word_idx] = PHI_ij
            word_idx += 1
        GAMMA_i_t = np.zeros((K,)) + ALPHA
        for k in range(K):
            GAMMA_i_t[k] += np.sum(document[nonzero_idx] * PHI_t[i][:, k])
        GAMMA_t[i] = GAMMA_i_t

    for k in tqdm(range(K), desc="Updating LAMBDA"):
        LAMBDA_k_t = np.zeros((V,)) + ETA
        for i in range(N):
            document = documents[i]
            nonzero_idx = nonzero_idxs[i]
            word_idx = 0
            for idx in nonzero_idx:
                LAMBDA_k_t[idx] += document[idx] * PHI_t[i][word_idx][k]
                word_idx += 1
            LAMBDA_t[k] = LAMBDA_k_t

    prev_ELBO = curr_ELBO
    curr_ELBO = compute_ELBO(LAMBDA_t, GAMMA_t, PHI_t, documents, nonzero_idxs, K)
    ELBOs.append(curr_ELBO)
    print(f"Current ELBO: {curr_ELBO} | Change in ELBO: {curr_ELBO - prev_ELBO}\n")

    if abs(curr_ELBO - prev_ELBO) < tol:
        break
stop = time.time()

LAMBDA_final = copy.deepcopy(LAMBDA_t)
GAMMA_final = copy.deepcopy(GAMMA_t)
PHI_final = copy.deepcopy(PHI_t)

plt.ticklabel_format(style="sci", axis="y", scilimits=(0, 0))
plt.plot(np.linspace(0, stop-start, len(ELBOs)), ELBOs)

Initial ELBO: -32590.729059065692

Iteration 1


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 248.68it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 260.86it/s]


Current ELBO: -7140.2651000656 | Change in ELBO: 25450.46395900009

Iteration 2


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 289.72it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 269.29it/s]


Current ELBO: -5742.308122686419 | Change in ELBO: 1397.956977379181

Iteration 3


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 282.91it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 219.78it/s]


Current ELBO: -4743.417296965547 | Change in ELBO: 998.8908257208714

Iteration 4


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 290.13it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 266.02it/s]


Current ELBO: -3825.3973511578106 | Change in ELBO: 918.0199458077368

Iteration 5


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 265.81it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 258.29it/s]


Current ELBO: -2900.892772315844 | Change in ELBO: 924.5045788419666

Iteration 6


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 262.60it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 252.65it/s]


Current ELBO: -2054.051090913002 | Change in ELBO: 846.8416814028419

Iteration 7


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 279.20it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 275.49it/s]


Current ELBO: -1361.9250994081908 | Change in ELBO: 692.1259915048113

Iteration 8


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 279.10it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 260.25it/s]


Current ELBO: -820.5641735205722 | Change in ELBO: 541.3609258876186

Iteration 9


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 286.23it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 266.68it/s]


Current ELBO: -428.4176429590011 | Change in ELBO: 392.1465305615711

Iteration 10


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 271.29it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 238.94it/s]


Current ELBO: -150.50691860568168 | Change in ELBO: 277.91072435331944

Iteration 11


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 238.13it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 238.17it/s]


Current ELBO: 73.61674085422169 | Change in ELBO: 224.12365945990337

Iteration 12


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 275.75it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 263.35it/s]


Current ELBO: 212.39943879995099 | Change in ELBO: 138.7826979457293

Iteration 13


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 277.24it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 267.93it/s]


Current ELBO: 305.0959639212392 | Change in ELBO: 92.69652512128823

Iteration 14


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 243.29it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 242.57it/s]


Current ELBO: 373.8950341870384 | Change in ELBO: 68.79907026579917

Iteration 15


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 268.67it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 225.28it/s]


Current ELBO: 433.28971583371344 | Change in ELBO: 59.39468164667505

Iteration 16


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 253.08it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 245.97it/s]


Current ELBO: 482.14777340335263 | Change in ELBO: 48.858057569639186

Iteration 17


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 266.88it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 222.37it/s]


Current ELBO: 519.8800906806068 | Change in ELBO: 37.7323172772542

Iteration 18


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 256.00it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 219.01it/s]


Current ELBO: 544.9845047180374 | Change in ELBO: 25.104414037430615

Iteration 19


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 254.97it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 239.25it/s]


Current ELBO: 563.6592112001101 | Change in ELBO: 18.67470648207268

Iteration 20


Updating PHI and GAMMA: 100%|██████████| 200/200 [00:00<00:00, 244.65it/s]
Updating LAMBDA: 100%|██████████| 10/10 [00:00<00:00, 263.07it/s]


KeyboardInterrupt: 

In [23]:
rs = npr.RandomState(0)
K, V, N = 10, 300, 30
eta0, alpha0 = 0.1, (50 / K)
Ms = rs.poisson(60, size=N)
documents, nonzero_idxs, BETA, THETA = simulate_LDA(N, Ms, K, V, eta0, alpha0, 0)
lambd, gamma, phi = init_variational_params(documents, K, 0)
compute_ELBO(lambd, gamma, phi, documents, nonzero_idxs, K)

np.float64(-9416.708794025422)

In [12]:
LAMBDA_final

array([[0.10002374, 0.10000619, 0.10012869, ..., 0.10005261, 2.67731004,
        0.10004352],
       [0.10000678, 0.10000381, 0.10004467, ..., 0.10001745, 0.10002651,
        0.10001298],
       [0.10006586, 0.10001275, 0.10004989, ..., 0.10002568, 0.10000259,
        0.10001517],
       ...,
       [5.09978464, 0.10000621, 0.10007385, ..., 0.10004511, 0.10002148,
        0.10000869],
       [0.10003886, 0.10001988, 2.09953683, ..., 7.49199056, 0.10002397,
        0.10001101],
       [0.10002628, 0.1000771 , 0.10003458, ..., 0.10001568, 0.100024  ,
        0.1000269 ]])