In [27]:
import os
import time
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from tqdm import tqdm
from scipy.special import digamma, loggamma

assert os.path.exists("/Users/daany/Downloads/HT/LDA/ap.txt") and os.path.exists("/Users/daany/Downloads/HT/LDA/vocab.txt")

with open("vocab.txt", "r") as f:
    vocab = set(f.read().splitlines())

with open("ap.txt", "r") as f:
    raw_text = f.read()

texts = re.findall(r"<TEXT>(.*?)</TEXT>", raw_text, re.DOTALL)
stop_words = set(stopwords.words("english"))
documents = []

for text in texts:
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words and word in vocab]
    documents.append(tokens)

N = len(documents)
V = len(vocab)

doc_term_matrix = np.zeros((N, V))
vocab_to_index = {word: i for i, word in enumerate(vocab)}

for doc_idx, tokens in enumerate(documents):
    token_counts = Counter(tokens)
    for token, count in token_counts.items():
        if token in vocab_to_index:
            term_idx = vocab_to_index[token]
            doc_term_matrix[doc_idx, term_idx] = count

index_to_vocab = dict((val, key) for key, val in vocab_to_index.items())

nonzero_idxs = []
for doc in doc_term_matrix:
    nonzero_idx = np.where(doc > 0)[0]
    nonzero_idxs.append(sorted(nonzero_idx))

def init_variational_params(doc_term_matrix, K):
    N, V = doc_term_matrix.shape # N is number of documents
    LAMBDA = np.random.uniform(low=0.01, high=1.0, size=(K, V))
    GAMMA = np.ones((N, K))
    PHI = []
    for doc in doc_term_matrix:
        num_words_per_doc = np.sum((doc > 0).astype("int32"))
        doc_PHI = np.ones((num_words_per_doc, K))
        doc_PHI = doc_PHI / K
        PHI.append(doc_PHI)

    return LAMBDA, GAMMA, PHI

def compute_ELBO(LAMBDA, GAMMA, PHI, doc_term_matrix, nonzero_idxs, K):
    N, V = doc_term_matrix.shape
    ELBO = 0

    E_log_p_BETA = 0
    for k in range(K):
        E_log_p_BETA += (ETA - 1) * np.sum(digamma(LAMBDA[k]) - digamma(np.sum(LAMBDA[k])))
    ELBO += E_log_p_BETA

    E_log_p_THETA = 0
    for i in range(N):
        E_log_p_THETA += (ALPHA - 1) * np.sum(digamma(GAMMA[i]) - digamma(np.sum(GAMMA[i])))
    ELBO += E_log_p_THETA

    E_q_log_p_z_x = 0
    for i in range(N):
        doc = doc_term_matrix[i]
        nonzero_idx = nonzero_idxs[i]
        corr_idx = 0
        for idx in nonzero_idx:
            E_q_log_p_z_x += doc[idx] * np.sum(PHI[i][corr_idx] * (digamma(GAMMA[i]) - digamma(np.sum(GAMMA[i])))) \
                + doc[idx] * np.sum(PHI[i][corr_idx] * (digamma(LAMBDA[:, idx]) - digamma(np.sum(LAMBDA, axis=1))))
            corr_idx += 1
    ELBO += E_q_log_p_z_x

    E_log_q_BETA = 0
    for k in range(K):
        E_log_q_BETA += -loggamma(np.sum(LAMBDA[k])) + np.sum(loggamma(LAMBDA[k])) \
            - np.sum((LAMBDA[k] - 1) * (digamma(LAMBDA[k]) - digamma(np.sum(LAMBDA[k]))))
    ELBO += E_log_q_BETA

    E_log_q_THETA = 0
    for i in range(N):
        E_log_q_THETA += -loggamma(np.sum(GAMMA[i])) + np.sum(loggamma(GAMMA[i])) \
            - np.sum((GAMMA[i] - 1) * (digamma(GAMMA[i]) - digamma(np.sum(GAMMA[i]))))
    ELBO += E_log_q_THETA

    E_q_log_z = 0
    for i in range(N):
        doc = doc_term_matrix[i]
        nonzero_idx = nonzero_idxs[i]
        corr_idx = 0
        for idx in nonzero_idx:
            E_q_log_z += -doc[idx] * np.sum(PHI[i][corr_idx] * np.log(PHI[i][corr_idx]))
            corr_idx += 1
    ELBO += E_q_log_z

    return ELBO

def log_sum_exp(vec):
    a = np.max(vec, axis=0)
    log_sum_exp = np.log(np.sum(np.exp(vec - a))) + a

    return log_sum_exp

def update_variational_params(LAMBDA, GAMMA, PHI, doc_term_matrix, nonzero_idxs, K):
    N, V = doc_term_matrix.shape

    print("Updating PHI and GAMMA")
    for i in tqdm(range(N)):
        doc = doc_term_matrix[i]
        nonzero_idx = nonzero_idxs[i]
        corr_idx = 0
        for idx in nonzero_idx:
            log_PHI_ij = np.zeros((K, ))
            for k in range(K):
                exponent = digamma(GAMMA[i][k]) - digamma(np.sum(GAMMA[i])) \
                    + digamma(LAMBDA[k][idx]) - digamma(np.sum(LAMBDA[k]))
                log_PHI_ij[k] = exponent
            PHI_ij = np.exp(log_PHI_ij - log_sum_exp(log_PHI_ij))
            PHI[i][corr_idx] = PHI_ij
            corr_idx += 1

        GAMMA_i = np.zeros((K, )) + ALPHA
        for k in range(K):
            GAMMA_i[k] += np.sum(doc[nonzero_idx] * PHI[i][:, k])
        GAMMA[i] = GAMMA_i

    print("Updating LAMBDA")
    for k in tqdm(range(K)):
        LAMBDA_k = np.zeros((V, )) + ETA
        for i in range(N):
            doc = doc_term_matrix[i]
            nonzero_idx = nonzero_idxs[i]
            corr_idx = 0
            for idx in nonzero_idx:
                LAMBDA_k[idx] += doc[idx] * PHI[i][corr_idx][k]
                corr_idx += 1
        LAMBDA[k] = LAMBDA_k

    return LAMBDA, GAMMA, PHI

In [28]:
doc_term_matrix = doc_term_matrix[0:1000]
ETA = 1 / V
ALPHA = 0.1
K = 30 
tol = 30
iteration = 1
curr_ELBO = 0
prev_ELBO = 100
ELBOs = []

start = time.time()
LAMBDA, GAMMA, PHI = init_variational_params(doc_term_matrix, K)
ELBOs.append(compute_ELBO(LAMBDA, GAMMA, PHI, doc_term_matrix, nonzero_idxs, K))
while np.abs(curr_ELBO - prev_ELBO) > tol:
    print(f"Iteration: {iteration}")
    LAMBDA, GAMMA, PHI = update_variational_params(LAMBDA, GAMMA, PHI, doc_term_matrix, nonzero_idxs, K)
    prev_ELBO = curr_ELBO
    print("Computing the ELBO for current variational parameters")
    curr_ELBO = compute_ELBO(LAMBDA, GAMMA, PHI, doc_term_matrix, nonzero_idxs, K)
    ELBOs.append(curr_ELBO)
    iteration += 1
    print("\n")
end = time.time

Iteration: 1
Updating PHI and GAMMA


100%|██████████| 1000/1000 [01:36<00:00, 10.34it/s]


Updating LAMBDA


100%|██████████| 30/30 [00:03<00:00,  9.07it/s]


Computing the ELBO for current variational parameters


Iteration: 2
Updating PHI and GAMMA


100%|██████████| 1000/1000 [01:22<00:00, 12.14it/s]


Updating LAMBDA


100%|██████████| 30/30 [00:03<00:00,  9.55it/s]


Computing the ELBO for current variational parameters


  E_q_log_z += -doc[idx] * np.sum(PHI[i][corr_idx] * np.log(PHI[i][corr_idx]))
  E_q_log_z += -doc[idx] * np.sum(PHI[i][corr_idx] * np.log(PHI[i][corr_idx]))




