<a href="https://colab.research.google.com/github/chasubeen/ML_lab/blob/main/2_Variational_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import copy
import os
import re
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from scipy.special import digamma, loggamma, polygamma, xlogy

# **1. Preprocessing**

## **1-1. Load data**

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data paths: train / test text files stored on the mounted Google Drive.

train_path = "/content/drive/MyDrive/ML_lab/2_LDA/data/pos_tag/train.txt"
test_path = "/content/drive/MyDrive/ML_lab/2_LDA/data/pos_tag/test.txt"

In [None]:
def read_file(path):
  """Read a one-token-per-line text file and split it into documents.

  Every non-blank line contributes its first space-separated field,
  lower-cased, to the current document. A blank (whitespace-only) line
  closes the current document and starts a new one.

  Args:
    path: Path of the text file to read.

  Returns:
    A list of documents, each a list of lower-cased words. The document that
    is still open when the input ends is always appended, so the last entry
    is an empty list whenever the text ends with a blank line or a trailing
    newline.
  """
  # The context manager closes the handle even if reading raises.
  with open(path, 'r') as f:
    raw = f.read().split('\n') # one entry per line

  data, doc = list(), list()

  for line in raw:
    if line.strip(): # non-blank line: one token of the current document
      word = line.split(' ')[0].lower() # first field of the line, lower-cased
      doc.append(word)
    else:
      data.append(doc) # blank line marks EOD (End Of Document)
      doc = list()
  data.append(doc) # append whatever document is still open at end of input

  return data

In [None]:
# Load the data
train_data = read_file(train_path)
test_data = read_file(test_path)

# Drop the last document of each split. read_file always appends the document
# still open at end of input; presumably that is an empty trailing document
# here (verify that the files end with a blank line / newline).
del train_data[-1], test_data[-1]

## **1-2. Preprocess Data**

In [None]:
# Tokens to discard after cleaning: the empty string (what a token made only of
# non-letters collapses to) and a single space.
stopwords = [""," "]

In [None]:
## remove numbers, special characters, etc.
# only consider words with alphabet

def only_alphabet(corpus,stopword_list = stopwords):
  """Strip every non a-z character from each word and drop stopwords.

  Args:
    corpus: Iterable of documents, each an iterable of word strings.
    stopword_list: Cleaned words to discard. Defaults to the module-level
      `stopwords` list.

  Returns:
    A list with one list of cleaned words per input document. Documents that
    end up empty are kept as empty lists.
  """
  data = list()
  for doc in corpus:
    temp = list()
    for word in doc:
      word = re.sub('[^a-z]', '', word) # remove non-alphabet characters via regex
      # Honour the caller-supplied list; the parameter used to be ignored in
      # favour of the global `stopwords`.
      if word not in stopword_list:
        temp.append(word)
    data.append(temp)

  return data

In [None]:
# Clean both splits with only_alphabet (default stopword list).
train_data = only_alphabet(train_data)
test_data = only_alphabet(test_data)

In [None]:
## count word occurrence in corpus

def count_vocab(corpus):
  """Count how many times each word occurs across the whole corpus.

  Args:
    corpus: Iterable of documents, each an iterable of words.

  Returns:
    A defaultdict(int) mapping word -> occurrence count, with keys in
    first-seen order.
  """
  occurrences = defaultdict(int)
  # Flatten the corpus lazily and tally token by token.
  for token in (word for tokens in corpus for word in tokens):
    occurrences[token] = occurrences[token] + 1
  return occurrences

In [None]:
# Word frequencies are counted on the training split only.
vocab_count = count_vocab(train_data)

In [None]:
## Keep only the words whose frequency exceeds a threshold
# (sparse, rarely occurring words are ignored)

def vocab_top(vocab,cnt):
  """Select the words that occur strictly more than `cnt` times.

  Args:
    vocab: Mapping word -> occurrence count.
    cnt: Threshold; a word is kept only if its count is greater than this.

  Returns:
    A defaultdict(int) holding the surviving word -> count pairs, in the
    iteration order of `vocab`.
  """
  frequent = defaultdict(int)
  frequent.update((word, freq) for word, freq in vocab.items() if freq > cnt)
  return frequent

In [None]:
# Keep only the words that occur more than 5 times in the training split.
vocab_count = vocab_top(vocab_count,cnt = 5)

In [None]:
## filter out low-occurrence words

def filter_vocab(corpus,vocab):
  """Remove out-of-vocabulary words, then drop documents left empty.

  Args:
    corpus: Iterable of documents, each an iterable of words.
    vocab: Mapping whose keys are the words to keep.

  Returns:
    A list of the filtered documents that still contain at least one word.
  """
  known_words = vocab.keys()
  kept_docs = []
  for tokens in corpus:
    survivors = [token for token in tokens if token in known_words]
    if survivors: # documents with nothing left are discarded
      kept_docs.append(survivors)
  return kept_docs

In [None]:
# Restrict both splits to the training vocabulary; documents left empty are dropped.
train_data = filter_vocab(train_data,vocab_count)
test_data = filter_vocab(test_data,vocab_count)

In [None]:
## construct voca-index-matching dictionary

def voca_index(vocab):
  """Build the two lookup tables between words and integer ids.

  Ids are assigned 0, 1, 2, ... in the iteration order of `vocab.keys()`.

  Args:
    vocab: Mapping whose keys are the vocabulary words.

  Returns:
    A tuple (vocab_to_index, index_to_vocab) of plain dicts.
  """
  index_to_vocab = dict(enumerate(vocab.keys()))
  vocab_to_index = {word: idx for idx, word in index_to_vocab.items()}
  return vocab_to_index, index_to_vocab

In [None]:
# v_t_i: word -> index, i_t_v: index -> word (built from the filtered training vocabulary).
v_t_i, i_t_v = voca_index(vocab_count)

In [None]:
## convert corpus-with-words to corpus-with-index

def corpus_to_index(corpus,vocab_to_index):
  """Replace every word of every document by its integer id.

  Args:
    corpus: Iterable of documents, each an iterable of words.
    vocab_to_index: Mapping word -> integer id; a word missing from it
      raises KeyError.

  Returns:
    A list of documents, each a list of integer ids in the original order.
  """
  return [[vocab_to_index[token] for token in tokens] for tokens in corpus]

In [None]:
# Convert both splits from words to vocabulary indices.
train_data_idx = corpus_to_index(train_data,v_t_i)
test_data_idx = corpus_to_index(test_data,v_t_i)

# **2. Modeling**

In [None]:
class LDA_VI:
  """Latent Dirichlet Allocation fitted with variational EM.

  Uses the mean-field scheme of Blei, Ng & Jordan (2003). The E-step updates
  the per-document variational parameters `gamma` (Dirichlet over topics) and
  `phi` (per-token topic responsibilities); the M-step updates the topic-word
  matrix `beta` and the Dirichlet prior `alpha`.

  Main attributes:
    alpha: (num_topic,) Dirichlet prior over topics.
    beta: (num_topic, num_vocab) topic-word probabilities, rows sum to 1.
    gamma: (num_docs, num_topic) variational Dirichlet parameters.
    phi: (num_docs, max_doc_len, num_topic) token responsibilities; only the
      first len(docs[d]) rows of phi[d] are meaningful.
    elbo_history / perplexity_history: one value per iteration of `run`.
  """

  def __init__(self,docs, num_topic=10, vocab = None, alpha=1.,num_iter=100,lr=1e-3, seed=None):
    """Set up the model parameters.

    Args:
      docs: List of documents, each a list of integer word ids.
      num_topic: Number of topics K.
      vocab: Sized collection whose length is the vocabulary size. When None,
        the size is inferred as (largest word id in `docs`) + 1.
      alpha: Accepted for interface compatibility; the prior is always
        initialised to 1 / num_topic regardless of this value.
      num_iter: Number of EM iterations performed by `run`.
      lr: Stored on the instance but not used by any update.
      seed: Optional seed for the random perturbation of `beta`. When None
        the global NumPy random state is used.
    """
    self.docs = docs
    self.num_topic = num_topic
    self.vocab = vocab
    if vocab is None:
      self.num_vocab = 1 + max((w for doc in docs for w in doc), default=-1)
    else:
      self.num_vocab = len(self.vocab)
    self.num_docs = len(self.docs)

    # Word ids of each document as integer arrays, for vectorised indexing.
    self._word_ids = [np.asarray(doc, dtype=np.intp) for doc in self.docs]
    max_doc_len = max((len(words) for words in self._word_ids), default=0)

    # Initialize alpha and beta
    self.alpha = np.ones(self.num_topic)*(1/self.num_topic)
    # beta starts uniform plus a small random perturbation to break the
    # symmetry between topics. Drawing the whole matrix at once avoids
    # assigning size-1 arrays to scalar slots.
    rng = np.random if seed is None else np.random.RandomState(seed)
    self.beta = np.ones((self.num_topic, self.num_vocab))*(1/self.num_vocab)
    self.beta += rng.rand(self.num_topic, self.num_vocab) * 0.01
    # Normalise each row by its total computed once; dividing element by
    # element with a running sum does not yield a distribution.
    self.beta /= self.beta.sum(axis=1, keepdims=True)

    # Initialize gamma and phi
    self.gamma = np.ones((self.num_docs,self.num_topic))*(1/self.num_topic)
    # phi is indexed by token position, so its second axis must cover the
    # longest document, not the vocabulary.
    self.phi = np.ones((self.num_docs, max_doc_len, self.num_topic))*(1/self.num_topic)

    # Initialize z (topic assignments for words in documents); kept as
    # placeholders, the variational updates never fill it.
    self.z = [[None] * len(doc) for doc in self.docs]

    # Initialize parameters for variational inference
    self.num_iter = num_iter
    self.lr = lr

    # Per-iteration diagnostics recorded by run().
    self.elbo_history = []
    self.perplexity_history = []

  def update_gamma(self):
    """E-step: gamma[d] = alpha + sum over the tokens of d of phi[d, n]."""
    for d, words in enumerate(self._word_ids):
      self.gamma[d] = self.alpha + self.phi[d, :len(words)].sum(axis=0)

  def update_phi(self):
    """E-step: phi[d, n, k] proportional to beta[k, w_dn] * exp(digamma(gamma[d, k]))."""
    for d, words in enumerate(self._word_ids):
      if words.size == 0:
        continue
      expected_log_theta = digamma(self.gamma[d])
      # A constant shift cancels in the normalisation and prevents every
      # topic weight from underflowing to zero at once.
      expected_log_theta = expected_log_theta - expected_log_theta.max()
      weights = self.beta[:, words].T * np.exp(expected_log_theta) # (N_d, K)
      totals = weights.sum(axis=1, keepdims=True)
      positive = totals > 0
      # Normalise each token by its own total; a token whose weights are all
      # zero falls back to the uniform distribution instead of producing NaN.
      self.phi[d, :len(words)] = np.where(
        positive, weights / np.where(positive, totals, 1.0), 1.0 / self.num_topic)

  def update_beta(self):
    """M-step: beta[k, v] proportional to the sum of phi[d, n, k] over tokens with w_dn == v.

    The statistics are recomputed from scratch on every call (they are not
    added to the previous beta) and each row is normalised by its total.
    """
    counts_vk = np.zeros((self.num_vocab, self.num_topic))
    for d, words in enumerate(self._word_ids):
      # Unbuffered add so that repeated words within a document accumulate.
      np.add.at(counts_vk, words, self.phi[d, :len(words)])
    counts = counts_vk.T # (K, V)
    totals = counts.sum(axis=1, keepdims=True)
    # Topics that received no mass (e.g. empty corpus) keep their previous row.
    has_mass = totals[:, 0] > 0
    self.beta[has_mass] = counts[has_mass] / totals[has_mass]

  def update_alpha(self):
    """M-step: one Newton-Raphson step on alpha, kept strictly positive."""
    M = self.num_docs
    if M == 0:
      return # no documents: gradient and Hessian are identically zero

    alpha_sum = np.sum(self.alpha)
    expected_log_theta = digamma(self.gamma) - digamma(self.gamma.sum(axis=1, keepdims=True))

    # gradient
    g = M * (digamma(alpha_sum) - digamma(self.alpha)) + expected_log_theta.sum(axis=0)
    # Hessian: M * trigamma(sum(alpha)) everywhere, minus M * trigamma(alpha_i) on the diagonal
    H = np.full((self.num_topic, self.num_topic), float(M * polygamma(1, alpha_sum)))
    H[np.diag_indices(self.num_topic)] -= M * polygamma(1, self.alpha)

    deltaAlpha = np.linalg.solve(H, g)
    if not np.all(np.isfinite(deltaAlpha)):
      return

    # Halve the step until alpha stays positive; a non-positive alpha makes
    # digamma / loggamma undefined in the following updates.
    step = 1.0
    for _ in range(50):
      if np.all(self.alpha - step * deltaAlpha > 0):
        self.alpha -= step * deltaAlpha
        return
      step *= 0.5

  def e_step(self):
    """Perform the E-step of the variational inference."""
    self.update_phi()
    self.update_gamma()

  def m_step(self):
    """Perform the M-step of the variational inference."""
    self.update_beta()
    self.update_alpha()

  def compute_elbo(self):
    """Compute the Evidence Lower Bound (ELBO) summed over all documents."""
    elbo = 0.0
    log_norm_alpha = loggamma(np.sum(self.alpha)) - np.sum(loggamma(self.alpha))

    for d, words in enumerate(self._word_ids):
      gamma_d = self.gamma[d]
      phi_d = self.phi[d, :len(words)] # (N_d, K)
      expected_log_theta = digamma(gamma_d) - digamma(np.sum(gamma_d))

      # (1) E_q[log p(w | z, beta)]; xlogy returns 0 for 0 * log(0)
      elbo += np.sum(xlogy(phi_d, self.beta[:, words].T))
      # (2) E_q[log p(z | theta)]
      elbo += np.sum(phi_d @ expected_log_theta)
      # (3) E_q[log p(theta | alpha)]
      elbo += log_norm_alpha + np.dot(self.alpha - 1, expected_log_theta)
      # (4) - E_q[log q(theta | gamma)]
      elbo -= loggamma(np.sum(gamma_d)) - np.sum(loggamma(gamma_d))
      elbo -= np.dot(gamma_d - 1, expected_log_theta)
      # (5) - E_q[log q(z | phi)]
      elbo -= np.sum(xlogy(phi_d, phi_d))

    return elbo

  def compute_perplexity(self, elbo):
    """Return exp(-elbo / N) where N is the total token count (NaN if N == 0)."""
    # denominator
    N = sum(len(doc) for doc in self.docs)
    if N == 0:
      return float("nan")

    perplexity = np.exp(-elbo / N)
    return perplexity

  def run(self):
    """Run `num_iter` EM iterations, printing and recording ELBO and perplexity."""
    start_time = time.time()
    print("EM Algorithm 시작!")

    self.elbo_history = []
    self.perplexity_history = []

    ## Run the variational inference algorithm
    for iteration in range(self.num_iter):
      print(f"=== Iteration: {iteration + 1} ===")
      print(f"start time: {round(time.time() - start_time, 2)}")
      self.e_step()
      self.m_step()

      elbo = self.compute_elbo()
      print(f"ELBO: {elbo}")
      perplexity = self.compute_perplexity(elbo)
      print(f"Perplexity: {perplexity}")

      self.elbo_history.append(elbo)
      self.perplexity_history.append(perplexity)

      print()

    print("EM Algorithm 끝!")

# **3. Run!!**

In [None]:
# Fit a 10-topic model on the index-encoded training split for 100 EM iterations.
model = LDA_VI(train_data_idx,num_topic = 10, vocab = v_t_i, alpha = 1.,num_iter = 100,lr = 1e-3)
model.run()

  self.beta[k][v] += np.random.rand(1) * 0.01


EM Algorithm 시작!
=== Iteration: 1 ===
start time: 0.0
ELBO: -13567.009199459579
Perplexity: 1.4980510371416667

=== Iteration: 2 ===
start time: 9.18
ELBO: -180201.51893353215
Perplexity: 214.4878442112404

=== Iteration: 3 ===
start time: 16.48
ELBO: -178161.7247086239
Perplexity: 201.8423672371536

=== Iteration: 4 ===
start time: 25.66
ELBO: -177031.22805220686
Perplexity: 195.15794881618368

=== Iteration: 5 ===
start time: 34.18
ELBO: -176321.32710088676
Perplexity: 191.07405569082096

=== Iteration: 6 ===
start time: 41.99
ELBO: -175831.57419488177
Perplexity: 188.3065470982896

=== Iteration: 7 ===
start time: 51.2
ELBO: -175480.20275508176
Perplexity: 186.3457375368859

=== Iteration: 8 ===
start time: 58.51
ELBO: -175229.31011513976
Perplexity: 184.95815145400383

=== Iteration: 9 ===
start time: 67.72
ELBO: -175054.00923558066
Perplexity: 183.99476929640215

=== Iteration: 10 ===
start time: 75.78
ELBO: -174932.9992215256
Perplexity: 183.3326766596598

=== Iteration: 11 ===
s

In [None]:
## Visualize the ELBO (TODO: this cell is still empty)



# **4. Inference**

In [None]:
## analyze each doc's topic proportion

# Normalise each document's gamma so it sums to 1 and print it as the topic proportions.
for d, gamma in enumerate(model.gamma):
  print(f"Document {d} topic distribution: {gamma / np.sum(gamma)}")

In [None]:
## analyze topic-word proportion

# For every topic, print the 10 words with the largest beta values.
for k, beta in enumerate(model.beta):
  # argsort is ascending, so the last 10 indices are the top words, listed
  # from least to most probable.
  top_words = np.argsort(beta)[-10:]
  print(f"Topic {k} top words: {[i_t_v[i] for i in top_words]}")