# LDA - Latent Dirichlet Allocation

<B>Paper</b> : https://coli-saar.github.io/cl19/materials/darling-lda.pdf

In [18]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [42]:
from datasets import load_dataset, Split
import numpy as np
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import random

np.random.seed(1)

In [20]:
data = load_dataset('squad', split=Split.VALIDATION)



In [21]:
print("Total number of documents:",len(data))

Total number of documents: 10570


In [84]:
N_DOC = 1000
ALPHA = 0.2
BETA = 0.2
TOPIC = 40

In [85]:
sdata = data['context'][:N_DOC]

In [86]:
stop_words = list(text.ENGLISH_STOP_WORDS)
cv = CountVectorizer(stop_words=list(text.ENGLISH_STOP_WORDS))
freq = cv.fit_transform(sdata)

In [87]:
freq.toarray().shape

(1000, 2605)

In [88]:
frequency = cv.vocabulary_

In [89]:
k2i = { k:i for i,k in enumerate(frequency.keys())}
i2k = { v:k for k,v in k2i.items()}

In [90]:
VOCAB_SIZE = len(k2i)
print(f"Vocab size |V| = {VOCAB_SIZE}")

Vocab size |V| = 2605


In [91]:
corpus = []
for d in sdata:
  words = []
  for w in d.split():
    if w in k2i:
      words.append(k2i[w])
  corpus.append(np.asarray(words))

In [92]:
corpus[0].shape

(28,)

In [93]:
N_ITER = 1000
### Initialize Z and counts
Z = []
for i, doc in enumerate(corpus):
  zd = np.random.randint(low=0,high=TOPIC,size=(len(doc)))
  Z.append(zd)

ndk = np.zeros((N_DOC,TOPIC))

for i in range(N_DOC):
  for k in range(TOPIC):
    ndk[i,k] = np.sum(Z[i] == k)

nkw = np.zeros((TOPIC, VOCAB_SIZE))

for i,doc in enumerate(corpus):
  for j, word in enumerate(doc):
    topic = Z[i][j]
    nkw[ topic, word] += 1

nk = nkw.sum(axis=1)

In [94]:
### Training part
from tqdm import tqdm

for _ in tqdm(range(N_ITER)):
  for doc_idx,doc in enumerate(corpus):
    for i in range(len(doc)):
      word = doc[i]
      topic = Z[doc_idx][i]

      #remove z_i bcz condition on z_(-i)
      ndk[doc_idx,topic] -= 1
      nkw[topic, word] -= 1
      nk[topic] -= 1
      p_z = (ndk[doc_idx, :] + ALPHA) * (nkw[:, word] + BETA) / (nk + VOCAB_SIZE*BETA)
      topic = random.choices(np.arange(TOPIC,dtype=np.int32),weights=p_z, k=1)[0]
      Z[doc_idx][i] = topic
      ndk[doc_idx][topic] += 1
      nkw[topic, word] += 1
      nk[topic] += 1


100%|██████████| 1000/1000 [17:31<00:00,  1.05s/it]


In [96]:
p = nkw / nk.reshape((TOPIC,1))
top_k = 10
for topic in range(TOPIC):
  print(f"TOPIC: {topic}")
  t = np.argsort(p[topic])[::-1][:top_k]
  s = p[topic]
  s = np.sort(s)[::-1][:top_k]
  for word_idx, score in zip(t,s):
    print(f"({i2k[word_idx]}, {score:.3f})",end=',')
  print("")
  print("_"*20)

TOPIC: 0
(theatre, 0.063),(musical, 0.063),(events, 0.057),(music, 0.044),(best, 0.038),(building, 0.032),(housed, 0.032),(monumental, 0.032),(example, 0.032),(hosts, 0.032),
____________________
TOPIC: 1
(paid, 0.147),(provided, 0.069),(digital, 0.058),(aired, 0.051),(free, 0.049),(broadcast, 0.040),(available, 0.031),(streaming, 0.031),(apps, 0.031),(smartphones, 0.031),
____________________
TOPIC: 2
(carry, 0.064),(game, 0.031),(color, 0.022),(local, 0.022),(station, 0.021),(significant, 0.020),(stations, 0.020),(city, 0.016),(based, 0.015),(population, 0.014),
____________________
TOPIC: 3
(halftime, 0.083),(headlined, 0.079),(broadcast, 0.047),(50, 0.046),(rock, 0.034),(group, 0.033),(possession, 0.024),(commercial, 0.024),(million, 0.024),(ball, 0.023),
____________________
TOPIC: 4
(stayed, 0.057),(divisional, 0.032),(left, 0.030),(attempt, 0.030),(final, 0.029),(minutes, 0.029),(intercepting, 0.029),(conversion, 0.029),(seconds, 0.029),(problems, 0.029),
____________________
TO