In [1]:
import pandas as pd
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

# Tunable settings for this run, gathered in one place so experiments only
# need to touch these values instead of hunting through the training call.
NUM_TOPICS = 5      # starting point; try other values and compare the topics
RANDOM_SEED = 42    # fixed seed so repeated runs give the same topics
CHUNK_SIZE = 100    # passed to LdaModel as `chunksize`
NUM_PASSES = 10     # passed to LdaModel as `passes`

# Load the cleaned data
df = pd.read_csv("../data/cleaned_comments.csv")

# Tokenize the cleaned comments on whitespace.
# Missing comments are dropped; the remaining values are cast to str so that
# a comment pandas parsed as a number (e.g. a digits-only comment) does not
# crash on `.split()`.
tokenized = [
    text.split()
    for text in df["cleaned_comment"].dropna().astype(str)
]

# Create Dictionary (token -> id mapping) and bag-of-words Corpus
dictionary = corpora.Dictionary(tokenized)
corpus = [dictionary.doc2bow(text) for text in tokenized]

# Train LDA Model
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_SEED,
    update_every=1,
    chunksize=CHUNK_SIZE,
    passes=NUM_PASSES,
    alpha='auto',
    per_word_topics=True
)

# Print top words per topic; print_topics() yields (topic id, formatted
# 'weight*"word" + ...' string) pairs.
for i, topic in lda_model.print_topics():
    print(f"\nTopic #{i}:")
    print(topic)

# Visualize Topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
# Last expression of the cell, so the interactive view renders as its output.
pyLDAvis.display(vis)



Topic #0:
0.030*"vikram" + 0.014*"assemble" + 0.013*"vera" + 0.012*"bekku" + 0.010*"baakiyalakshmi" + 0.010*"intha" + 0.010*"level" + 0.009*"character" + 0.009*"mass" + 0.009*"karthik"

Topic #1:
0.025*"hai" + 0.014*"ki" + 0.011*"fans" + 0.010*"ke" + 0.010*"ho" + 0.010*"se" + 0.009*"hi" + 0.009*"swag" + 0.009*"leo" + 0.008*"tha"

Topic #2:
0.014*"story" + 0.012*"chiyaan" + 0.011*"entry" + 0.011*"modi" + 0.010*"ye" + 0.009*"interview" + 0.007*"mr" + 0.006*"thapar" + 0.006*"malik" + 0.006*"want"

Topic #3:
0.029*"movie" + 0.028*"retro" + 0.021*"fan" + 0.019*"anna" + 0.018*"miss" + 0.017*"sir" + 0.014*"video" + 0.013*"nice" + 0.012*"la" + 0.011*"ott"

Topic #4:
0.015*"da" + 0.014*"end" + 0.014*"la" + 0.012*"yo" + 0.012*"not" + 0.011*"ku" + 0.011*"like" + 0.011*"release" + 0.011*"serial" + 0.010*"theatre"


  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


In [3]:
import os
import pickle

# Save model and data
# The output directory is created first so this cell also works on a fresh
# checkout where ../models does not exist yet (open(..., "wb") would otherwise
# raise FileNotFoundError).
MODEL_DIR = "../models"
os.makedirs(MODEL_DIR, exist_ok=True)

# File name -> object to persist. Pickle files can execute arbitrary code when
# loaded, so only load these back from locations you trust.
artifacts = {
    "lda_model.pkl": lda_model,
    "dictionary.pkl": dictionary,
    "corpus.pkl": corpus,
}

for artifact_name, artifact in artifacts.items():
    with open(f"{MODEL_DIR}/{artifact_name}", "wb") as f:
        pickle.dump(artifact, f)