In [1]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install gensim nltk

import gensim
from gensim import corpora
from gensim.models import LdaModel
import nltk
import string

# Fetch the NLTK data packages this notebook uses: the stop-word lists and
# the WordNet database that backs the lemmatizer.
print("Downloading NLTK resources...")
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
print("Downloads complete.")

Downloading NLTK resources...
Downloads complete.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Shared text-cleaning resources consumed by the preprocessing step.
lemmatizer = WordNetLemmatizer()             # WordNet-based lemmatizer instance
punctuation = {*string.punctuation}          # set of punctuation characters to delete
stop_words = {*stopwords.words('english')}   # set of NLTK English stop words

In [3]:
# Toy corpus: five short documents used throughout the notebook.
doc_complete = [
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
]

# Keep the individual document names available as well.
doc1, doc2, doc3, doc4, doc5 = doc_complete

# Echo the raw corpus, one document per line.
print("--- Original Corpus ---")
print(*doc_complete, sep="\n")

--- Original Corpus ---
Sugar is bad to consume. My sister likes to have sugar, but not my father.
My father spends a lot of time driving my sister around to dance practice.
Doctors suggest that driving may cause increased stress and blood pressure.
Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.
Health experts say that Sugar is not good for your lifestyle.


In [4]:
def preprocess_doc(doc):
    """Convert one raw document into a list of cleaned, lemmatized tokens.

    Each lower-cased, whitespace-separated token is processed as follows:

    1. It is dropped if it is a stop word, either as written (which catches
       contractions such as "don't" while the apostrophe is still present)
       or after trimming punctuation from its edges.
    2. Every remaining punctuation character is deleted from it.
    3. It is dropped if nothing is left, or if the punctuation-free form is
       a stop word. Without this second check a stop word that happens to be
       followed by punctuation ("it.", "to,") would leak into the output.
    4. The surviving word is lemmatized.

    Uses the notebook-level ``stop_words``, ``punctuation`` and
    ``lemmatizer`` objects.

    Args:
        doc: Raw document text.

    Returns:
        List of cleaned tokens, in their original order.
    """
    edge_chars = "".join(punctuation)
    tokens = []
    for raw in doc.lower().split():
        trimmed = raw.strip(edge_chars)
        if raw in stop_words or trimmed in stop_words:
            continue
        # Remove punctuation embedded inside the word too ("well-being" -> "wellbeing").
        word = "".join(ch for ch in trimmed if ch not in punctuation)
        if not word or word in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(word))
    return tokens

# Run the cleaning pipeline over every document in the corpus.
doc_clean = list(map(preprocess_doc, doc_complete))

# Show the resulting token list for each document.
print("\n--- Processed Corpus (List of Tokens) ---")
for tokens in doc_clean:
    print(tokens)


--- Processed Corpus (List of Tokens) ---
['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father']
['father', 'spends', 'lot', 'time', 'driving', 'sister', 'around', 'dance', 'practice']
['doctor', 'suggest', 'driving', 'may', 'cause', 'increased', 'stress', 'blood', 'pressure']
['sometimes', 'feel', 'pressure', 'perform', 'well', 'school', 'father', 'never', 'seems', 'drive', 'sister', 'better']
['health', 'expert', 'say', 'sugar', 'good', 'lifestyle']


In [5]:
# Build the token <-> integer-id mapping from the cleaned corpus.
dictionary = corpora.Dictionary(doc_clean)

# Convert each document to bag-of-words form: a list of (token_id, count) pairs.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

print("--- Gensim Dictionary (Sample) ---")
# Show at most five entries; capping at the vocabulary size avoids a KeyError
# when the corpus yields fewer than five distinct tokens.
sample_size = min(5, len(dictionary))
print({k: dictionary[k] for k in range(sample_size)})

print("\n--- Bag-of-Words Corpus (Document 1) ---")
print(doc_term_matrix[0])

--- Gensim Dictionary (Sample) ---
{0: 'bad', 1: 'consume', 2: 'father', 3: 'like', 4: 'sister'}

--- Bag-of-Words Corpus (Document 1) ---
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)]


In [6]:
# Fit an LDA topic model on the bag-of-words corpus.
# A fixed random_state is passed so repeated runs use the same seed.
ldamodel = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=3,
    passes=50,
    random_state=42,
)

print("LDA Model trained to find 3 topics.")

LDA Model trained to find 3 topics.


In [7]:
# Plain string literal: the header has no placeholders, so no f-prefix is needed.
print("--- Top 3 Topics Identified by LDA ---")

# Each entry is a (topic_id, formatted_terms) pair; only the formatted
# term string is displayed.
topics = ldamodel.print_topics(num_topics=3, num_words=3)

# Number the topics from 1 for display.
for position, (_topic_id, description) in enumerate(topics, start=1):
    print(f"Topic {position}: {description}")

--- Top 3 Topics Identified by LDA ---
Topic 1: 0.135*"sugar" + 0.054*"like" + 0.054*"consume"
Topic 2: 0.079*"driving" + 0.045*"cause" + 0.045*"increased"
Topic 3: 0.057*"sister" + 0.057*"father" + 0.056*"pressure"
