In [1]:
# !pip install scikit-learn nltk

import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Download required NLTK data.
# - 'stopwords' and 'wordnet' back the stop-word list and the WordNet
#   lemmatizer used during preprocessing.
# - 'punkt' / 'punkt_tab' hold the Punkt tokenizer models that
#   word_tokenize() loads; newer NLTK releases look for 'punkt_tab', older
#   ones for 'punkt'. Without them a fresh environment raises LookupError
#   at the first word_tokenize() call.
print("Downloading NLTK resources...")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
print("Downloads complete.")

Downloading NLTK resources...
Downloads complete.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared preprocessing resources, created once and reused for every document.
# The stop-word list is wrapped in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [3]:
# Toy corpus: five short documents used as input for topic modelling.
doc_complete = [
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
]

# Keep the individual documents addressable by name as well.
doc1, doc2, doc3, doc4, doc5 = doc_complete

# Echo the raw corpus, one document per line.
print("--- Original Corpus ---")
print(*doc_complete, sep="\n")

--- Original Corpus ---
Sugar is bad to consume. My sister likes to have sugar, but not my father.
My father spends a lot of time driving my sister around to dance practice.
Doctors suggest that driving may cause increased stress and blood pressure.
Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.
Health experts say that Sugar is not good for your lifestyle.


In [4]:
def preprocess_doc(text):
    """Turn one raw document into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped. Each surviving token is passed through
    the module-level ``lemmatizer`` and the results are joined with single
    spaces.

    Args:
        text: The raw document string.

    Returns:
        A single string containing the cleaned, lemmatized tokens.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation/numbers and stop words before lemmatizing.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Run every document in the corpus through the preprocessing function.
doc_clean = list(map(preprocess_doc, doc_complete))

# Show the cleaned strings, one document per line.
print("\n--- Processed Corpus (Cleaned Strings) ---")
for cleaned_doc in doc_clean:
    print(cleaned_doc)


--- Processed Corpus (Cleaned Strings) ---
sugar bad consume sister like sugar father
father spends lot time driving sister around dance practice
doctor suggest driving may cause increased stress blood pressure
sometimes feel pressure perform well school father never seems drive sister better
health expert say sugar good lifestyle


In [5]:
# Bag-of-words vectorizer:
#   min_df=2     -> drop terms that occur in fewer than two documents
#   max_df=0.95  -> drop terms that occur in more than 95% of documents
#   stop_words   -> scikit-learn's built-in English list, applied on top of
#                   the NLTK stop-word filtering already done in preprocessing
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the document-term count matrix in one step.
doc_term_matrix = vectorizer.fit_transform(doc_clean)

print("--- Document-Term Matrix Shape ---", doc_term_matrix.shape, sep="\n")

--- Document-Term Matrix Shape ---
(5, 5)


In [6]:
# Number of latent topics to extract from the corpus.
num_topics = 2

# Build and fit the LDA model; the fixed random_state keeps runs repeatable.
# fit() returns the estimator itself, so the chained call binds the fitted model.
lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=num_topics,
).fit(doc_term_matrix)

print(f"LDA Model trained to find {num_topics} topics.")

LDA Model trained to find 2 topics.


In [7]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words for each topic of a fitted model.

    For every row of ``model.components_`` a ``Topic N:`` header (numbered
    from 1) is printed, followed by one line with the ``num_top_words``
    entries of ``feature_names`` whose weights in that row are largest,
    ordered from highest to lowest weight and separated by spaces.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        num_top_words: How many terms to print per topic.

    Returns:
        None. The topics are written to standard output.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it to rank heaviest terms first.
        ranked_indices = weights.argsort()[::-1]
        best_indices = ranked_indices[:num_top_words]
        print(f"Topic {topic_number}:")
        print(" ".join(feature_names[idx] for idx in best_indices))

# Vocabulary learned by the vectorizer, indexed like the matrix columns.
feature_names = vectorizer.get_feature_names_out()

# Report the four strongest terms for each topic found by the model.
print(f"\n--- Top {num_topics} Topics Identified by LDA ---")
display_topics(
    model=lda_model,
    feature_names=feature_names,
    num_top_words=4,
)


--- Top 2 Topics Identified by LDA ---
Topic 1:
sugar father sister pressure
Topic 2:
driving pressure sister father
