In [22]:
# Lab Assignment 3
# MUHAMMAD FARIHEEN BIN ABD RAHIM (SW01082818)
# MUHAMMAD ADEEB BIN ABDULLAH (SW01082814)

In [7]:
# Import required libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK resources.
# 'punkt_tab' is fetched in addition to 'punkt': newer NLTK releases make
# word_tokenize load its sentence-tokenizer data from 'punkt_tab', and a
# fresh environment that only has 'punkt' fails with a LookupError.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Load dataset
df = pd.read_csv('news_dataset.csv')

# Remove rows whose 'text' value is missing so preprocessing only sees real documents
df = df.dropna(subset=['text'])

# Initialize stopwords and lemmatizer once; both are reused by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, keep only purely
    alphabetic tokens that are not in ``stop_words``, then lemmatize each
    remaining token with the module-level ``lemmatizer``.

    Args:
        text: The document to process. ``None`` and NaN are treated as an
            empty document; any other non-string value is converted with
            ``str()`` before processing.

    Returns:
        list[str]: The processed tokens, in their original order.
    """
    # Missing values (None, or a float NaN as pandas uses for empty cells)
    # have no content; NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # pandas may parse a cell as a number; str.lower() would not exist on it.
    if not isinstance(text, str):
        text = str(text)

    # Lowercase, then tokenize
    tokens = word_tokenize(text.lower())

    # Drop punctuation / non-alphabetic tokens and stopwords in one pass,
    # then lemmatize what is left.
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]

# Apply preprocessing to every document
df['processed_text'] = df['text'].apply(preprocess_text)

# Save in a new file. The token lists are written to the CSV as their string
# representation, so they must be parsed back into lists when the file is
# reloaded.
df.to_csv("news_dataset_processed.csv", index=False)

# Show sample. This has to be the last expression in the cell: a bare
# expression in the middle of a cell is evaluated but never displayed.
df[['text', 'processed_text']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
# Preview the first rows: raw text next to its processed token list
df.loc[:, ['text', 'processed_text']].head()

Unnamed: 0,text,processed_text
0,I was wondering if anyone out there could enli...,"[wondering, anyone, could, enlighten, car, saw..."
1,I recently posted an article asking what kind ...,"[recently, posted, article, asking, kind, rate..."
2,\nIt depends on your priorities. A lot of peo...,"[depends, priority, lot, people, put, higher, ..."
3,an excellent automatic can be found in the sub...,"[excellent, automatic, found, subaru, legacy, ..."
4,: Ford and his automobile. I need information...,"[ford, automobile, need, information, whether,..."


In [10]:
import pandas as pd
import ast
from gensim import corpora
from gensim.models import LdaModel

def _restore_tokens(cell):
    """Parse one stored 'processed_text' cell back into a token list.

    The cell holds the string form of a Python list; after parsing, tokens
    that are not purely alphabetic or are two characters or shorter are
    dropped.
    """
    return [tok for tok in ast.literal_eval(cell) if tok.isalpha() and len(tok) > 2]


# Reload the preprocessed CSV and rebuild the token lists
df = pd.read_csv("news_dataset_processed.csv")
df['processed_text'] = df['processed_text'].apply(_restore_tokens)

# One token list per document
preprocessed_documents = df['processed_text'].tolist()

# Build the vocabulary, prune it with filter_extremes, then convert every
# document to its bag-of-words representation
dictionary = corpora.Dictionary(preprocessed_documents)
dictionary.filter_extremes(no_below=15, no_above=0.5)
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

# Fit the LDA model; random_state is fixed so repeated runs are comparable
lda_settings = dict(num_topics=4, passes=15, random_state=42)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

# Report the ten highest-weighted terms of each topic
print("Top Terms for Each Topic:\n")
for idx in range(lda_model.num_topics):
    report = [f"Topic #{idx}:"]
    report.extend(
        f"- {word} ({prob:.4f})"
        for word, prob in lda_model.show_topic(idx, topn=10)
    )
    print("\n".join(report))
    print()

Top Terms for Each Topic:

Topic #0:
- people (0.0118)
- would (0.0112)
- one (0.0098)
- think (0.0065)
- say (0.0061)
- know (0.0058)
- right (0.0051)
- god (0.0051)
- time (0.0048)
- government (0.0047)

Topic #1:
- max (0.0270)
- president (0.0118)
- new (0.0087)
- american (0.0065)
- year (0.0060)
- national (0.0058)
- program (0.0057)
- university (0.0053)
- administration (0.0052)
- state (0.0051)

Topic #2:
- key (0.0143)
- use (0.0104)
- file (0.0098)
- system (0.0095)
- chip (0.0077)
- encryption (0.0067)
- one (0.0061)
- window (0.0060)
- program (0.0057)
- information (0.0055)

Topic #3:
- would (0.0102)
- one (0.0101)
- get (0.0091)
- like (0.0084)
- game (0.0079)
- year (0.0070)
- good (0.0067)
- time (0.0061)
- know (0.0061)
- think (0.0057)



In [12]:
from gensim.models import CoherenceModel

# Measure topic coherence ('c_v') of the trained model against the
# tokenized documents and the dictionary it was trained with
coherence_args = {
    'model': lda_model,
    'texts': preprocessed_documents,
    'dictionary': dictionary,
    'coherence': 'c_v',
}
coherence_model = CoherenceModel(**coherence_args)
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score:.4f}")

Coherence Score: 0.5078


In [18]:
# The LDA model achieved a coherence score of approximately 0.5078, indicating a moderate level of interpretability.
# This means the topics generated are generally meaningful and understandable, though some overlap remains:
# for example, Topics #0 and #3 share the generic terms "would", "one", "think", "know" and "time".
# It's a decent result for exploratory analysis and shows the model has captured useful themes in the data.
# There is still room for improvement: coherence could be increased by tuning the number of topics or by refining
# the preprocessing, e.g. extending the stop-word list with generic terms such as "would", "one" and "get".