# LAB ASSIGNMENT 3 - Sentiment Analysis of Amazon Review

Group Members:-

Muhammad Fahmi Bin Misri (SW01081019)
Nik Muhammad Nafis Bin Nik Azlan (SW01081028)

In [1]:
# Step 1: Import the necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
import re

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Fahmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv("news_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2022-08-02 13:48:37.251043
1,17,I recently posted an article asking what kind ...,7,rec.autos,2022-08-02 13:48:37.251043
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2022-08-02 13:48:37.251043
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2022-08-02 13:48:37.251043
4,64,: Ford and his automobile. I need information...,7,rec.autos,2022-08-02 13:48:37.251043


In [3]:
df.shape

(11314, 5)

In [4]:
# select Text column
df_t = df['text']
df_t.head()

0    I was wondering if anyone out there could enli...
1    I recently posted an article asking what kind ...
2    \nIt depends on your priorities.  A lot of peo...
3    an excellent automatic can be found in the sub...
4    : Ford and his automobile.  I need information...
Name: text, dtype: object

In [5]:
# check no. of duplicates in Text column
df_t_duplicates = df_t.duplicated()
print(df_t_duplicates.sum())

320


In [6]:
# remove duplicates in Text column
df_rdup = df_t.drop_duplicates(keep='first')
df_rdup.shape

(10994,)

In [7]:
# Step 2: Read the data (use only the ‘text’ column)
df = pd.read_csv('news_dataset.csv')
df_text = df[['text']].dropna()

In [8]:
# Step 3: Perform text pre-processing
# Define the stop words
stop_words = set(stopwords.words('english'))

Removing HTML tags and unwanted characters and Tokenizing Text

In [9]:
# Initialize stopwords, lemmatizer, and stemmer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text):
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Stemming and Lemmatization
    words = [stemmer.stem(word) for word in words]
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

# Apply preprocessing and create new column
df_text['processed'] = df_text['text'].apply(preprocess)

In [10]:
# Show the dataframe with the new 'processed' column
df_text[['text', 'processed']].head()

Unnamed: 0,text,processed
0,I was wondering if anyone out there could enli...,"[wonder, anyon, could, enlighten, car, saw, da..."
1,I recently posted an article asking what kind ...,"[recent, post, articl, ask, kind, rate, singl,..."
2,\nIt depends on your priorities. A lot of peo...,"[depend, prioriti, lot, peopl, put, higher, pr..."
3,an excellent automatic can be found in the sub...,"[excel, automat, found, subaru, legaci, switch..."
4,: Ford and his automobile. I need information...,"[ford, automobil, need, inform, whether, ford,..."


In [11]:
# Step 4: Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(df_text['processed'])
corpus = [dictionary.doc2bow(text) for text in df_text['processed']]

# Step 5: Perform LDA using Gensim
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

In [12]:
# Step 6: Evaluate the LDA model using Coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df_text['processed'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [19]:
# Step 7: Interpret the result
print(f'Coherence Score (C_V): {coherence_lda:.4f}')

Coherence Score (C_V): 0.6951


In [18]:
# Print the topics
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.041*"space" + 0.016*"nasa" + 0.015*"launch" + 0.013*"orbit" + 0.010*"satellit" + 0.009*"leaf" + 0.009*"earth" + 0.008*"mission" + 0.007*"moon" + 0.007*"rocket"')
(1, '0.012*"would" + 0.011*"one" + 0.010*"peopl" + 0.008*"think" + 0.008*"like" + 0.007*"say" + 0.007*"know" + 0.007*"go" + 0.007*"get" + 0.006*"time"')
(2, '0.032*"game" + 0.025*"team" + 0.019*"play" + 0.016*"player" + 0.013*"win" + 0.012*"season" + 0.011*"leagu" + 0.009*"hockey" + 0.009*"score" + 0.008*"turkey"')
(3, '0.010*"govern" + 0.007*"state" + 0.006*"presid" + 0.006*"secur" + 0.005*"new" + 0.005*"mr" + 0.005*"law" + 0.005*"u" + 0.005*"public" + 0.005*"nation"')
(4, '0.022*"use" + 0.020*"key" + 0.012*"encrypt" + 0.010*"chip" + 0.009*"system" + 0.009*"one" + 0.008*"bit" + 0.007*"would" + 0.007*"get" + 0.006*"window"')
(5, '0.049*"god" + 0.027*"christian" + 0.020*"jesu" + 0.017*"jew" + 0.014*"church" + 0.012*"bibl" + 0.012*"believ" + 0.011*"faith" + 0.011*"religion" + 0.010*"christ"')
(6, '0.049*"x" + 0.016*"edu" 

In [16]:
# Step 8: Interpret the result
# Print the topics with the top 10 words for each topic along with their weights
topics = lda_model.show_topics(num_topics=10, num_words=10, formatted=False)
for topic_num, terms in topics:
    print(f'Topic #{topic_num + 1}:')
    for term, weight in terms:
        print(f'{term} ({weight:.4f})')
    print()

Topic #1:
space (0.0409)
nasa (0.0155)
launch (0.0146)
orbit (0.0128)
satellit (0.0098)
leaf (0.0094)
earth (0.0094)
mission (0.0085)
moon (0.0071)
rocket (0.0067)

Topic #2:
would (0.0120)
one (0.0114)
peopl (0.0102)
think (0.0080)
like (0.0076)
say (0.0074)
know (0.0072)
go (0.0070)
get (0.0067)
time (0.0061)

Topic #3:
game (0.0320)
team (0.0251)
play (0.0193)
player (0.0156)
win (0.0134)
season (0.0119)
leagu (0.0106)
hockey (0.0095)
score (0.0091)
turkey (0.0085)

Topic #4:
govern (0.0099)
state (0.0073)
presid (0.0063)
secur (0.0056)
new (0.0055)
mr (0.0053)
law (0.0051)
u (0.0050)
public (0.0050)
nation (0.0047)

Topic #5:
use (0.0218)
key (0.0196)
encrypt (0.0117)
chip (0.0098)
system (0.0091)
one (0.0085)
bit (0.0082)
would (0.0066)
get (0.0065)
window (0.0063)

Topic #6:
god (0.0488)
christian (0.0274)
jesu (0.0201)
jew (0.0174)
church (0.0137)
bibl (0.0123)
believ (0.0117)
faith (0.0113)
religion (0.0106)
christ (0.0098)

Topic #7:
x (0.0486)
edu (0.0160)
file (0.0159)
anony