In [86]:
# Task-2  Sentiment Analysis with Natural Language Processing 
# The Objective of this task is to Analyze customer reviews to classify sentiments as positive or negative.
#Importing required libraries
import nltk   # Installed nltk library 
nltk.download('stopwords')

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

# This section is to suppresses warnings generated by the code:
def warn(*args,**kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ganga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
# Data selection 
# We will use Customer dataset containing customer reviews to  classify sentiments. The dataset is taken from (https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset/data) from a provided source file and is available on kaggle. the dataset contains columns like 
Data = pd.read_csv("C:/Users/ganga/Downloads/test.csv", encoding='latin1')

In [88]:
# Display the first few five of the dataset
Data.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [89]:
# Display basic Information of the dataset
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            3534 non-null   object 
 1   text              3534 non-null   object 
 2   sentiment         3534 non-null   object 
 3   Time of Tweet     3534 non-null   object 
 4   Age of User       3534 non-null   object 
 5   Country           3534 non-null   object 
 6   Population -2020  3534 non-null   float64
 7   Land Area (Km²)   3534 non-null   float64
 8   Density (P/Km²)   3534 non-null   float64
dtypes: float64(3), object(6)
memory usage: 338.7+ KB


In [90]:
# Checking for null values
print(Data.isnull().sum())

textID              1281
text                1281
sentiment           1281
Time of Tweet       1281
Age of User         1281
Country             1281
Population -2020    1281
Land Area (Km²)     1281
Density (P/Km²)     1281
dtype: int64


In [91]:
# Drop the null values
Data.dropna()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
...,...,...,...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,noon,21-30,Nicaragua,6624554.0,120340.0,55.0
3530,416863ce47,All alone in this old house again. Thanks for...,positive,night,31-45,Niger,24206644.0,1266700.0,19.0
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,morning,46-60,Nigeria,206139589.0,910770.0,226.0
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,noon,60-70,North Korea,25778816.0,120410.0,214.0


In [92]:
# Function to clean and process text
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)    # Remove special characters
    text = text.lower()   # Convert to lower case
    text = text.split()   # Tokenization
    text = [word for word in text if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(text)

# Ensure all entries in 'text' column are strings and replace NaNs with an empty string
Data['text'] = Data['text'].astype(str).fillna('')

# Applying preprocessing
Data['cleaned_review'] = Data['text'].apply(preprocess_text)

print("\nPreprocessed Data")
print(Data[['text', 'cleaned_review']].head())



Preprocessed Data
                                                text  \
0  Last session of the day  http://twitpic.com/67ezh   
1   Shanghai is also really exciting (precisely -...   
2  Recession hit Veronique Branquinho, she has to...   
3                                        happy bday!   
4             http://twitpic.com/4w75p - I like it!!   

                                      cleaned_review  
0            last session day http twitpic com 67ezh  
1  shanghai also really exciting precisely skyscr...  
2  recession hit veronique branquinho quit compan...  
3                                         happy bday  
4                        http twitpic com 4w75p like  


In [93]:
columns = ['cleaned_review','sentiment']
Data[columns].drop_duplicates()


Unnamed: 0,cleaned_review,sentiment
0,last session day http twitpic com 67ezh,neutral
1,shanghai also really exciting precisely skyscr...,positive
2,recession hit veronique branquinho quit compan...,negative
3,happy bday,positive
4,http twitpic com 4w75p like,positive
...,...,...
3530,alone old house thanks net keeps alive kicking...,positive
3531,know mean little dog sinking depression wants ...,negative
3532,_sutra next youtube video gonna love videos,positive
3533,http twitpic com 4woj2 omgssh ang cute ng bby,positive


In [94]:
Data[columns].dropna()

Unnamed: 0,cleaned_review,sentiment
0,last session day http twitpic com 67ezh,neutral
1,shanghai also really exciting precisely skyscr...,positive
2,recession hit veronique branquinho quit compan...,negative
3,happy bday,positive
4,http twitpic com 4w75p like,positive
...,...,...
3529,3 im tired sleep try,negative
3530,alone old house thanks net keeps alive kicking...,positive
3531,know mean little dog sinking depression wants ...,negative
3532,_sutra next youtube video gonna love videos,positive


In [95]:
# Convert text to numerical format using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  #Keep top 1000 words
x = vectorizer.fit_transform(Data['cleaned_review']).toarray()
y = Data['sentiment']  #Target variable (0 =Negative,1= neutral, 2 = positive)

# Spli the data into training and testing dataset
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
print(f"Training samples: {x_train.shape[0]}, Testing samples: {x_test.shape[0]}")



Training samples: 3852, Testing samples: 963


In [96]:

# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(x_train)
X_test = imputer.transform(x_test)
# Train the model 
model = LogisticRegression()
model.fit(x_train, y_train)

ValueError: Input contains NaN

In [79]:
# Maping  categorical sentiment values to numerical ones
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
Data['sentiment'] = Data['sentiment'].map(sentiment_mapping)

print(Data.isnull().sum())  #checking null values after mapping

Data['sentiment'].fillna(-1, inplace=True)  # Filling null values

Data['sentiment'] = Data['sentiment'].astype(int) # Checking sentiment column is integer type

# Re-run the TF-IDF vectorizer and the model training
vectorizer = TfidfVectorizer(max_features=1000)  # Keep top 1000 words
x = vectorizer.fit_transform(Data['cleaned_review']).toarray()
y = Data['sentiment']  # Target variable (0 = Negative, 1 = Neutral, 2 = Positive)

# Split the data into training and testing datasets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(f"Training samples: {x_train.shape[0]}, Testing samples: {x_test.shape[0]}")

textID              1281
text                   0
sentiment           1281
Time of Tweet       1281
Age of User         1281
Country             1281
Population -2020    1281
Land Area (Km²)     1281
Density (P/Km²)     1281
cleaned_review         0
dtype: int64
Training samples: 3852, Testing samples: 963


In [80]:
# Training the model
model = LogisticRegression()
model.fit(x_train, y_train)

In [84]:
# Make predictions
y_predict = model.predict(x_test)

#print accuracy
accuracy = accuracy_score(y_test,y_predict)
print(f"Model Accuracy: {accuracy: .2f}")

Model Accuracy:  0.73


In [85]:
# Classification report
print("\n Classification report:")
print(classification_report(y_test,y_predict))


 Classification report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       258
           0       0.75      0.47      0.58       226
           1       0.54      0.75      0.63       268
           2       0.72      0.65      0.69       211

    accuracy                           0.73       963
   macro avg       0.75      0.72      0.72       963
weighted avg       0.75      0.73      0.73       963

