In [None]:
import numpy as np
import pandas as pd

In [None]:
import os
# List every file under the local data directory.
# os.walk only traverses filesystem paths; given the dataset's web URL it
# silently yields nothing, so walk the directory the CSV is read from instead.
# Dataset source: https://www.kaggle.com/datasets/niraliivaghani/flipkart-product-customer-reviews-dataset
for dirname, _, filenames in os.walk('/content'):
  for filename in filenames:
    print(os.path.join(dirname, filename))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib import style
# Apply matplotlib's 'ggplot' style sheet to figures created after this point.
style.use('ggplot')
import csv

In [None]:
# Read the review dataset from its CSV file and preview the first rows.
dataset_path = "/content/Dataset-SA.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape


(205052, 6)

In [None]:
# Per-column dtype and non-null counts.
# NOTE(review): the recorded output lists product_price and Rate as object
# dtype, i.e. they were not parsed as numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205052 entries, 0 to 205051
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_name   205052 non-null  object
 1   product_price  205052 non-null  object
 2   Rate           205052 non-null  object
 3   Review         180388 non-null  object
 4   Summary        205041 non-null  object
 5   Sentiment      205052 non-null  object
dtypes: object(6)
memory usage: 9.4+ MB


In [None]:
# Summary statistics per column (count / unique / top / freq in the recorded output).
df.describe()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
count,205052,205052,205052,180388,205041,205052
unique,958,525,8,1324,92923,3
top,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,wonderful,good,positive
freq,6005,9150,118765,9016,17430,166581


In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
product_name,0
product_price,0
Rate,0
Review,24664
Summary,11
Sentiment,0


In [None]:
# Column labels of the DataFrame.
df.columns

Index(['product_name', 'product_price', 'Rate', 'Review', 'Summary',
       'Sentiment'],
      dtype='object')

In [None]:
# Count of distinct product names in the dataset.
df['product_name'].nunique()

958

In [None]:
# Drop every row that has a missing value in any column.
# Reassign rather than mutate with inplace=True: inplace gives no performance
# benefit and hides state changes when cells are re-run.
df = df.dropna(axis=0)
# Confirm that no missing values remain.
df.isnull().sum()

Unnamed: 0,0
product_name,0
product_price,0
Rate,0
Review,0
Summary,0
Sentiment,0


In [None]:
# Pre-processing reviews: show the first five raw reviews, each followed by a blank line.
first_reviews = df['Review'][:5]
for raw_review in first_reviews:
    print(raw_review, '\n')

super! 

awesome 

fair 

useless product 

fair 



In [None]:
#Remove HTML tags: use BeautifulSoup from the bs4 module. We have already removed the HTML tags with pattern "64..."; we will also call get_text() to strip any HTML tags that remain.

#Remove stopwords such as "a", "the", "I", etc. Remove symbols and special characters such as '#', '&', '@' from the reviews.

#Tokenize: split each sentence into words on whitespace, e.g. "I might come" --> "I", "might", "come".

#Stemming: remove the suffixes from words to get their root form, e.g. "Wording" --> "Word".

In [None]:
#import the libraries for pre-processing
from bs4 import BeautifulSoup
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Fetch the NLTK stop-word corpus so stopwords.words() can be used below.
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests.
stops = {word for word in stopwords.words('english')}

# Snowball stemmer configured for English.
stemmer = SnowballStemmer(language='english')

def review_to_words(raw_review):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    The text is stripped of HTML markup with BeautifulSoup, every character
    that is not an ASCII letter is replaced by a space, the result is
    lower-cased and split on whitespace, tokens present in the module-level
    ``stops`` set are dropped, and the remaining tokens are stemmed with the
    module-level ``stemmer``.

    Parameters
    ----------
    raw_review : str
        The review text to clean.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    # Strip any HTML markup and keep only the text content.
    plain_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # Replace every non-letter (digits, punctuation, symbols) with a space.
    alpha_only = re.sub('[^a-zA-Z]', ' ', plain_text)

    stemmed_tokens = []
    for token in alpha_only.lower().split():
        # Skip stop words; stem everything else.
        if token in stops:
            continue
        stemmed_tokens.append(stemmer.stem(token))

    return ' '.join(stemmed_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Clean every review with review_to_words and store the result back in the 'Review' column.
cleaned_reviews = df['Review'].apply(review_to_words)
df['Review'] = cleaned_reviews

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels in 'Sentiment' as integers in a new 'sentiment_encoded' column.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df['Sentiment'])
df['sentiment_encoded'] = encoded_labels

*Model Building*

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier

Raw text features cannot be passed directly to a model; they must first be converted into numeric values. We will use TfidfVectorizer to convert our reviews into numeric features.

In [None]:
# Build a TF-IDF vectorizer, learn its vocabulary and IDF weights from every
# review in df, and transform those reviews into a sparse feature matrix.
vectorizer = TfidfVectorizer()
reviews_corpus = vectorizer.fit_transform(df['Review'])
# (number of reviews, vocabulary size)
reviews_corpus.shape

(180379, 1018)

In [None]:
# Dependent feature (model target): the integer-encoded sentiment label column.
sentiment = df['sentiment_encoded']
# One label per row of df.
sentiment.shape

(180379,)

In [None]:
#split the data in train and test
from sklearn.model_selection import train_test_split

# Split the cleaned review text rather than an already-vectorised matrix, so
# the TF-IDF vocabulary and IDF weights are learned from training rows only.
# Fitting the vectorizer on all rows before splitting leaks test-set
# statistics into the features and inflates the evaluation.
reviews_train, reviews_test, Y_train, Y_test = train_test_split(
    df['Review'], sentiment, test_size=0.33, random_state=42)

# Fit on the training text, then reuse that fitted vocabulary for the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

print('Train data shape ',X_train.shape,Y_train.shape)
print('Test data shape ',X_test.shape,Y_test.shape)

Train data shape  (120853, 1018) (120853,)
Test data shape  (59526, 1018) (59526,)


Multinomial Naive Bayes is useful for determining the emotional tone of a piece of text, such as a tweet, product review, or customer feedback. The model works by computing the probability of each possible sentiment label (positive, negative, neutral) given a piece of text and predicting the most probable label.

In [None]:
# Fit a Multinomial Naive Bayes classifier on the training features.
clf = MultinomialNB().fit(X_train, Y_train)

# Predict the sentiment label for every test row.
pred = clf.predict(X_test)

print("Accuracy: %s" % str(clf.score(X_test, Y_test))) #check accuracy
print("Confusion Matrix")
# confusion_matrix takes (y_true, y_pred): rows are the true labels and
# columns the predicted labels. Passing the predictions first transposes it.
print(confusion_matrix(Y_test, pred))

Accuracy: 0.9029163726774855
Confusion Matrix
[[ 5804   511   493]
 [   18     7    93]
 [ 2273  2391 47936]]


In [None]:
#fit the model and predict the output

# Fix the seed so the forest, and therefore the reported metrics, are
# reproducible across runs (same seed as the train/test split).
clf = RandomForestClassifier(random_state=42).fit(X_train, Y_train)

# Predict the sentiment label for every test row.
pred = clf.predict(X_test)

print("Accuracy: %s" % str(clf.score(X_test, Y_test)))
print("Confusion Matrix")
# confusion_matrix takes (y_true, y_pred): rows are the true labels and
# columns the predicted labels. Passing the predictions first transposes it.
print(confusion_matrix(Y_test, pred))

Accuracy: 0.9096865235359339
Confusion Matrix
[[ 6121   524   492]
 [    0     0     1]
 [ 1974  2385 48029]]
