### Import Libraries

In [352]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from wordcloud import WordCloud, STOPWORDS
sns.set_context("talk", font_scale = 0.8, rc={"grid.linewidth": 5})
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold,RandomizedSearchCV,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,recall_score,classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics.pairwise import pairwise_distances
import pickle

In [3]:
df= pd.read_csv('sample30.csv')
print('Dataset Shape:',df.shape)
df.head(3)

Dataset Shape: (30000, 15)


Unnamed: 0,id,brand,categories,manufacturer,name,reviews_date,reviews_didPurchase,reviews_doRecommend,reviews_rating,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,reviews_username,user_sentiment
0,AV13O1A8GV-KLJ3akUyj,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",Universal Music Group / Cash Money,Pink Friday: Roman Reloaded Re-Up (w/dvd),2012-11-30T06:21:45.000Z,,,5,i love this album. it's very good. more to the...,Just Awesome,Los Angeles,,joshua,Positive
1,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor. This review was collected as part...,Good,,,dorothy w,Positive
2,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor.,Good,,,dorothy w,Positive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    30000 non-null  object
 1   brand                 30000 non-null  object
 2   categories            30000 non-null  object
 3   manufacturer          29859 non-null  object
 4   name                  30000 non-null  object
 5   reviews_date          29954 non-null  object
 6   reviews_didPurchase   15932 non-null  object
 7   reviews_doRecommend   27430 non-null  object
 8   reviews_rating        30000 non-null  int64 
 9   reviews_text          30000 non-null  object
 10  reviews_title         29810 non-null  object
 11  reviews_userCity      1929 non-null   object
 12  reviews_userProvince  170 non-null    object
 13  reviews_username      29937 non-null  object
 14  user_sentiment        29999 non-null  object
dtypes: int64(1), object(14)
memory usage

In [5]:
#Function to check percentage of null values present in dataset 
def calnullpercentage(df):
    missing_num= df[df.columns].isna().sum().sort_values(ascending=False)
    missing_perc= (df[df.columns].isna().sum()/len(df)*100).sort_values(ascending=False)
    missing= pd.concat([missing_num,missing_perc],keys=['Total','Percentage'],axis=1)
    missing= missing[missing['Percentage']>0]
    return missing

In [6]:
calnullpercentage(df)

Unnamed: 0,Total,Percentage
reviews_userProvince,29830,99.433333
reviews_userCity,28071,93.57
reviews_didPurchase,14068,46.893333
reviews_doRecommend,2570,8.566667
reviews_title,190,0.633333
manufacturer,141,0.47
reviews_username,63,0.21
reviews_date,46,0.153333
user_sentiment,1,0.003333


## Data Cleaning & Text Preprocessing

In [10]:
# Dropping rows where user sentiment has NaN values (only 1 datapoint is there)
df= df[~(df['user_sentiment'].isna())]

In [12]:
def preprocess(document, stem=True):

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words or punctuation
    words=[word for word in words if word not in stopwords.words('english') or word not in string.punctuation]
    
    # join words to make sentence
    document = " ".join(words)

    return document

In [13]:
reviews_txt=[preprocess(document) for document in df['reviews_text']]
#reviews_txt = [rev for rev in reviews_txt if len(rev)>=3]
print(reviews_txt)



## Feature extraction

In [14]:
# TFIDF vectorizer
vect = TfidfVectorizer()
tfidf_vect=vect.fit_transform(reviews_txt)

In [15]:
train_features = pd.DataFrame(tfidf_vect.toarray(),columns=vect.get_feature_names())
dummy_ratingdf =pd.get_dummies(df['reviews_rating'],drop_first=True) # creating dummy variable for reviews rating(1,2..,5)
train_features.reset_index(drop=True,inplace=True)
dummy_ratingdf.reset_index(drop=True,inplace=True)
train_features= pd.concat([train_features,dummy_ratingdf],axis=1)
X= train_features
y = df['user_sentiment'].map({'Positive':1,'Negative':0})

In [16]:
len(train_features)

29999

In [17]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Training a text classification model

### `Prediction and Evaluation using Multinomial Naive Bayes Model`

In [142]:
params={}
mn_bayes =GridSearchCV(MultinomialNB(),cv=fold,scoring='accuracy',param_grid=params,verbose=1,return_train_score=True)
mn_bayes.fit(X_train,y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=MultinomialNB(), param_grid={}, return_train_score=True,
             scoring='accuracy', verbose=1)

In [145]:
y_train_pred = mn_bayes.predict(X_train) # predicted label for test data
y_test_pred = mn_bayes.predict(X_test) # predicted label for test data
mnbayes_train_accuracy= accuracy_score(y_train,y_train_pred) # train set accuracy
mnbayes_test_accuracy = accuracy_score(y_test,y_test_pred) # model accuracy
print("Multinumaial NB Training set accuracy:",round(mnbayes_train_accuracy*100,2),'%')
print("Multinumaial NB Testing set accuracy:",round(mnbayes_test_accuracy*100,2),'%')

Multinumaial NB Training set accuracy: 88.81 %
Multinumaial NB Testing set accuracy: 89.15 %


### `Prediction and Evaluation using XGBoost Model`

In [159]:
xgbc= XGBClassifier(random_state=42,class_weight='balanced')
xgbc.fit(X_train,y_train)

XGBClassifier(class_weight='balanced', random_state=42)

In [160]:
y_train_pred = xgbc.predict(X_train) # predicted label for test data
y_test_pred = xgbc.predict(X_test) # predicted label for test data
xgbc_train_accuracy= accuracy_score(y_train,y_train_pred) # train set accuracy
xgbc_test_acc = accuracy_score(y_test,y_test_pred) # model accuracy
print("XGBoost Training set accuracy:",round(xgbc_train_accuracy*100,2),'%')
print("XGBoost Testing set accuracy:",round(xgbc_test_acc*100,2),'%')

XGBoost Training set accuracy: 90.2 %
XGBoost Testing set accuracy: 89.75 %
