**Import libraries**

In [163]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Import Dataset**

In [164]:
df = pd.read_csv('https://raw.githubusercontent.com/YBIFoundation/ProjectHub-MachineLearning/main/Women%20Clothing%20E-Commerce%20Review.csv')
df.head()
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Clothing ID        23486 non-null  int64 
 1   Age                23486 non-null  int64 
 2   Title              19676 non-null  object
 3   Review             22641 non-null  object
 4   Rating             23486 non-null  int64 
 5   Recommended        23486 non-null  int64 
 6   Positive Feedback  23486 non-null  int64 
 7   Division           23472 non-null  object
 8   Department         23472 non-null  object
 9   Category           23472 non-null  object
dtypes: int64(5), object(5)
memory usage: 1.8+ MB


(23486, 10)

<b>Missing values</b>

In [165]:
df.isna().sum()
df[df['Review']==""] = np.NaN
df['Review'].fillna("No Review", inplace = True)
df.isna().sum()

Clothing ID             0
Age                     0
Title                3810
Review                  0
Rating                  0
Recommended             0
Positive Feedback       0
Division               14
Department             14
Category               14
dtype: int64

<b> Define target(y) and Feature(X) <b>

In [166]:
df.columns
X = df['Review']
y = df['Rating']
df['Rating'].value_counts()

5.0    13131
4.0     5077
3.0     2871
2.0     1565
1.0      842
Name: Rating, dtype: int64

<b> Train test split </b>

In [167]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify = y, random_state = 2529)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16440,), (7046,), (16440,), (7046,))

<b> Get Feature text conversion to tokens </b>

In [168]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(lowercase = True, analyzer = 'word', ngram_range=(2,3), stop_words='english', max_features=5000)
X_train = cv.fit_transform(X_train)
cv.get_feature_names_out()
X_train.toarray()
X_test = cv.fit_transform(X_test)
cv.get_feature_names_out()
X_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

<b> Get Model train </b>

In [169]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

<b> Get Model prediction </b>

In [170]:
y_pred = model.predict(X_test)
y_pred.shape
y_pred

array([1., 5., 5., ..., 5., 5., 5.])

<b> Get probability of each predicted class </b>

In [171]:
model.predict_proba(X_test)

array([[0.71118473, 0.02625165, 0.15465118, 0.01496876, 0.09294369],
       [0.02416867, 0.04769471, 0.35268622, 0.16185007, 0.41360034],
       [0.03582725, 0.06660584, 0.12226277, 0.21618005, 0.55912409],
       ...,
       [0.02320281, 0.08950939, 0.08962183, 0.16719203, 0.63047394],
       [0.01167675, 0.00202714, 0.08539004, 0.34347398, 0.55743209],
       [0.03959824, 0.05612822, 0.00688869, 0.1560574 , 0.74132745]])

<b> Get model Evaluation </b>

In [172]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  15   13   45   36  144]
 [  43   43   86   85  213]
 [ 116   78  113  166  388]
 [ 166  108  194  336  719]
 [ 371  272  349  722 2225]]
              precision    recall  f1-score   support

         1.0       0.02      0.06      0.03       253
         2.0       0.08      0.09      0.09       470
         3.0       0.14      0.13      0.14       861
         4.0       0.25      0.22      0.23      1523
         5.0       0.60      0.56      0.58      3939

    accuracy                           0.39      7046
   macro avg       0.22      0.21      0.21      7046
weighted avg       0.42      0.39      0.40      7046



<b> Recatagorize ratings as poor(0) or good(1) </b>

In [173]:
df['Rating'].value_counts()
# rerating 1, 2, 3 as 0 and 4, 5 as 1
df.replace({'Rating': {1: 0, 2: 0, 3: 0, 4:1, 5: 1}}, inplace = True)
y = df['Rating']

<b> Train Test Split </b>

In [182]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify = y, random_state = 2529)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16440,), (7046,), (16440,), (7046,))

<b> Get feature text conversions to tokens </b>

In [183]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(lowercase = True, analyzer = 'word', ngram_range=(2,3), stop_words='english', max_features=5000)
X_train = cv.fit_transform(X_train)
cv.get_feature_names_out()
X_train.toarray()
X_test = cv.fit_transform(X_test)
cv.get_feature_names_out()
X_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

<b> Get Model re-train </b>

In [184]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

<b> Get model prediction </b>

In [185]:
y_pred = model.predict(X_test)
y_pred.shape
y_pred

array([1., 1., 1., ..., 1., 1., 1.])

<b> Get model prediction </b>

In [186]:
model.predict_proba(X_test)
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 449 1134]
 [ 989 4474]]
              precision    recall  f1-score   support

         0.0       0.31      0.28      0.30      1583
         1.0       0.80      0.82      0.81      5463

    accuracy                           0.70      7046
   macro avg       0.56      0.55      0.55      7046
weighted avg       0.69      0.70      0.69      7046

