<a href="https://colab.research.google.com/github/devpatel25/ML_Algorithms/blob/main/Women_Cloth_Review_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Women Cloth Review Prediction

---

## Objective
 The aim of this project is to predict cloth rating on the basis of the feature variables.

In [1]:
# import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import Dataset
df=pd.read_csv('https://github.com/AFAgarap/ecommerce-reviews-analysis/raw/master/Womens%20Clothing%20E-Commerce%20Reviews.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


##Missing Values

removing the missing values with No Review text.

In [5]:
df['Review Text'].isna().sum()

845

In [6]:
df[df['Review Text']==""]=np.nan
df['Review Text'].fillna("No Review",inplace=True)

In [7]:
df['Review Text'].isna().sum()

0

In [8]:
df['Review Text']

Unnamed: 0,Review Text
0,Absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...
...,...
23481,I was very happy to snag this dress at such a ...
23482,"It reminds me of maternity clothes. soft, stre..."
23483,"This fit well, but the top was very see throug..."
23484,I bought this dress for a wedding i have this ...


# Define target y and feature x.

In [9]:
df.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [10]:
x=df['Review Text']
y=df['Rating']

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.7,random_state=2529,stratify=y)

#Converting text features into tokens.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(lowercase=True,analyzer='word',ngram_range=(2,3),stop_words='english',max_features=5000)
x_train=cv.fit_transform(x_train)

In [13]:
cv.get_feature_names_out()

array(['10 12', '10 bought', '10 fit', ..., 'yes runs', 'yoga pants',
       'zipper little'], dtype=object)

In [14]:
x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
x_test=cv.fit_transform(x_test)

# Get Model Train

In [16]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(x_train,y_train)

In [17]:
model.fit(x_train,y_train)

In [18]:
y_pred=model.predict(x_test)
y_pred.shape

(16441,)

In [19]:
y_pred

array([3, 4, 2, ..., 5, 4, 3])

# Get Probability of each Pridiction

In [20]:
model.predict_proba(x_test)

array([[2.50243939e-01, 1.21534354e-01, 4.79327448e-01, 1.27392454e-02,
        1.36155013e-01],
       [7.07567572e-02, 4.58818848e-02, 2.63088820e-01, 6.12325394e-01,
        7.94714345e-03],
       [2.01162807e-01, 3.38200504e-01, 2.95746505e-01, 9.24481824e-02,
        7.24420016e-02],
       ...,
       [3.91842500e-03, 2.98245742e-03, 3.09715894e-04, 7.72959108e-03,
        9.85059811e-01],
       [1.61530659e-01, 3.51476805e-02, 5.52149762e-02, 5.43909305e-01,
        2.04197379e-01],
       [1.26442837e-01, 4.89792929e-02, 3.28750971e-01, 2.68648730e-01,
        2.27178169e-01]])

#Model Evaluation

In [21]:
from sklearn.metrics import confusion_matrix,classification_report

In [22]:
print(confusion_matrix(y_test,y_pred))

[[  37   76   66  130  280]
 [  83  166  182  203  462]
 [ 196  285  342  396  791]
 [ 386  392  457  695 1624]
 [ 885  837  963 1584 4923]]


In [23]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.02      0.06      0.03       589
           2       0.09      0.15      0.12      1096
           3       0.17      0.17      0.17      2010
           4       0.23      0.20      0.21      3554
           5       0.61      0.54      0.57      9192

    accuracy                           0.37     16441
   macro avg       0.23      0.22      0.22     16441
weighted avg       0.42      0.37      0.39     16441



In [27]:
df.replace({'Rating' : {1:0, 2:0, 3:0, 4:1, 5:1}},inplace = True)

In [29]:
y=df['Rating']

In [30]:
x=df['Review Text']

In [31]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.7,random_state=2529,stratify=y)

In [32]:
cv=CountVectorizer(lowercase=True,analyzer='word',ngram_range=(2,3),stop_words='english',max_features=5000)
x_train=cv.fit_transform(x_train)
x_test=cv.fit_transform(x_test)

#Get model Retrain

In [33]:
model=MultinomialNB()
model.fit(x_train,y_train)

In [34]:
y_pred=model.predict(x_test)
y_pred.shape

(16441,)

In [35]:
y_pred

array([0, 0, 1, ..., 1, 1, 0])

#Get Model Evaluation

In [36]:
print(confusion_matrix(y_test,y_pred))

[[1125 2570]
 [2756 9990]]


In [37]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.29      0.30      0.30      3695
           1       0.80      0.78      0.79     12746

    accuracy                           0.68     16441
   macro avg       0.54      0.54      0.54     16441
weighted avg       0.68      0.68      0.68     16441





---

