**Sentiment Analysis on Clothing Reviews**

**Import All Required Packages**

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

**Read data from csv**

In [34]:
df = pd.read_csv('clothing_review_rated.csv')
print(len(df))
df.head()

24000


Unnamed: 0,review,Negative,Neutral,Positive
0,Absolutely wonderful silky sexy comfortable,False,False,True
1,Love dress sooo pretty happened find store im ...,False,False,True
2,love love love jumpsuit fun flirty fabulous ev...,False,False,True
3,shirt flattering due adjustable front tie perf...,False,False,True
4,aded basket hte last mintue see would look lik...,False,False,True


**Apply Train Test Split. Add Check Phrases for Evaluation**

In [63]:
X = df['review'].values
y = df.drop('review', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print(len(X_train), len(X_test), len(y_train), len(y_test))

# check phrases fpr later evaluation of results
checks = ["this dress is absolutly gorgeous", 
            "suit is ugly was made by handless taylor",
            "roses were fresh and nice"]
X_test = np.append(X_test, checks)
print(len(X_test), X_test[-3:])

16800 7200 16800 7200
7203 ['this dress is absolutly gorgeous'
 'suit is ugly was made by handless taylor' 'roses were fresh and nice']


**Apply vectorization**

In [64]:
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

**Apply frequency, inverse document frequency:**

In [65]:
# create an instance of TfidfTransformer to convert raw term frequencies into TF-IDF scores.
tfidf = TfidfTransformer()

# fit the transformer to the training data and transform X_train into a TF-IDF weighted matrix.
X_train = tfidf.fit_transform(X_train)
# transform the test data using the parameters learned from the training data (without refitting).
X_test = tfidf.transform(X_test)

# convert the sparse matrix of TF-IDF features for the training data into a dense NumPy array.
X_train = X_train.toarray()
X_test = X_test.toarray()

# extract the last 3 rows to remove check phrases from the data
X_test_checks = X_test[-3:, :]
X_test = X_test[: -3]
print(len(X_train), len(X_test), len( y_train), len(y_test))

16800 7200 16800 7200


**Set up the Model: 12680 inputs and 3 outputs for Neg, Pos and Neutral**

In [66]:
# create an EarlyStopping callback to halt training when the validation loss stops decreasing.
# - monitor: the metric to monitor during training; here it's 'val_loss' (validation loss).
# - mode: 'min' indicates that training should stop when the monitored metric has stopped decreasing (i.e., we are looking for a minimum).
# - verbose: verbosity mode; 1 means that a message is printed when training stops.
# - patience: number of epochs to wait after the last time the monitored metric improved before stopping.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

# Dense (fully connected) layer
# - units: the number of neurons in the layer (12680 in this case).
# - activation: the activation function applied to the output of this layer ('relu' for Rectified Linear Unit).
# Dropout layer:
# - the parameter (0.5) represents the fraction of the input units to drop during training (helps prevent overfitting).
model = Sequential([
    Dense(units=12680,activation='relu'),
    Dropout(0.5),
    Dense(units=4000,activation='relu'),
    Dropout(0.5),
    Dense(units=500,activation='relu'),
    Dropout(0.5),
    Dense(units=3, activation='softmax')
])

opt=tf.keras.optimizers.Adam(learning_rate=0.001)
# compile the model
# - loss: 'categorical_crossentropy' is used for multi-class classification problems.
# - optimizer: the optimizer instance to use during training, here - Adam.
# - metrics: 'accuracy' is used to measure classification accuracy.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])


**Fit the Model**

In [67]:
# train the model 
# x: Input reviews used for training (X_train).
# y: Target review labels used for training (y_train).
# batch_size: Number of samples per gradient update,256 samples are processed before updating the model.
# epochs: The number of complete passes through the training dataset.
# validation_data: A tuple (X_test, y_test) used to evaluate the model at the end of each epoch.
# verbose: Controls the verbosity of the training output (0,1,2) 1 displays a progress bar and epoch details.
# callbacks: List of callback functions to apply during training; here, early_stop will halt training if validation loss stops improving.
model.fit(x=X_train, y=y_train, batch_size=256, epochs=100, validation_data=(X_test, y_test), verbose=1, callbacks=early_stop)

Epoch 1/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 1s/step - accuracy: 0.5909 - loss: 0.8033 - val_accuracy: 0.8889 - val_loss: 0.3229
Epoch 2/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 1s/step - accuracy: 0.9626 - loss: 0.1177 - val_accuracy: 0.9146 - val_loss: 0.2682
Epoch 3/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 1s/step - accuracy: 0.9944 - loss: 0.0216 - val_accuracy: 0.9164 - val_loss: 0.3403
Epoch 4/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 1s/step - accuracy: 0.9984 - loss: 0.0065 - val_accuracy: 0.9149 - val_loss: 0.4129
Epoch 4: early stopping


<keras.src.callbacks.history.History at 0x26f6df7d1f0>

**Evaluation of Results**

In [68]:
model_score = model.evaluate(X_test, y_test, batch_size=64, verbose=1)
print('Test accuracy:', model_score[1])

# reviews on which we need to predict
pred = model.predict(X_test_checks)
pred = np.array([np.argmax(i) for i in pred])
for i in range(len(checks)):
    print(checks[i], '\t:', pred[i])


[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.9176 - loss: 0.4036
Test accuracy: 0.9148610830307007
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
this dress is absolutly gorgeous 	: 2
suit is ugly was made by handless taylor 	: 0
roses were fresh and nice 	: 2
