# Rating Prediction for Women’s <br> E-commerce Clothing Reviews with RNN 
## Binary classification for the predictions - experiment
<br>__Gabriela Ayala__


## RNN using TF-IDF as feature engineering for binary classification 
Output is between 0 and 1 (0 representing it not being a 5 star review, 1 is a 5 star review)

In [15]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [16]:
full_df = pd.read_csv('Womens_Clothing_E-Commerce_Reviews.csv', encoding='latin-1')
full_df['Review'] = full_df['Title'].fillna('') + ' ' + full_df['Review Text'].fillna('')
data = full_df[["Review", "Rating"]].copy()

In [17]:
custom_stop_words = set(stopwords.words('english')) - {'no', 'not'}
custom_stop_words.update([
    'knit', 'lounge', 'trend', 'casual bottom', 'skirt', 'outerwear',
    'sweater', 'intimate', 'jacket', 'fine gauge', 'blouse', 'legwear', 'swim',
    'pant', 'chemise', 'sleep', 'layering', 'dress', 'jean', 'short', 'top', 'shirt'
])

def preprocess(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    text = ' '.join([word for word in text.split() if word not in custom_stop_words])
    return text

In [18]:
reviews = data['Review'].apply(preprocess)

In [19]:
# Compute TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [20]:
tfidf_features = tfidf_vectorizer.fit_transform(reviews).toarray()

In [21]:
# Reshape TF-IDF features to fit RNN input requirements
tfidf_features = tfidf_features.reshape((tfidf_features.shape[0], 1, tfidf_features.shape[1]))

In [22]:
labels = np.array(data['Rating'].apply(lambda x: 1 if x == 5 else 0))

In [23]:
train_inputs, test_inputs, train_labels, test_labels = train_test_split(tfidf_features, labels, train_size=0.7, random_state=100)

In [9]:
embedding_dim = 128
inputs_layer = tf.keras.Input(shape=(1, tfidf_features.shape[2]))
gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(inputs_layer)
flatten = tf.keras.layers.Flatten()(gru)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)
model = tf.keras.Model(inputs_layer, outputs)

In [10]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Train the model
history = model.fit(train_inputs, train_labels, validation_split=0.2, 
                    batch_size=32, epochs=20,
                    callbacks=[tf.keras.callbacks.EarlyStopping(
                        monitor='val_accuracy', 
                        patience=5, 
                        restore_best_weights=True)])

Epoch 1/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 59ms/step - accuracy: 0.7168 - loss: 0.5572 - val_accuracy: 0.8072 - val_loss: 0.4324
Epoch 2/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 55ms/step - accuracy: 0.8493 - loss: 0.3444 - val_accuracy: 0.8032 - val_loss: 0.4446
Epoch 3/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 55ms/step - accuracy: 0.8704 - loss: 0.3091 - val_accuracy: 0.7950 - val_loss: 0.4821
Epoch 4/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 55ms/step - accuracy: 0.8872 - loss: 0.2821 - val_accuracy: 0.7865 - val_loss: 0.5245
Epoch 5/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 57ms/step - accuracy: 0.8943 - loss: 0.2672 - val_accuracy: 0.7792 - val_loss: 0.5771
Epoch 6/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 56ms/step - accuracy: 0.9036 - loss: 0.2486 - val_accuracy: 0.7728 - val_loss: 0.6035


In [11]:
# Evaluate the model
model.evaluate(test_inputs, test_labels)

[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8160 - loss: 0.4276


[0.42935043573379517, 0.8123758435249329]

In [24]:
import pickle

In [None]:
model_pkl_file = "rnn_tfidf_binary.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)

In [25]:
with open("rnn_tfidf_binary.pkl", 'rb') as file:  
    model = pickle.load(file)

In [26]:
example_sentence = "This dress is really beautiful but the sizes are innacurate. it doesnt fit me!"
preprocessed_sentence = preprocess(example_sentence)
tfidf_example = tfidf_vectorizer.transform([preprocessed_sentence]).toarray()
tfidf_example = tfidf_example.reshape((tfidf_example.shape[0], 1, tfidf_example.shape[1]))
prediction = model.predict(tfidf_example)
predicted_class = 1 if prediction[0][0] >= 0.5 else 0

if predicted_class == 1:
    print(f"Likely a 5 stars review")
else:
    print(f"Not a 5 stars review")

print(f"Prediction (raw): {prediction[0][0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
Not a 5 stars review
Prediction (raw): 0.3713


## RNN using word embedding as feature engineering for binary classification 

In [None]:
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reviews)

sequences = tokenizer.texts_to_sequences(reviews)

In [None]:
max_seq_length = np.max(list(map(lambda x: len(x), sequences)))

print("Max sequence length:", max_seq_length)

In [None]:
inputs = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [None]:
labels = np.array(data['Rating'].apply(lambda x: 1 if x == 5 else 0))

In [None]:
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, train_size=0.7, random_state=100, stratify=labels)

In [None]:
embedding_dim = 128
inputs_layer = tf.keras.Input(shape=(max_seq_length,))
embedding = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_seq_length)(inputs_layer)
gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(embedding)
flatten = tf.keras.layers.Flatten()(gru)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)
model = tf.keras.Model(inputs_layer, outputs)

In [None]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Train the model
history = model.fit(train_inputs, train_labels, validation_split=0.2, 
                    batch_size=32, epochs=20,
                    callbacks=[tf.keras.callbacks.EarlyStopping(
                        monitor='val_accuracy', 
                        patience=5, 
                        restore_best_weights=True)])

In [None]:
model.evaluate(test_inputs, test_labels)

In [None]:
import pickle
model_pkl_file = "rnn_we_binary.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)

In [None]:
with open(model_pkl_file, 'rb') as file:  
    model = pickle.load(file)