# Rating Prediction for Women’s <br> E-commerce Clothing Reviews with RNN 
<br>__Gabriela Ayala__


## RNN using TF-IDF as feature engineering

The model outputs a value between 0 and 1 that represents the rating, rescaled from the original 1–5 star scale.

In [None]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# Load the raw review table from disk.
full_df = pd.read_csv(
    'Womens_Clothing_E-Commerce_Reviews.csv',
    encoding='latin-1',
)
# Merge title and body into one text field separated by a single space;
# a missing title or body contributes an empty string instead of NaN.
full_df['Review'] = (
    full_df['Title'].fillna('')
    .add(' ')
    .add(full_df['Review Text'].fillna(''))
)
# Work on an independent two-column copy so later edits never touch full_df.
data = full_df.loc[:, ["Review", "Rating"]].copy()

In [None]:
# Stop-word set used by preprocess(): NLTK's English list without the negations
# 'no' and 'not' (so they survive filtering), extended with garment/category
# vocabulary.
# NOTE(review): preprocess() compares single whitespace-separated tokens against
# this set, so the two-word entries 'casual bottom' and 'fine gauge' can never
# match as written -- confirm whether they should be split into separate words.
custom_stop_words = (set(stopwords.words('english')) - {'no', 'not'}) | {
    'knit', 'lounge', 'trend', 'casual bottom', 'skirt', 'outerwear',
    'sweater', 'intimate', 'jacket', 'fine gauge', 'blouse', 'legwear', 'swim',
    'pant', 'chemise', 'sleep', 'layering', 'dress', 'jean', 'short', 'top', 'shirt',
}

def preprocess(text):
    """Normalise one review string for vectorisation.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted (not replaced by a space), and the remaining
    whitespace-separated tokens that appear in ``custom_stop_words`` are
    dropped. Surviving tokens are re-joined with single spaces.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # One translate() pass deletes all ASCII punctuation characters.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    kept_tokens = (
        token for token in cleaned.split() if token not in custom_stop_words
    )
    return ' '.join(kept_tokens)

In [None]:
# Clean every combined review text; the result is the input to both the
# TF-IDF vectorizer and the tokenizer.
reviews = data['Review'].map(preprocess)

In [None]:
# Compute TF-IDF features, keeping at most 10,000 vocabulary terms.
# The sparse result is densified with .toarray(), so float32 is requested up
# front: it halves the size of the dense matrix compared with the float64
# default, and Keras works in float32 by default anyway.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split, so its vocabulary and IDF weights also see the test rows -- consider
# fitting on the training portion only.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, dtype=np.float32)
tfidf_features = tfidf_vectorizer.fit_transform(reviews).toarray()

In [None]:
# The recurrent layer expects 3-D input, so insert a length-1 middle axis:
# (samples, features) -> (samples, 1, features). Each review becomes a
# one-step "sequence" whose single step is its full TF-IDF vector.
tfidf_features = tfidf_features[:, np.newaxis, :]

In [None]:
# Rescale the rating with (rating - 1) / 4 so that a rating of 1 maps to 0.0
# and a rating of 5 maps to 1.0, matching the range of a sigmoid output.
labels = data['Rating'].sub(1).div(4.0)

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
# stratify=labels keeps the proportion of each rating the same in both
# partitions, consistent with the split used in the word-embedding section.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    tfidf_features,
    labels,
    train_size=0.7,
    random_state=100,
    stratify=labels,
)

In [None]:
# Model for the TF-IDF features. No embedding layer is involved here, so the
# previously assigned (and never used) embedding_dim is not defined in this cell.
# Input: one time step holding the whole TF-IDF vector of a review.
inputs_layer = tf.keras.Input(shape=(1, tfidf_features.shape[2]))
# Bidirectional GRU with 128 units per direction; return_sequences=True keeps
# the time axis, which Flatten then collapses before the output layer.
gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(inputs_layer)
flatten = tf.keras.layers.Flatten()(gru)
# A single sigmoid unit yields a score in [0, 1], the same range as the labels.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)
model = tf.keras.Model(inputs_layer, outputs)

In [None]:
# Regression set-up: minimise mean squared error, report mean absolute error.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae'],
)

# Fit for up to 20 epochs, holding out 20% of the training data for
# validation. Training stops once validation MAE has not improved for
# 5 epochs, and the weights of the best epoch are restored.
history = model.fit(
    x=train_inputs,
    y=train_labels,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_mae',
            restore_best_weights=True,
            patience=5,
        ),
    ],
)


In [None]:
# Score the held-out test set. evaluate() returns the loss followed by the
# compiled metrics, here MSE and MAE. Both are on the normalised 0-1 label
# scale, so multiply the MAE by 4 to read it in rating points.
loss, mae = model.evaluate(x=test_inputs, y=test_labels)
print(f"mae = {mae:.4f}")

In [None]:
# Score one hand-written review with the trained TF-IDF model.
example_sentence = "Very beautiful dress"
# New text must go through the same cleaning, the already-fitted vectorizer
# (transform, not fit_transform) and the same (samples, 1, features) layout
# the model was trained on.
preprocessed_sentence = preprocess(example_sentence)
tfidf_example = tfidf_vectorizer.transform([preprocessed_sentence]).toarray()
tfidf_example = tfidf_example.reshape((tfidf_example.shape[0], 1, tfidf_example.shape[1]))
prediction = model.predict(tfidf_example)
# Undo the (rating - 1) / 4 normalisation to get back to the 1-5 scale.
# prediction[0][0] is a NumPy scalar; depending on the NumPy version, round()
# on it may return a NumPy float rather than an int, and multiplying a string
# by a float raises TypeError. Converting to built-in float/int makes the star
# rendering below work regardless of the installed version.
predicted_rating = int(round(float(prediction[0][0]) * 4.0 + 1.0))

full_star = "★"
empty_star = "☆"
print(f"Predicted rating: {predicted_rating * full_star}{(5 - predicted_rating) * empty_star}")
print(f"Prediction (raw): {prediction[0][0]}")

## RNN using word embedding as feature engineering

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Vocabulary cap shared by the tokenizer and the embedding layer.
num_words = 10_000

# Learn the word index from the cleaned reviews, then encode each review as a
# list of integer word ids.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)

In [None]:
# Length of the longest encoded review; every sequence is padded to this size.
# Built-in max over len() avoids the throwaway list and lambda wrapper and
# yields a plain Python int, which is what the Keras shape arguments expect.
max_seq_length = max(map(len, sequences))

print("Max sequence length:", max_seq_length)

In [None]:
# Right-pad each id sequence with zeros up to the longest review, producing a
# rectangular (samples, max_seq_length) integer array.
inputs = pad_sequences(
    sequences,
    padding='post',
    maxlen=max_seq_length,
)

In [None]:
# Targets for the embedding model: the rating rescaled with (rating - 1) / 4,
# so a rating of 1 becomes 0.0 and a rating of 5 becomes 1.0.
labels = data['Rating'].sub(1).div(4.0)

In [None]:
# 70/30 train/test split with a fixed seed; stratifying on the labels keeps
# the share of each rating the same in both partitions.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    inputs,
    labels,
    train_size=0.7,
    stratify=labels,
    random_state=100,
)

In [None]:
# Size of the learned word vectors.
embedding_dim = 128
# Input: one padded sequence of word ids per review.
inputs_layer = tf.keras.Input(shape=(max_seq_length,))
# Map each word id (vocabulary capped at num_words) to an embedding_dim vector.
# The sequence length is already fixed by the Input layer, so the deprecated
# `input_length` argument is not passed.
embedding = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim)(inputs_layer)
# Bidirectional GRU with 128 units per direction; return_sequences=True keeps
# one output per time step, which Flatten concatenates for the output layer.
gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(embedding)
flatten = tf.keras.layers.Flatten()(gru)
# A single sigmoid unit yields a score in [0, 1], the same range as the labels.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)
model = tf.keras.Model(inputs_layer, outputs)

In [None]:
# Regression set-up: minimise mean squared error, report mean absolute error.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae'],
)

# Fit for up to 20 epochs, holding out 20% of the training data for
# validation. Training stops once validation MAE has not improved for
# 5 epochs, and the weights of the best epoch are restored.
history = model.fit(
    x=train_inputs,
    y=train_labels,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_mae',
            restore_best_weights=True,
            patience=5,
        ),
    ],
)