# Objective

Predict the rating score of 1 to 5 given the user ratings and product information for women's e-commerce clothing.

In [35]:
import pandas as pd
import sys
import re
import string
import os
import numpy as np
import codecs
from scipy.spatial import distance
from nltk.corpus import stopwords
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

In [3]:
stop = set(stopwords.words('english'))

## Read in data

In [4]:
df = pd.read_csv("/Users/deena/Documents/Summer/NLP/FinalProject/Womens Clothing E-Commerce Reviews.csv", encoding='iso-8859-1')

In [5]:
sum(df['Review Text'].isnull().values)

845

In [6]:
len(df['Review Text'])

23486

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [8]:
# Remove null reviews and null Department Name
df = df.dropna(subset=['Review Text', 'Division Name'], how='any').reset_index()
len(df)

22628

## Functions for processing reviews

In [9]:
def load_glove(filename):
    """
    Read all lines from the indicated file and return a dictionary
    mapping word:vector where vectors are of numpy `array` type.
    GloVe file lines are of the form:

    the 0.418 0.24968 -0.41242 0.1217 ...

    So split each line on spaces into a list; the first element is the word
    and the remaining elements represent factor components. The length of the vector
    should not matter; read vectors of any length.
    """
    d = {}
    with open(filename, 'r') as f:
        text = f.readlines()
        for line in text:
            l = line.split(' ')
            d[l[0]] = []
            for i in range(1,len(l)):
                d[l[0]].append(float(l[i].strip()))
            d[l[0]] = np.array(d[l[0]])

    return d

In [10]:
# Function to clean reviews and get words
def words(text):
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)
    words = nopunct.split(' ')
    words = [w for w in words if len(w) > 2]
    words = [w.lower() for w in words]
    goodwords = [w for w in words if w not in stop]

    return goodwords

In [11]:
# Create word embedding from text
def doc2vec(text, gloves):
    """
    Return the word vector centroid for the text. Sum the word vectors
    for each word and then divide by the number of words. Ignore words
    not in gloves.
    """
    word = words(text)
    array_sum = np.zeros(len(list(gloves.values())[0]))
    cnt = 0
    for w in word:
        if w in gloves:
            array_sum = np.add(gloves[w], array_sum)
            cnt += 1

    if cnt != 0:
        return array_sum/cnt
    else:
        return array_sum

## Feature Engineering

In [12]:
df['Title_review'] = df['Title'].fillna('') + ' ' + df['Review Text']

In [13]:
# Label encoding Categorical columns

lb_make = LabelEncoder()
df['division_name_encoded'] = lb_make.fit_transform(df['Division Name'])
df['dept_name_encoded'] = lb_make.fit_transform(df['Department Name'])
df['class_name_encoded'] = lb_make.fit_transform(df['Class Name'])

In [14]:
# Create embeddings of review text

glove = load_glove('./glove.6B/glove.6B.100d.txt')
embedding = [doc2vec(review, glove) for review in  df['Title_review']]

In [15]:
df.head()

Unnamed: 0.1,index,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Title_review,division_name_encoded,dept_name_encoded,class_name_encoded
0,0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...,2,2,5
1,1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...,0,1,3
2,2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...,0,1,3
3,3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps...",1,0,13
4,4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...,0,4,0


In [16]:
features = []
for i in range(0, len(df)):
    features.append(embedding[i] + [df['division_name_encoded'][i] +  df['Clothing ID'][i] 
                    + df['dept_name_encoded'][i] + df['class_name_encoded'][i] + df['Age'][i]])

## Classification Model to predict Rating

In [17]:
# Divide into train and test
X_train, X_test, Y_train, Y_test = train_test_split(embedding,
                                          df['Rating'], test_size = 0.2, random_state=42)

In [18]:
# ------------ Random Forest

forest = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 2)
forest = forest.fit(X_train, Y_train)

predictions = forest.predict(X_test)


print("Accuracy: ", accuracy_score(Y_test, predictions))

Accuracy:  0.5720282810428634


In [32]:
# ------------ Logistic Regression

logisticRegr = LogisticRegression(C=10, penalty = 'l2')
logisticRegr.fit(X_train, Y_train)

predictions = logisticRegr.predict(X_test)

print("Accuracy: ", accuracy_score(Y_test, predictions))


Accuracy:  0.603844454264251


## Trying MLP - Multi Layer Perceptron

In [86]:
# Divide into train and test
x_train, x_test, y_train, y_test = train_test_split(embedding,
                                          df['Rating'], test_size = 0.2, random_state=42)

In [87]:
x_train = np.array(x_train)
x_test = np.array(x_test)

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test,)

In [88]:
print('Building model...')

model = Sequential()

#model.add(Dense(2,input_shape=(100,)))
#model.add(Activation('softmax'))
#model.add(Dense(2)) 
#model.add(Activation('sigmoid'))

model.add(Dense(input_dim=100, output_dim=12, activation='sigmoid'))
model.add(Dense(output_dim=6, activation='softmax'))

print("Training model...")

model.compile(
              loss='binary_crossentropy', # TODO: What should the loss function be?
              optimizer='adam',
              metrics=['accuracy'])

batch_size = 32
epochs = 5

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=True,
                    validation_split=0.1)

Building model...
Training model...


  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Train on 16291 samples, validate on 1811 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [89]:
loss, accuracy = model.evaluate(x_test, y_test,
                                batch_size=batch_size, 
                                verbose=True)

print(f'Test Loss: {loss:.3}')
print(f'Test accuracy: {accuracy:.3}')

Test Loss: 0.289
Test accuracy: 0.881
