In [103]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import *
import pandas

In [32]:
import warnings
warnings.filterwarnings("ignore")

In [33]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        try:
            yield eval(l)
        except:
            continue

In [34]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [35]:
dataset = []
for d in readGz("renttherunway_final_data.json.gz"):
    dataset.append(d)


In [36]:
len(dataset)

192462

In [37]:
def heightConversion(h):
    ft, inch = h.split('\' ')
    ft = int(ft)
    inch = int(inch.replace('\"', ''))
    return ft * 12 + inch

In [38]:
new_dataset = []
punctuation = set(string.punctuation)

for d in dataset:
    feature_needed = ['weight', 'height', 'fit', 'size', 'body type', 'review_text', 'review_summary', 'rating']
    keys = list(d.keys())
    if all([i in keys for i in feature_needed]):
        d['weight'] = int(d['weight'].replace("lbs", ""))
        d['height'] = heightConversion(d['height'])
        d['review_text'] = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
        d['review_summary'] = ''.join([c for c in d['review_summary'].lower() if not c in punctuation])
        new_dataset.append(d)

In [41]:
len(new_dataset)

153441

In [138]:
y = [int(d['rating']) for d in new_dataset]

In [150]:
dataTrain, data_vt, ytrain, y_vt = train_test_split(new_dataset, y, test_size=0.3, random_state=42)
dataValid, dataTest, yvalid, ytest = train_test_split(data_vt, y_vt, test_size=0.5, random_state=42)

In [152]:
Xtrain = [[1, d['weight'], d['height'], d['size']] for d in dataTrain]
Xvalid = [[1, d['weight'], d['height'], d['size']] for d in dataValid]
Xtest = [[1, d['weight'], d['height'], d['size']] for d in dataTest]

In [45]:
def accuracy(pred, y):
    correct = pred == y
    return sum(correct)/len(y)

In [120]:
# Baseline Model - using only weight, height and size
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
mods = {}
accs = {}
accura = []
for i in Cs:
    baselineMod = linear_model.LogisticRegression(C=i, solver='newton-cg')
    baselineMod.fit(Xtrain,ytrain)
    blPred = baselineMod.predict(Xvalid)
    blacc = accuracy(blPred, yvalid)
    mods[i] = baselineMod
    accs[i] = blacc
    accura.append(blacc)
    print("C = {}, accuracy is {}.".format(str(i), str(blacc)))

C = 0.0001, accuracy is 0.7334028501911714.
C = 0.001, accuracy is 0.7330118178658325.
C = 0.01, accuracy is 0.7330118178658325.
C = 0.1, accuracy is 0.7330552659019812.
C = 1, accuracy is 0.7330552659019812.
C = 10, accuracy is 0.7330552659019812.
C = 100, accuracy is 0.7330552659019812.
C = 1000, accuracy is 0.7330552659019812.


In [123]:
max(accura)

0.7334028501911714

In [153]:
wordCount = defaultdict(int)
stemmer = PorterStemmer()
sw = stopwords.words("English")
for d in new_dataset:
    for w in d['review_text'].split():
        w = stemmer.stem(w)
        wordCount[w] += 1

len(wordCount)

35538

In [154]:
counts = [(wordCount[w],w) for w in wordCount if w not in sw]
counts.sort()
counts.reverse()

In [155]:
words = [x[1] for x in counts[:5000]]

In [156]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [157]:
body_types = set()
for d in new_dataset:
    body_types.add(d['body type'])
body_types = list(body_types)
body_types = dict(zip(body_types, range(0, len(body_types))))
body_types

{'petite': 0,
 'apple': 1,
 'full bust': 2,
 'athletic': 3,
 'straight & narrow': 4,
 'pear': 5,
 'hourglass': 6}

In [158]:
def allFeat(d):
    body_type = [0] * len(body_types)
    feat = [1]
    feat += [0] * len(words)
    for w in d['review_text']:
        if w in words:
            feat[wordId[w]] += 1
    body_type[body_types[d['body type']]] = 1
    feat += [d['weight'], d['height'], d['size'], int(d['rating'])] + body_type[1:]
    return feat

In [159]:
Xtrain2 = [allFeat(d) for d in dataTrain]
Xvalid2 = [allFeat(d) for d in dataValid]
Xtest2 = [allFeat(d) for d in dataTest]

In [160]:
model2 = linear_model.LogisticRegression(C=1)
model2.fit(Xtrain2, ytrain)
ypred2 = model2.predict(Xvalid2)
acc2 = accuracy(ypred2, yvalid)
acc2

0.9470368439346541

In [161]:
ypred3 = model2.predict(Xtest2)
acc3 = accuracy(ypred3, ytest)
acc3

0.9489507755137507

In [162]:
ypred4 = model2.predict(Xtrain2)
acc4 = accuracy(ypred4, ytrain)
acc4

0.9475737375242068