In [60]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import *
import pandas as np

In [61]:
import warnings
# Silence every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides any warnings emitted by the
# model fits further down (for example solver convergence warnings, if any
# are raised) -- consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

In [62]:
def readGz(path):
    """Lazily yield one parsed record per line of a gzip-compressed text file.

    Every line is evaluated as a Python expression. Lines that fail to
    evaluate are skipped silently (best-effort parsing), so the number of
    records yielded can be smaller than the number of lines in the file.

    Parameters
    ----------
    path : str
        Location of the gzip file; it is opened in text mode.

    Yields
    ------
    object
        The value each successfully evaluated line produces.
    """
    # The context manager releases the file handle even when the consumer
    # stops iterating before the end of the file.
    with gzip.open(path, 'rt') as f:
        for l in f:
            try:
                # SECURITY: eval() executes arbitrary code -- only point this
                # at trusted files. It is kept on purpose: a stricter parser
                # (json.loads, ast.literal_eval) would change which lines are
                # accepted or skipped and therefore the resulting dataset.
                record = eval(l)
            except Exception:
                # Narrower than a bare `except:` so KeyboardInterrupt and
                # SystemExit still propagate.
                continue
            # Yield outside the try block: a handler wrapped around `yield`
            # would also swallow GeneratorExit and make generator.close()
            # raise "generator ignored GeneratorExit".
            yield record

In [63]:
def readCSV(path):
    """Yield (u, b, r) triples from a gzip-compressed, comma-separated file.

    The first line is treated as a header and discarded. Every following
    non-blank line must contain exactly three comma-separated fields; the
    third field is converted to ``int``.

    Parameters
    ----------
    path : str
        Location of the gzip file; it is opened in text mode.

    Yields
    ------
    tuple
        ``(u, b, r)`` where ``u`` and ``b`` are strings and ``r`` is an int.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three fields or the third
        field is not an integer.
    """
    # `with` closes the handle once iteration finishes or is abandoned.
    with gzip.open(path, 'rt') as f:
        f.readline()  # skip the header row
        for l in f:
            l = l.strip()
            if not l:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing on the tuple unpack below.
                continue
            u, b, r = l.split(',')
            yield u, b, int(r)

In [64]:
# Materialise every record that readGz manages to parse into one in-memory list.
dataset = list(readGz("renttherunway_final_data.json.gz"))


In [65]:
# Sanity check: number of records successfully parsed from the file.
len(dataset)

192462

In [66]:
def heightConversion(h):
    """Convert a height string such as 5' 8" into a total number of inches.

    The input must be a feet figure, an apostrophe followed by one space,
    and an inches figure; any double-quote characters in the inches part
    are dropped before conversion.
    """
    feet_text, inches_text = h.split("' ")
    feet = int(feet_text)
    inches = int(inches_text.replace('"', ''))
    return 12 * feet + inches

In [67]:
new_dataset = []
punctuation = set(string.punctuation)

# A record is kept only if it carries every one of these fields.
# (Hoisted out of the loop: the list never changes between iterations.)
feature_needed = ['weight', 'height', 'fit', 'size', 'body type', 'review_text', 'review_summary', 'rating']

for d in dataset:
    if all(key in d for key in feature_needed):
        # Clean a shallow copy rather than the raw record. Mutating `d` in
        # place would leave `dataset` half-converted and make this cell fail
        # on a second run (an int weight has no .replace()).
        cleaned = dict(d)
        # Weight: strip the "lbs" unit marker and convert to int.
        cleaned['weight'] = int(d['weight'].replace("lbs", ""))
        # Height: feet/inches string converted to total inches.
        cleaned['height'] = heightConversion(d['height'])
        new_dataset.append(cleaned)

In [68]:
# Number of records that had every required field and survived cleaning.
len(new_dataset)

153441

In [69]:
# Target labels for the baseline model: the 'fit' field of each cleaned record.
y = [record['fit'] for record in new_dataset]

In [70]:
# Hold out 30% of the cleaned records, then halve that hold-out into
# validation and test sets. Both splits use the same fixed seed.
dataTrain, data_vt, ytrain, y_vt = train_test_split(
    new_dataset, y, random_state=42, test_size=0.3
)
dataValid, dataTest, yvalid, ytest = train_test_split(
    data_vt, y_vt, random_state=42, test_size=0.5
)

In [71]:
def _baseline_row(d):
    """Baseline feature row for one record: bias term, weight, height, size."""
    return [1, d['weight'], d['height'], d['size']]


# Same row layout for all three splits.
Xtrain = list(map(_baseline_row, dataTrain))
Xvalid = list(map(_baseline_row, dataValid))
Xtest = list(map(_baseline_row, dataTest))

In [72]:
def accuracy(pred, y):
    """Return the fraction of predictions that equal the matching label.

    Parameters
    ----------
    pred : sequence
        Predicted labels (list, tuple or 1-D array).
    y : sequence
        True labels, same length as ``pred``.

    Returns
    -------
    float
        Number of positions where ``pred`` and ``y`` agree, divided by
        ``len(y)``.

    Raises
    ------
    ValueError
        If ``pred`` and ``y`` differ in length.
    ZeroDivisionError
        If ``y`` is empty (accuracy is undefined).
    """
    if len(pred) != len(y):
        raise ValueError(
            "pred and y must have the same length ({} != {})".format(len(pred), len(y))
        )
    # Compare position by position. `pred == y` is elementwise only when one
    # side is an array; for two plain lists it collapses to a single bool,
    # which sum() cannot iterate over.
    correct = sum(1 for p, t in zip(pred, y) if p == t)
    return correct / len(y)

In [73]:
# Baseline Model - using only weight, height and size
# Fit one logistic regression per value of C and score each on the
# validation split. Models and scores are kept keyed by C; `accura` keeps
# the scores in sweep order.
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
mods, accs, accura = {}, {}, []
for i in Cs:
    baselineMod = linear_model.LogisticRegression(solver='newton-cg', C=i)
    baselineMod.fit(Xtrain, ytrain)
    blPred = baselineMod.predict(Xvalid)
    blacc = accuracy(blPred, yvalid)
    mods[i], accs[i] = baselineMod, blacc
    accura.append(blacc)
    print("C = {}, accuracy is {}.".format(str(i), str(blacc)))

C = 0.0001, accuracy is 0.7334028501911714.
C = 0.001, accuracy is 0.7330118178658325.
C = 0.01, accuracy is 0.7330118178658325.
C = 0.1, accuracy is 0.7330552659019812.
C = 1, accuracy is 0.7330552659019812.
C = 10, accuracy is 0.7330552659019812.
C = 100, accuracy is 0.7330552659019812.
C = 1000, accuracy is 0.7330552659019812.


In [74]:
# Best validation accuracy reached across the C sweep.
max(accura)

0.7334028501911714

In [75]:
# Labels for the text model: the 'fit' field of every parsed record
# (the unfiltered `dataset`, not `new_dataset`).
y = [record['fit'] for record in dataset]

In [76]:
# Re-split over every parsed record: 30% held out, then halved into
# validation and test. This rebinds the split variables used by the baseline.
dataTrain, data_vt, ytrain, y_vt = train_test_split(
    dataset, y, random_state=42, test_size=0.3
)
dataValid, dataTest, yvalid, ytest = train_test_split(
    data_vt, y_vt, random_state=42, test_size=0.5
)
tuple(map(len, (dataTrain, dataValid, dataTest)))

(134723, 28869, 28870)

In [77]:
# Count how often each stemmed token occurs in the training reviews.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
# The NLTK stop-word list is stored under the lower-case file id "english".
# The capitalised spelling only resolves on case-insensitive filesystems.
sw = stopwords.words("english")
# NOTE(review): `sw` holds unstemmed words while the keys of `wordCount` are
# stems, so the stop-word filter applied to `wordCount` afterwards misses any
# stop word whose stem differs from its surface form (presumably e.g.
# "this" -> "thi"). Left unchanged here so the vocabulary stays the same.
for d in dataTrain:
    # Lower-case the review and drop punctuation characters before splitting.
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[stemmer.stem(w)] += 1

len(wordCount)

32111

In [78]:
# (count, word) pairs for every counted word not in the stop-word list,
# ordered from most to least frequent. Pairs are unique because words are
# unique, so descending sort equals ascending sort followed by a reverse.
counts = sorted(
    ((wordCount[w], w) for w in wordCount if w not in sw),
    reverse=True,
)
len(counts)

31996

In [79]:
# Vocabulary: the 5000 highest-ranked words from `counts`.
words = [word for _count, word in counts[:5000]]

In [80]:
# Map each vocabulary word to its column index, plus a set of the same words.
wordId = {word: position for position, word in enumerate(words)}
wordSet = set(wordId)

In [81]:
# Largest and smallest integer rating found anywhere in `dataset`.
# The infinite defaults mirror the starting values used when the data is empty.
ratings_seen = [int(d['rating']) for d in dataset]
max_rating = max(ratings_seen, default=float('-inf'))
min_rating = min(ratings_seen, default=float('inf'))
max_rating, min_rating

(10, 2)

In [82]:
def feature(d):
    """Build the feature vector for one review record.

    Layout of the returned list:
      * a constant 1 (bias term),
      * one count per vocabulary word (``len(words)`` entries): how many
        times its stem occurs in ``d['review_text']``,
      * a one-hot encoding of ``d['rating']`` over the range
        ``min_rating..max_rating`` with the first slot (rating equal to
        ``max_rating``) dropped.

    Relies on the module-level names ``words``, ``wordId``, ``punctuation``,
    ``stemmer``, ``max_rating`` and ``min_rating``.
    """
    feat = [0] * len(words)
    # Same normalisation as the word counting: lower-case, strip punctuation.
    r = ''.join(c for c in d['review_text'].lower() if c not in punctuation)
    for w in r.split():
        # Dict lookup instead of `w in words`: membership in the vocabulary
        # list is a linear scan for every single token.
        idx = wordId.get(stemmer.stem(w))
        if idx is not None:
            feat[idx] += 1
    feat2 = [0] * (max_rating - min_rating + 1)
    # Slot 0 corresponds to the maximum rating; it is removed below.
    feat2[max_rating - int(d['rating'])] = 1
    return [1] + feat + feat2[1:]

In [83]:
# Feature matrix for the training split.
Xtrain = list(map(feature, dataTrain))

In [84]:
# Feature matrix for the validation split.
Xvalid = list(map(feature, dataValid))

In [85]:
# Feature matrix for the test split.
Xtest = list(map(feature, dataTest))

In [86]:
# Logistic regression on the bag-of-words + rating features, with C=1 and
# every other hyper-parameter left at the library default.
model = linear_model.LogisticRegression(C=1)
# The fit call is the cell's last expression, so the estimator is displayed.
model.fit(Xtrain, ytrain)

LogisticRegression(C=1)

In [87]:
# Score the fitted model on the validation split.
predictValid = model.predict(Xvalid)
accValid = accuracy(predictValid, yvalid)
accValid

0.8023831791887491

In [88]:
# Score the fitted model on the held-out test split.
predictTest = model.predict(Xtest)
accTest = accuracy(predictTest, ytest)
accTest

0.7934534118462071

In [89]:
# Score the fitted model on its own training split, for comparison with the
# validation and test accuracies.
predictTrain = model.predict(Xtrain)
accTrain = accuracy(predictTrain, ytrain)
accTrain

0.814820038152357