In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from collections import defaultdict
import string
import numpy as np
import gzip
import torch

In [None]:
def preprocess(x):
    """Normalise a review string for bag-of-words vectorisation.

    Tabs and newlines become single spaces, every character listed in
    ``string.punctuation`` is removed, and the result is lower-cased.

    Args:
        x: Raw text as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # A single translation table handles both steps: the first two arguments
    # map '\t' and '\n' to ' ', the third lists the characters to delete.
    cleanup_table = str.maketrans('\t\n', '  ', string.punctuation)
    return x.translate(cleanup_table).lower()

In [None]:
path = 'data/renttherunway_final_data.json.gz'

# Parallel lists: entry i of each list comes from the same input record.
text = []      # preprocessed review_text + ' ' + review_summary
labels = []    # value of the 'fit' field
ratings = []   # value of the 'rating' field, cast to int
num_skipped = 0  # lines that could not be parsed into a record
with gzip.open(path) as f:
    for line in f:
        try:
            # SECURITY NOTE(review): eval() executes whatever expression the
            # line contains. Only run this on a trusted copy of the dataset;
            # consider a data-only parser (json.loads / ast.literal_eval)
            # after confirming the file format.
            record = eval(line)
        except Exception:
            # `except Exception` (not a bare `except:`) so that Ctrl-C /
            # kernel interrupt still stops the loop instead of being
            # swallowed as a "bad line".
            num_skipped += 1
            continue
        review = preprocess(record['review_text'] + ' ' + record['review_summary'])
        text.append(review)
        labels.append(record['fit'])
        ratings.append(int(record['rating']))

# Surface how many lines were dropped instead of discarding them silently.
print('Loaded', len(text), 'reviews; skipped', num_skipped, 'unparseable lines')

In [None]:
numReviews = len(labels)

# Contiguous 80% / 10% / 10% partitions, taken in the order the records
# were read (no shuffling is applied here).
train_part = slice(None, int(numReviews*0.8))
valid_part = slice(int(numReviews*0.8), int(numReviews*0.9))
test_part = slice(int(numReviews*0.9), None)

Xtrain, ytrain = text[train_part], labels[train_part]
Xvalid, yvalid = text[valid_part], labels[valid_part]
Xtest, ytest = text[test_part], labels[test_part]

# Persist the raw-text splits together with their 'fit' labels.
raw_splits = (Xtrain, ytrain, Xvalid, yvalid, Xtest, ytest)
torch.save(raw_splits, 'build/raw.pt')

## Fit Classification

In [None]:
# Word-level TF-IDF over unigrams and bigrams; terms appearing in fewer than
# two training documents are dropped (min_df=2).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    # TF-IDF weights are real-valued. A non-float dtype such as 'int32' is
    # not honoured: scikit-learn emits a UserWarning and returns float64
    # anyway. Requesting float64 explicitly yields the same matrices
    # without the warning.
    dtype=np.float64,
    analyzer='word',
    strip_accents='unicode',
    decode_error='replace',
    min_df=2
)

# Vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the validation and test splits.
Xtrain_trans = vectorizer.fit_transform(Xtrain)
Xvalid_trans = vectorizer.transform(Xvalid)
Xtest_trans = vectorizer.transform(Xtest)

In [None]:
# Number of highest-scoring TF-IDF features to keep.
top = 1000
# Score every feature against the 'fit' labels with f_classif, using the
# training split only.
selector = SelectKBest(score_func=f_classif, k=top)
selector.fit(Xtrain_trans, ytrain)

In [None]:
# Reduce each split to the columns chosen by the fitted selector and cast
# the result to float32.
Xtrain, Xvalid, Xtest = (
    selector.transform(matrix).astype('float32')
    for matrix in (Xtrain_trans, Xvalid_trans, Xtest_trans)
)

# Persist the selected features together with the 'fit' labels.
torch.save((Xtrain, ytrain, Xvalid, yvalid, Xtest, ytest), 'build/extracted.pt')

## Rating Regression

In [None]:
numReviews = len(ratings)

# Same contiguous 80% / 10% / 10% partitioning of the review text, this time
# paired with the integer ratings as targets.
train_part = slice(None, int(numReviews*0.8))
valid_part = slice(int(numReviews*0.8), int(numReviews*0.9))
test_part = slice(int(numReviews*0.9), None)

Xtrain, ytrain = text[train_part], ratings[train_part]
Xvalid, yvalid = text[valid_part], ratings[valid_part]
Xtest, ytest = text[test_part], ratings[test_part]

In [None]:
# Word-level TF-IDF over unigrams and bigrams for the rating task; terms
# appearing in fewer than two training documents are dropped (min_df=2).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    # TF-IDF weights are real-valued. A non-float dtype such as 'int32' is
    # not honoured: scikit-learn emits a UserWarning and returns float64
    # anyway. Requesting float64 explicitly yields the same matrices
    # without the warning.
    dtype=np.float64,
    analyzer='word',
    strip_accents='unicode',
    decode_error='replace',
    min_df=2
)

# Vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the validation and test splits.
Xtrain_trans = vectorizer.fit_transform(Xtrain)
Xvalid_trans = vectorizer.transform(Xvalid)
Xtest_trans = vectorizer.transform(Xtest)

In [None]:
# Number of highest-scoring TF-IDF features to keep.
top = 1000
# NOTE(review): f_classif scores features by treating each distinct rating
# value as a class label. For a regression target a regression score
# function may be more appropriate — confirm this choice is intentional.
selector = SelectKBest(score_func=f_classif, k=top)
selector.fit(Xtrain_trans, ytrain)

In [None]:
# Apply the fitted selector to every split, casting each result to float32.
Xtrain, Xvalid, Xtest = [
    selector.transform(matrix).astype('float32')
    for matrix in (Xtrain_trans, Xvalid_trans, Xtest_trans)
]

# Persist the selected features together with the rating targets.
regression_splits = (Xtrain, ytrain, Xvalid, yvalid, Xtest, ytest)
torch.save(regression_splits, 'build/extracted+reg.pt')