In [None]:
import ast
import os
import string

import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
def preprocess(x):
    """Normalise a raw text string for vectorisation.

    Tabs and newlines are turned into single spaces, every character in
    ``string.punctuation`` is removed, and the result is lower-cased.
    """
    # One table handles both steps: whitespace -> space, punctuation -> deleted.
    table = str.maketrans('\t\n', '  ', string.punctuation)
    cleaned = x.translate(table)
    return cleaned.lower()

In [None]:
# Genres kept as classification targets; any other genre on a game is ignored.
validGenres = ['Indie', 'Action', 'Casual', 'Adventure', 'Strategy', 'Simulation', 'RPG', 'Sports', 'Massively Multiplayer', 'Racing']

# Maps a game's 'id' to the subset of validGenres listed for that game.
itemGenres = {}
with open('data/steam_games.json') as f:
    for line in f:
        # Each line is parsed as a Python literal (presumably a dict repr rather
        # than strict JSON, since the file was previously read with eval()).
        # literal_eval only accepts literals, so a malicious or corrupted line
        # cannot execute arbitrary code the way eval() would.
        game = ast.literal_eval(line)
        if 'genres' not in game or 'id' not in game:
            continue
        genres = [x for x in validGenres if x in game['genres']]
        # Games with none of the target genres carry no usable label.
        if len(genres) == 0:
            continue
        itemGenres[game['id']] = genres

In [None]:
# Parallel lists: text[i] is a preprocessed review, labels[i] the genre list
# of the game that review refers to.
text = []
labels = []
with open('data/steam_reviews.json') as f:
    for line in f:
        # Parse the line as a Python literal instead of eval(): the data file is
        # external input and eval() would execute any code embedded in it.
        review = ast.literal_eval(line)
        item = review['product_id']
        # Reviews of games without a known target genre cannot be labelled.
        if item not in itemGenres:
            continue
        text.append(preprocess(review['text']))
        labels.append(itemGenres[item])

In [None]:
numReviews = len(labels)

# 80/10/10 train/validation/test split in file order (no shuffling).
# The two cut points are computed once so all slices agree on the boundaries.
trainEnd = int(numReviews*0.8)
validEnd = int(numReviews*0.9)

Xtrain, ytrain = text[:trainEnd], labels[:trainEnd]
Xvalid, yvalid = text[trainEnd:validEnd], labels[trainEnd:validEnd]
Xtest, ytest = text[validEnd:], labels[validEnd:]

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs('build', exist_ok=True)
torch.save((Xtrain, ytrain, Xvalid, yvalid, Xtest, ytest), 'build/raw.pt')

In [None]:
# Unigram TF-IDF features over the preprocessed review text.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    # TF-IDF weights are fractional, so the output dtype must be a float type.
    # An integer dtype is rejected by scikit-learn with a warning and replaced
    # by float64; requesting float64 directly gives the same result silently.
    dtype=np.float64,
    analyzer='word',
    strip_accents='unicode',
    decode_error='replace',
    min_df=2  # drop terms that occur in fewer than two training reviews
)

# The vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the validation and test splits.
Xtrain_trans = vectorizer.fit_transform(Xtrain)
Xvalid_trans = vectorizer.transform(Xvalid)
Xtest_trans = vectorizer.transform(Xtest)

In [None]:
# Target number of features to keep after chi-squared selection.
NUM_FEATURES = 20000

# Score every feature against each genre, treating the genre as a separate
# binary target (review belongs to the genre or not).
scores = []
for g in validGenres:
    selector = SelectKBest(chi2, k='all')  # k='all': fitted only to read scores_
    y = [(g in x) for x in ytrain]
    selector.fit(Xtrain_trans, y)
    scores.append(selector.scores_)

# A feature's overall score is its best chi-squared score across all genres.
s = np.max(scores, axis=0)

# Threshold chosen so that roughly the NUM_FEATURES highest-scoring features
# lie above it. When the vocabulary is not larger than NUM_FEATURES the
# percentile would be <= 0 (np.percentile raises ValueError for negative
# values), so every feature is kept by using -inf as the threshold instead.
if s.shape[0] > NUM_FEATURES:
    thresh = np.percentile(s, (1 - NUM_FEATURES/s.shape[0])*100)
else:
    thresh = -np.inf

In [None]:
# Column mask of the features whose best chi-squared score exceeds the cut-off;
# built once and applied identically to all three splits.
keep = s > thresh
Xtrain, Xvalid, Xtest = (
    matrix[:, keep] for matrix in (Xtrain_trans, Xvalid_trans, Xtest_trans)
)

# Persist the reduced feature matrices together with their label lists.
torch.save(
    (Xtrain, ytrain, Xvalid, yvalid, Xtest, ytest),
    'build/extracted.pt',
)