In [None]:
import gzip
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import string
import gensim
from nltk.corpus import stopwords

# nltk.download('stopwords')

In [None]:
fpath = "../data/beeradvocate.json.gz"

def readGz(path):
    """Lazily yield one parsed record per non-blank line of a gzip text file.

    Parameters
    ----------
    path : str
        Location of a gzip-compressed, UTF-8 encoded file holding one
        Python-literal record per line.

    Yields
    ------
    object
        The result of evaluating each line.
    """
    # The context manager closes the file handle once the generator is
    # exhausted or closed, instead of leaking it.
    with gzip.open(path, 'rt', encoding="utf-8") as f:
        for l in f:
            if not l.strip():
                continue  # a blank line would make eval() raise SyntaxError
            # NOTE(review): eval() executes whatever the file contains; only
            # use this on trusted data (consider ast.literal_eval otherwise).
            yield eval(l)

# Materialise every record from the archive; tqdm reports progress while reading.
# (For a quicker experiment, stop reading after e.g. the first 50000 records.)
data = list(tqdm(readGz(fpath)))
data = data[:-1] # drop last datapoint (empty review)

In [None]:
# Tokens to discard: every single punctuation character plus NLTK's English
# stop words. (zip(*string.punctuation) would produce one big tuple rather
# than the individual characters, so the characters are added directly.)
sp = set(string.punctuation) | set(stopwords.words('english'))

def preprocess(d):
    """Tokenize a document and remove stop words / punctuation tokens.

    Parameters
    ----------
    d : str
        Raw document text.

    Returns
    -------
    list of str
        The tokens produced by ``gensim.utils.simple_preprocess(d)`` that
        are not members of ``sp``.
    """
    return [t for t in gensim.utils.simple_preprocess(d) if t not in sp]

In [None]:
# Fixed seed so the in-place shuffle yields the same order on a fresh run.
np.random.seed(0)
np.random.shuffle(data)

# Work on a subset of the shuffled data.
data = data[:100000]
n = len(data)

# 90% train / 10% test (no separate validation slice is carved out).
n_train = int(n*0.9)
dataTrain, dataTest = data[:n_train], data[n_train:]

# Features are the review texts; labels are the beer styles.
Xtrain = [rec['review/text'] for rec in dataTrain]
ytrain = [rec['beer/style'] for rec in dataTrain]

Xtest = [rec['review/text'] for rec in dataTest]
ytest = [rec['beer/style'] for rec in dataTest]

In [None]:
# Number of leading training examples fed to the hyper-parameter grid
# searches (used as Xtrain[:n_gs] / ytrain[:n_gs]).
n_gs = 1000

### Bag of Words model

In [None]:
# Bag-of-words pipeline: raw token counts fed into a logistic regression.
bow_steps = [
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
]
bow_model = Pipeline(bow_steps)

# Candidate values of the classifier's C parameter for the grid search.
bow_params = {'clf__C': [1, 2, 3, 4, 5, 10, 15, 25, 50, 100]}

In [None]:
# 2-fold grid search over bow_params on the first n_gs training examples;
# fit() returns the fitted search object itself.
best_bow_model = GridSearchCV(bow_model, bow_params, cv=2).fit(
    Xtrain[:n_gs], ytrain[:n_gs]
)
bow_preds = best_bow_model.predict(Xtest)
print(np.mean(np.asarray(ytest) == bow_preds)) # accuracy

In [None]:
# Show the full CV table, then the winning parameters and their CV score.
for attr in ('cv_results_', 'best_params_', 'best_score_'):
    print(getattr(best_bow_model, attr))

In [None]:
## train best BOW model on full training set
bow_model = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])
# Take the hyper-parameters from the grid search instead of hard-coding a
# value, so this cell stays correct when the search result changes.
bow_model.set_params(**best_bow_model.best_params_)

bow_model.fit(Xtrain, ytrain)
preds = bow_model.predict(Xtest)
print(np.mean(preds == ytest)) # accuracy

### TF-IDF model

In [None]:
## TF-IDF
# Counts -> TF-IDF re-weighting -> logistic regression.
tfidf_model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

tfidf_params = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 3)],
    # `preprocess` returns a list of tokens, so it must be plugged in as the
    # vectorizer's tokenizer. The `preprocessor` hook is expected to return a
    # string and would make every such candidate fail during fitting.
    # None keeps CountVectorizer's default tokenization.
    'vect__tokenizer': [preprocess, None],
    'clf__C': np.arange(1,10,2)
}

In [None]:
# 2-fold grid search over tfidf_params on the first n_gs training examples;
# fit() returns the fitted search object itself.
best_tfidf_model = GridSearchCV(tfidf_model, tfidf_params, cv=2).fit(
    Xtrain[:n_gs], ytrain[:n_gs]
)
tfidf_preds = best_tfidf_model.predict(Xtest)
print(np.mean(np.asarray(ytest) == tfidf_preds)) # accuracy

In [None]:
# Show the full CV table, then the winning parameters and their CV score.
for attr in ('cv_results_', 'best_params_', 'best_score_'):
    print(getattr(best_tfidf_model, attr))

In [None]:
## train best TF-IDF on full training set
tfidf_model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])
# The grid-search winners are estimator hyper-parameters, not fit-time
# arguments, so they are applied with set_params() before fitting.
tfidf_model.set_params(**best_tfidf_model.best_params_)
tfidf_model.fit(Xtrain, ytrain)
# Evaluate the TF-IDF pipeline itself (not the bag-of-words model).
preds = tfidf_model.predict(Xtest)
print(np.mean(preds == ytest)) # accuracy