In [1]:
import gzip
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import string
import gensim
from nltk.corpus import stopwords

# nltk.download('stopwords')

In [2]:
fpath = "../data/beeradvocate.json.gz"

def readGz(path):
    for l in gzip.open(path, 'rt', encoding="utf-8"):
        yield eval(l)

data = []
for l in tqdm(readGz(fpath)):
    data.append(l)
    # if len(data) >= 50000:
    #     break
data = data[:-1] # drop last datapoint (empty review)

49999it [00:09, 5076.50it/s]


In [3]:
sp = set(list(zip(*string.punctuation)) + stopwords.words('english'))

def preprocess(d):
    tokens = gensim.utils.simple_preprocess(d)
    return [t for t in tokens if t not in sp]

In [4]:
np.random.seed(0)
np.random.shuffle(data)

data = data[:100000]
n = len(data)

dataTrain = data[:int(n*0.9)]
# dataVal = data[int(n*0.8):int(n*0.9)] 
dataTest = data[int(n*0.9):] 

Xtrain = [d['review/text'] for d in dataTrain]
ytrain = [d['beer/style'] for d in dataTrain]

Xtest = [d['review/text'] for d in dataTest]
ytest = [d['beer/style'] for d in dataTest]

In [5]:
## BOW
bow_model = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

bow_params = {
    'clf__C': [1,5,10]
}

In [6]:
best_bow_model = GridSearchCV(bow_model, bow_params, cv=2)
best_bow_model = best_bow_model.fit(Xtrain[:1000], ytrain[:1000])
bow_preds = best_bow_model.predict(Xtest)
print(np.mean(bow_preds == ytest)) # accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.3338


In [9]:
print(best_bow_model.cv_results_)
print(best_bow_model.best_params_)
print(best_bow_model.best_score_)

{'mean_fit_time': array([44.42471995, 44.73912601, 46.3986423 ]), 'std_fit_time': array([0.58002514, 1.57738324, 1.33572793]), 'mean_score_time': array([0.08928881, 0.08867545, 0.06897802]), 'std_score_time': array([0.01515614, 0.022926  , 0.01227012]), 'param_clf__C': masked_array(data=[1, 5, 10],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'clf__C': 1}, {'clf__C': 5}, {'clf__C': 10}], 'split0_test_score': array([0.38, 0.39, 0.39]), 'split1_test_score': array([0.29, 0.29, 0.29]), 'split2_test_score': array([0.34, 0.34, 0.34]), 'split3_test_score': array([0.41, 0.43, 0.44]), 'split4_test_score': array([0.34, 0.35, 0.35]), 'mean_test_score': array([0.352, 0.36 , 0.362]), 'std_test_score': array([0.04069398, 0.04732864, 0.05035871]), 'rank_test_score': array([3, 2, 1])}
{'clf__C': 10}
0.362


In [12]:
## train best BOW model on full training set
bow_model.fit(Xtrain, ytrain, fit_params=best_bow_model.best_params_)
preds = bow_model.predict(Xtest)
print(np.mean(preds == ytest)) # accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6346


In [None]:
## TF-IDF
tfidf_model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

tfidf_params = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 3)],
    'vect__preprocessor': [preprocess, None],
    'clf__C': np.arange(1,10,2)
}

In [10]:
best_tfidf_model = GridSearchCV(tfidf_model, tfidf_params, cv=2)
best_tfidf_model = best_tfidf_model.fit(Xtrain[:1000], ytrain[:1000])
tfidf_preds = best_tfidf_model.predict(Xtest)
print(np.mean(tfidf_preds == ytest)) # accuracy

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kyeling\Desktop\CSE258_RecommenderSystems\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kyeling\Desktop\CSE258_RecommenderSystems\.venv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\kyeling\Desktop\CSE258_RecommenderSystems\.venv\lib\site-packages\sklearn\pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\kyeling\Desktop\CSE258_Reco

0.3832


In [11]:
print(best_tfidf_model.cv_results_)
print(best_tfidf_model.best_params_)
print(best_tfidf_model.best_score_)

{'mean_fit_time': array([6.00290298e-03, 3.55507183e+00, 3.04102898e-04, 1.85762960e+01,
       0.00000000e+00, 3.16685889e+01, 0.00000000e+00, 3.49305964e+00,
       1.04415417e-03, 1.88123931e+01, 5.07473946e-04, 3.68510844e+01,
       0.00000000e+00, 2.78349161e+00, 0.00000000e+00, 1.74336296e+01,
       1.99759007e-03, 3.22966712e+01, 0.00000000e+00, 3.19625390e+00,
       0.00000000e+00, 1.86520813e+01, 9.99450684e-04, 4.51483248e+01,
       5.19990921e-04, 4.13289285e+00, 0.00000000e+00, 1.99646604e+01,
       5.26666641e-03, 3.90184669e+01]), 'std_fit_time': array([3.00335884e-03, 2.93977737e-01, 3.04102898e-04, 1.09966338e+00,
       0.00000000e+00, 4.27707601e+00, 0.00000000e+00, 1.45372391e-01,
       1.04415417e-03, 1.65083325e+00, 5.07473946e-04, 9.50883627e-02,
       0.00000000e+00, 2.83516645e-01, 0.00000000e+00, 5.15881419e-01,
       5.84125519e-06, 3.15440178e-01, 0.00000000e+00, 6.32005930e-02,
       0.00000000e+00, 4.86008167e-01, 9.53674316e-07, 2.47928202e+00,
  

In [14]:
## train best TF-IDF on full training set
tfidf_model.fit(Xtrain, ytrain, fit_params=best_tfidf_model.best_params_)
preds = bow_model.predict(Xtest)
print(np.mean(preds == ytest)) # accuracy