In [None]:
import gzip
from tqdm import tqdm
import numpy as np
import string
from collections import defaultdict
from sklearn import linear_model
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus so that `stopwords.words('english')`,
# used later in this notebook, has data to load.
nltk.download('stopwords')

In [None]:
# Relative path to the gzip-compressed review file consumed by the loading loop below.
fpath = "../data/beeradvocate.json.gz"

def readGz(path):
    """Lazily yield one parsed record per line of a gzip-compressed text file.

    Args:
        path: Path to a gzip file, opened in text mode as UTF-8, holding one
            Python expression per line.

    Yields:
        The value each line evaluates to, in file order.
    """
    # The context manager closes the handle when the generator is exhausted
    # or finalised early (e.g. the consumer breaks out of its loop).
    with gzip.open(path, 'rt', encoding="utf-8") as fh:
        for line in fh:
            # SECURITY: eval() executes whatever expression the line contains,
            # so only point this at files you trust. If every record is a plain
            # literal, ast.literal_eval would be a safer replacement.
            yield eval(line)

# Read reviews into memory, stopping as soon as 100,000 records have been
# collected. readGz is a generator, so lines past that point are never parsed.
data = []
for l in tqdm(readGz(fpath)):
    data.append(l)
    if len(data) >= 100000:
        break

In [None]:
# Split the reviews 80% / 10% / 10% into train / validation / test,
# preserving the order in which they were read.
n = len(data)
trainEnd, valEnd = int(n*0.8), int(n*0.9)

dataTrain = data[:trainEnd]
dataVal = data[trainEnd:valEnd]
dataTest = data[valEnd:]

In [None]:
def _beer_styles(reviews):
    """Return the 'beer/style' field of every review, in order."""
    return [review['beer/style'] for review in reviews]

# Classification targets for the train and validation splits.
ytrain = _beer_styles(dataTrain)
yval = _beer_styles(dataVal)
# ytest = _beer_styles(dataTest)

In [None]:
# Tokens to strip from review text: punctuation characters plus English stopwords.
#
# - `string.punctuation` is a str, so `set(...)` gives one entry per punctuation
#   character. (`zip(*string.punctuation)` would instead give a single tuple
#   holding all of them, which never equals any individual character.)
# - The tokenising code in this notebook tests membership one *character* at a
#   time (`c in sp`), so single-letter stopwords (e.g. 'a', 'i') are excluded:
#   keeping them would delete those letters from the middle of every word.
#
# NOTE(review): because those filters work per character, the multi-letter
# stopwords kept here do not affect them; they are only useful for word-level
# checks (`w in sp` after `split()`), which the tokenisers do not perform yet.
sp = set(string.punctuation) | {w for w in stopwords.words('english') if len(w) > 1}

In [None]:
# create dictionary of size V unigrams
# word counts should be based on training data
def create_dict(V):
    """Return the V most frequent unigrams found in the training reviews.

    Each review in the module-level `dataTrain` is lower-cased, every character
    that is a member of the module-level `sp` set is dropped, and the result is
    split on whitespace.

    Args:
        V: Maximum vocabulary size.

    Returns:
        List of at most V words, most frequent first. Words with equal counts
        are ordered by descending string comparison.
    """
    freq = defaultdict(int)
    for review in tqdm(dataTrain):
        kept = [ch for ch in review['review/text'].lower() if ch not in sp]
        for token in ''.join(kept).split():
            freq[token] += 1

    # Rank by (count, word), largest first, which mirrors sorting
    # (count, word) tuples in reverse.
    ranked = sorted(freq.items(), key=lambda item: (item[1], item[0]), reverse=True)
    return [token for token, _ in ranked[:V]]

In [None]:
def get_bow_vecs(data, words):
    """Build bag-of-words count vectors for a collection of reviews.

    Args:
        data: Iterable of review dicts with a 'review/text' entry.
        words: Vocabulary; fixes the order of the output columns.

    Returns:
        One list per review, holding the count of each vocabulary word in that
        review's text (lower-cased, with characters found in the module-level
        `sp` set removed, split on whitespace).
    """
    def featurize(review):
        cleaned = ''.join(ch for ch in review['review/text'].lower() if ch not in sp)
        tally = defaultdict(int)
        for token in cleaned.split():
            tally[token] += 1
        return [tally[w] for w in words]

    return [featurize(review) for review in tqdm(data)]



In [None]:
def run_model(X_train, y_train, X_val, y_val, C=1):
    """Fit a logistic-regression classifier and report its validation accuracy.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        C: Passed through to `linear_model.LogisticRegression(C=...)`.

    Returns:
        Fraction of validation predictions equal to their label. The value is
        also printed.
    """
    classifier = linear_model.LogisticRegression(C=C)
    classifier.fit(X_train, y_train)

    # Pair each prediction with its label, then count exact matches.
    pairs = list(zip(classifier.predict(X_val), y_val))
    hits = sum(1 for predicted, actual in pairs if predicted == actual)
    acc = hits / len(pairs)

    print(f"accuracy: {acc}")
    return acc

In [None]:
# Bag-of-words baseline: build a 1000-word vocabulary from the training split,
# turn the train and validation reviews into count vectors over it, then fit
# and score the classifier (run_model prints and returns validation accuracy).
words = create_dict(V=1000)
Xtrain = get_bow_vecs(dataTrain, words)
Xval = get_bow_vecs(dataVal, words)
acc = run_model(Xtrain, ytrain, Xval, yval)

In [None]:
### TF-IDF ###

In [None]:
# tf is based on respective data (train, val, or test)
def get_tf(d):
    """Count how often each token occurs in a single review.

    Args:
        d: Review dict with a 'review/text' entry.

    Returns:
        defaultdict(int) mapping token -> occurrences in the review's text,
        after lower-casing, removing characters found in the module-level `sp`
        set, and splitting on whitespace.
    """
    lowered = d['review/text'].lower()
    tokens = ''.join(filter(lambda ch: ch not in sp, lowered)).split()

    counts = defaultdict(int)
    for token in tokens:
        counts[token] += 1
    return counts

# idf should be based on train data
def get_df():
    """Compute document frequencies over the training reviews.

    Each review in the module-level `dataTrain` is lower-cased, stripped of the
    characters found in the module-level `sp` set, and split on whitespace.

    Returns:
        defaultdict(int) mapping word -> number of training reviews in which
        the word appears at least once.
    """
    df = defaultdict(int)
    for d in tqdm(dataTrain):
        r = ''.join([c for c in d['review/text'].lower() if not c in sp])
        # set(): a review contributes at most 1 to a word's document frequency,
        # however many times the word occurs in it.
        for w in set(r.split()):
            df[w] += 1
    # Without this return the caller receives None and cannot index the result.
    return df

def get_tfidf_vecs(data, words, df, n_docs=None):
    """Build TF-IDF feature vectors for a collection of reviews.

    Term frequencies come from each review in `data` (via `get_tf`); inverse
    document frequencies are log10(n_docs / df[w]).

    Args:
        data: Iterable of review dicts to vectorise.
        words: Vocabulary; fixes the order of the output columns.
        df: Mapping word -> document frequency in the training set.
        n_docs: Size of the corpus `df` was computed on. Defaults to
            `len(dataTrain)`, so validation/test vectors use the same IDF
            weights as the training vectors instead of rescaling by the size
            of the split being vectorised.

    Returns:
        One list per review with the TF-IDF weight of each vocabulary word.
        Words with no recorded document frequency get weight 0.
    """
    if n_docs is None:
        n_docs = len(dataTrain)

    # IDF depends only on the vocabulary, so compute it once, not per review.
    # `.get` avoids inserting keys into a defaultdict and lets unseen words
    # fall back to 0 without dividing by zero.
    idf = [np.log10(n_docs / df[w]) if df.get(w) else 0.0 for w in words]

    X = []
    for d in data:
        tf = get_tf(d)
        X.append([tf.get(w, 0) * weight for w, weight in zip(words, idf)])
    return X

In [None]:
# TF-IDF experiment. The vocabulary `words` built in the bag-of-words cell is
# reused; uncomment the next line to rebuild it here.
# words = create_dict(V=1000)
# Document frequencies come from the training split only (see get_df).
df = get_df()
# NOTE: Xtrain, Xval and acc from the bag-of-words cell are overwritten here.
Xtrain = get_tfidf_vecs(dataTrain, words, df)
Xval = get_tfidf_vecs(dataVal, words, df)
acc = run_model(Xtrain, ytrain, Xval, yval)