In [1]:
import numpy as np
import json_lines

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score

In [2]:
import nltk
# Fetch the NLTK resources for this notebook; the download is skipped when
# the package is already up to date locally.
# NOTE(review): 'punkt' is not referenced by the cells visible here —
# presumably used elsewhere; TODO confirm it is still needed.
nltk.download('punkt')
# 'stopwords' provides the English stop-word list passed to TfidfVectorizer.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/henorvell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/henorvell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read every JSON-lines record once, then split the records into the
# `trans_en` text field and the two label fields (parallel lists, one
# entry per record).
with open('data/reviews_112_trans-en.jl', 'rb') as jl_file:
    records = list(json_lines.reader(jl_file))

X = [record['trans_en'] for record in records]
y_voted_up = [record['voted_up'] for record in records]
y_early_access = [record['early_access'] for record in records]

In [4]:
# Convert both label lists to integer NumPy arrays in one pass.
y_voted_up, y_early_access = (
    np.array(labels).astype(int)
    for labels in (y_voted_up, y_early_access)
)

In [5]:
# The stop-word list does not depend on the n-gram size, so build it once
# instead of rebuilding it on every iteration.
english_stop_words = nltk.corpus.stopwords.words('english')

# Fit a TF-IDF vectorizer for n-gram ranges (1, 1), (1, 2) and (1, 3) and
# report the resulting matrix shape for each.
# NOTE: `vectorizer` and `X_vec` are overwritten on every pass, so after the
# loop they hold the (1, 3) n-gram model; cells that use `X_vec` afterwards
# operate on that matrix.
for ngram in range(1, 4):
    vectorizer = TfidfVectorizer(
        stop_words=english_stop_words,
        max_df=0.2,  # ignore terms that occur in more than 20% of documents
        ngram_range=(1, ngram))
    X_vec = vectorizer.fit_transform(X)
    print(ngram, "-gram vector shape: ", X_vec.shape)

1 -gram vector shape:  (5000, 17115)
2 -gram vector shape:  (5000, 143868)
3 -gram vector shape:  (5000, 294497)


In [6]:
def print_confusion_matrix(preds, y_true):
    """Print the confusion matrix of a binary (0/1) classification.

    The matrix is printed with the positive class first:

        [TP, FP]
        [FN, TN]

    Args:
        preds: Predicted labels, each 0 or 1.
        y_true: Ground-truth labels, each 0 or 1.
    """
    # Pin the label set so the matrix is always 2x2. Without `labels`, sklearn
    # infers the classes from the data, and when both inputs contain only a
    # single class the result has one cell, which breaks the 4-way unpacking.
    tn, fp, fn, tp = confusion_matrix(y_true, preds, labels=[0, 1]).ravel()

    print('Confusion matrix:')
    print('[{}, {}]'.format(tp, fp))
    print('[{}, {}]'.format(fn, tn))

In [7]:
import matplotlib.pyplot as plt
def plot_roc(pred, y):
    """Plot the ROC curve of `pred` against the true labels `y`.

    Draws the curve in red with a dashed black diagonal for reference, and
    puts the AUC in the title. If `roc_auc_score` raises `ValueError`, the
    title shows "undefined" instead of a number.

    Args:
        pred: Scores or predictions passed to `roc_curve` / `roc_auc_score`.
        y: Ground-truth labels.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y, pred)
    try:
        auc = roc_auc_score(y, pred)
    except ValueError:
        # Keep plotting even when the score cannot be computed.
        auc = "undefined"

    fig, ax = plt.subplots(1, figsize=(8,8))
    ax.plot(fpr, tpr, color='red')
    ax.plot([0,1], [0,1], color='black', linestyle='--')
    # Label the axes so the figure can be read on its own.
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title(f"AUC: {auc}")

In [8]:
from sklearn.dummy import DummyClassifier

In [9]:
# Baseline for the `voted_up` target: always predict the most frequent class.
# Note that the model is scored on the same data it was fitted on.
model = DummyClassifier(strategy='most_frequent').fit(X_vec, y_voted_up)
preds = model.predict(X_vec)
preds_proba = model.predict_proba(X_vec)

print_confusion_matrix(preds, y_voted_up)
baseline_accuracy = accuracy_score(y_voted_up, preds)
print('Accuracy: %.4f' % baseline_accuracy)

Confusion matrix:
[0, 0]
[2500, 2500]
Accuracy: 0.5000


Early access

In [10]:
# Baseline for the `early_access` target: always predict the most frequent class.
# Note that the model is scored on the same data it was fitted on.
model = DummyClassifier(strategy='most_frequent').fit(X_vec, y_early_access)
preds = model.predict(X_vec)
preds_proba = model.predict_proba(X_vec)

print_confusion_matrix(preds, y_early_access)
baseline_accuracy = accuracy_score(y_early_access, preds)
print('Accuracy: %.4f' % baseline_accuracy)

Confusion matrix:
[0, 0]
[551, 4449]
Accuracy: 0.8898
