To run the code, please setup the anaconda environment by running the following code.
```bash
conda env create -f env-gpu.yaml
conda activate ML
```

In [1]:
import numpy as np
import json_lines

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/henorvell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/henorvell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
X = []; y_voted_up=[]; y_early_access=[]
with open('data/reviews_112_trans-en.jl', 'rb') as reader:
    for line in json_lines.reader(reader):
        X.append(line['trans_en'])
        y_voted_up.append(line['voted_up'])
        y_early_access.append(line['early_access'])

In [4]:
y_voted_up = np.array(y_voted_up).astype(int)
y_early_access = np.array(y_early_access).astype(int)

In [5]:
vectorizer = TfidfVectorizer(
    stop_words = nltk.corpus.stopwords.words('english'),
    max_df=0.2, ngram_range=(1, 2))

In [6]:
X_vec = vectorizer.fit_transform(X)

In [7]:
train, test = train_test_split(np.arange(len(X)), test_size=0.1)

In [8]:
tuned_parameters = {
    "n_neighbors": [3, 4, 5]
}
model = KNeighborsClassifier(metric=cosine_distances)
clf = GridSearchCV(model, tuned_parameters, cv=10, refit=False, scoring='accuracy')
clf.fit(X_vec[train], y_voted_up[train])
print(clf.best_estimator_.get_params())

In [9]:
preds_proba = model.predict_proba(X_vec[test])
preds = preds_proba.argmax(axis=-1)
tn, fp, fn, tp = confusion_matrix(y_voted_up[test], preds).ravel()

print('Confusion matrix:')
print('[{}, {}]'.format(tp, fp))
print('[{}, {}]'.format(fn, tn))

Confusion matrix:
[181, 90]
[82, 147]


In [10]:
print('Accuracy: %.4f' % accuracy_score(y_voted_up[test], preds))

Accuracy: 0.6560


In [9]:
def plot_roc(pred, y):
    fpr, tpr, thresholds = roc_curve(y, pred)
    try:
        auc = roc_auc_score(y, pred)
    except ValueError:
        auc = "undefined"

    fig, ax = plt.subplots(1, figsize=(8,8))
    ax.plot(fpr, tpr, color='red')
    ax.plot([0,1], [0,1], color='black', linestyle='--')
    ax.set_title(f"AUC: {auc}")

In [None]:
plot_roc(preds_proba[:, 1], y_voted_up[test])

# Kernelised kNN

In [None]:
def gaussian_kernel_builder(gamma):
    def gaussian_kernel(distances):
        weights = np.exp(-1*gamma*(distances**2))
        return weights/np.sum(weights)
    return gaussian_kernel