In [1]:
import os, glob, pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import seaborn_image as isns
from PIL import Image

import IPython



# Notebook description

## Goal

Using the embeddings, we will train many models, score them using selected metrics, and record all the results.

Then we will build an ensemble with the best performing models.

## Methods

For metrics, we will consider the 

- F1-score (per-class and average)
- Accuracy (per-class)
- Balanced Accuracy


### Load embeddings and labels

In [2]:
with open("../models/embedding_train.pickle", "rb") as handle:
    train_feats = pickle.load(handle)

with open("../models/embedding_validation.pickle", "rb") as handle:
    validation_feats = pickle.load(handle)

with open("../models/labels.pickle", "rb") as handle:
    labels_dict = pickle.load(handle)

In [3]:
train_labels = labels_dict["train"]
validation_labels = labels_dict["validation"]
categories = labels_dict["categorical"]

In [4]:
print(train_feats.shape, train_labels.shape)

(14034, 2048) (14034,)


### Choose models and set GridSearch

- k-Nearest Neighbors
- Decision Trees
- Naive Bayes
- Random Forest
- Gradient Boosting

In [9]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB


from scipy.stats import randint, norm, uniform

##########
kNN = KNeighborsClassifier()

kNN_params = {
    "n_neighbors": [5, 10, 15, 20, 30]
}
##########
decision_tree = DecisionTreeClassifier()

decision_tree_params = {}
##########
random_forest = RandomForestClassifier()

random_forest_params = {
    "n_estimators": [100, 150, 200, 250],
}
##########
sgd_hinge = SGDClassifier()

sgd_hinge_params = {
    "loss": ["hinge"],
    "alpha": [1e-4, 1e-3, 1e-2]
}
##########
sgd_log = SGDClassifier()

sgd_log_params = {
    "loss": ["log_loss"],
    "alpha": [1e-4, 1e-3, 1e-2]
}
##########
sgd_huber = SGDClassifier()

sgd_huber_params = {
    "loss": ["modified_huber"],
    "alpha": [1e-4, 1e-3, 1e-2]
}
##########
gaussianNB = GaussianNB()

gaussianNB_params = {}
##########
adaboost = AdaBoostClassifier()

adaboost_params = {
    "n_estimators": [50, 100],
    "learning_rate": [0.1, 1., 10.]
}
##########


models = [kNN, decision_tree, random_forest,
          sgd_hinge, sgd_log, sgd_huber,
          gaussianNB, adaboost]

model_params = [kNN_params, decision_tree_params, random_forest_params,
                sgd_hinge_params, sgd_log_params, sgd_huber_params,
                gaussianNB_params, adaboost_params]

In [10]:
best_models = []
best_params = []
best_scores = []

for model, param in zip(models, model_params):

    cv = GridSearchCV(model, param, scoring="balanced_accuracy", verbose=2)
    search = cv.fit(train_feats, train_labels)

    best_models.append(search.best_estimator_)
    best_params.append(search.best_params_)
    best_scores.append(search.best_score_)   


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ......................................n_neighbors=5; total time=   3.0s
[CV] END ......................................n_neighbors=5; total time=   2.8s
[CV] END ......................................n_neighbors=5; total time=   2.5s
[CV] END ......................................n_neighbors=5; total time=   2.4s
[CV] END ......................................n_neighbors=5; total time=   2.2s
[CV] END .....................................n_neighbors=10; total time=   2.4s
[CV] END .....................................n_neighbors=10; total time=   2.4s
[CV] END .....................................n_neighbors=10; total time=   2.3s
[CV] END .....................................n_neighbors=10; total time=   2.3s
[CV] END .....................................n_neighbors=10; total time=   2.2s
[CV] END .....................................n_neighbors=15; total time=   2.3s
[CV] END .....................................n_n

In [11]:
for model, score in zip(best_models, best_scores):
    print(f"{model}, score: {score*100:1f}")

KNeighborsClassifier(n_neighbors=20), score: 85.894450
DecisionTreeClassifier(), score: 75.588079
RandomForestClassifier(n_estimators=250), score: 87.855905
SGDClassifier(alpha=0.01), score: 88.989602
SGDClassifier(alpha=0.01, loss='log_loss'), score: 88.847057
SGDClassifier(alpha=0.01, loss='modified_huber'), score: 87.707370
GaussianNB(), score: 75.014220
AdaBoostClassifier(learning_rate=0.1, n_estimators=100), score: 77.550139


In [12]:
# my_results = {"models": best_models, "scores": best_scores, "params": best_params}

# with open("../models/classifiers.pickle", mode="wb") as handle:
#     pickle.dump(my_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# with open("../models/classifiers.pickle", mode="rb") as handle:
#     results = pickle.load(handle)

EOFError: Ran out of input

In [15]:
from sklearn.metrics import f1_score, balanced_accuracy_score

balAcc = []
f1Score = []

for model in best_models:
    preds = model.predict(validation_feats)
    f1Score.append(f1_score(validation_labels, preds, average="micro"))
    balAcc.append(balanced_accuracy_score(validation_labels, preds))


In [21]:
print(balAcc)

[0.8649622670880048, 0.7463760786866117, 0.876061337397413, 0.9046821209329184, 0.907755037819561, 0.8855345257895495, 0.7393969206966595, 0.7831770779748322]


In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

xgboost = XGBClassifier(objective="multi:softmax", num_class=6, num_estimators=100, max_depth=2, use_label_encoder=False)

xgboost_scores = cross_val_score(xgboost, train_feats, train_labels, scoring="balanced_accuracy")

  from pandas import MultiIndex, Int64Index


Parameters: { "num_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_estimators" } migh

In [24]:
(xgboost_scores.max())*100

89.81938479861999

In [27]:
xgboost.fit(train_feats, train_labels)
preds = xgboost.predict(validation_feats)
val_scores = balanced_accuracy_score(validation_labels, preds)

Parameters: { "num_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


