In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score

import pandas as pd
import numpy as np
from pickle import load
from tqdm import tqdm
from itertools import product

from utils import one_vs_all, plot_roc, get_prob_and_pred

In [2]:
class RandomPredictor:
    """Baseline "classifier" that ignores the features entirely.

    ``predict`` draws each label uniformly from {0, 1}; ``predict_proba``
    assigns probability 1/3 to each of three columns for every row.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for a private random generator used by ``predict``. When None
        (the default) the global ``np.random`` state is used, exactly as
        before, so draws are only reproducible if the caller seeds NumPy.

    NOTE(review): in the recorded result tables this baseline scores
    identically to the all-ones baseline, which suggests the evaluation
    derives labels from ``predict_proba`` rather than ``predict`` -- confirm
    against ``get_prob_and_pred``.
    """

    def __init__(self, random_state=None):
        # The evaluation loop reports ``model.__class__.__name__``; renaming
        # the class makes this baseline stand out in the results table.
        self.__class__.__name__ = "----RANDOM----"
        self.random_state = random_state
        # Legacy RandomState keeps the same ``randint`` semantics as the
        # module-level ``np.random.randint`` used when no seed is given.
        self._rng = None if random_state is None else np.random.RandomState(random_state)

    def fit(self, X, y):
        """Nothing to learn; return ``self`` to mirror the estimator API."""
        return self

    def predict(self, X):
        """Return ``len(X)`` integer labels drawn uniformly from {0, 1}."""
        draw = np.random.randint if self._rng is None else self._rng.randint
        # The upper bound is exclusive, so only labels 0 and 1 are produced.
        return draw(0, 2, len(X))

    def predict_proba(self, X):
        """Return a ``(len(X), 3)`` float array filled with 1/3."""
        return np.full((len(X), 3), 1/3)

class AllZeros:
    """Constant baseline: every row is predicted as label 0."""

    def __init__(self):
        # The results table is keyed on ``model.__class__.__name__``, so the
        # class is renamed to a banner that is easy to spot.
        type(self).__name__ = "----ALL 0's----"

    def fit(self, X, y):
        """No training happens; ``self`` is returned for API compatibility."""
        return self

    def predict(self, X):
        """Return a float vector of zeros, one entry per row of ``X``."""
        n_rows = X.shape[0]
        return np.zeros(n_rows)

    def predict_proba(self, X):
        """Return an ``(n, 3)`` array whose rows are all ``[1, 0, 0]``."""
        one_row = [1, 0, 0]
        stacked = np.array([one_row] * len(X))
        # The reshape keeps the 3-column layout even when ``X`` is empty.
        return stacked.reshape(-1, 3)

class AllOnes:
    """Constant baseline: every row is predicted as label 1."""

    def __init__(self):
        # The results table is keyed on ``model.__class__.__name__``, so the
        # class is renamed to a banner that is easy to spot.
        type(self).__name__ = "----ALL 1's----"

    def fit(self, X, y):
        """No training happens; ``self`` is returned for API compatibility."""
        return self

    def predict(self, X):
        """Return a float vector of ones, one entry per row of ``X``."""
        n_rows = X.shape[0]
        return np.ones(n_rows)

    def predict_proba(self, X):
        """Return an ``(n, 3)`` array whose rows are all ``[0, 1, 0]``."""
        one_row = [0, 1, 0]
        stacked = np.array([one_row] * len(X))
        # The reshape keeps the 3-column layout even when ``X`` is empty.
        return stacked.reshape(-1, 3)

In [3]:
# Unpickle the training dataset object.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("data/train_ds.pkl", mode="rb") as f:
    ds = load(f)

# Pull out the feature matrix, the labels and the feature names.
X = ds.get_X()
y = ds.get_y()
names = ds.get_feature_names()

In [4]:
# Unpickle the held-out test dataset object. This rebinds ``ds`` (and, below,
# ``names``), replacing the values taken from the training set.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("data/test_ds.pkl", mode="rb") as f:
    ds = load(f)

# Pull out the feature matrix, the labels and the feature names.
X_test = ds.get_X()
y_test = ds.get_y()
names = ds.get_feature_names()

In [5]:
# Fixed seed so the stochastic estimators below give the same scores on every
# Restart & Run All.
SEED = 42
# RandomPredictor.predict draws from the global NumPy RNG by default, so that
# is seeded as well.
np.random.seed(SEED)

# Every candidate is fitted and scored once per task by the evaluation loop.
# The first three entries are trivial baselines that ignore the features.
models = [
    RandomPredictor(),
    AllZeros(),
    AllOnes(),
    DecisionTreeClassifier(random_state=SEED),
    GaussianNB(),
    # NOTE(review): the recorded run emitted an "ITERATIONS REACHED LIMIT"
    # warning pointing at the logistic-regression docs, i.e. 300 iterations
    # were not enough -- consider scaling the features or raising max_iter.
    LogisticRegression(max_iter=300),
    # Perceptron(max_iter=100),
    # probability=True is what enables predict_proba on SVC; its internal
    # calibration is randomised, hence the seed.
    # NOTE(review): the recorded ROC AUC for SVC is ~0.51 (chance level);
    # the max_iter=300 cap is a likely cause -- confirm.
    SVC(max_iter=300, probability=True, random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    KNeighborsClassifier(),
    SGDClassifier(loss="modified_huber", random_state=SEED),
    # XGBClassifier()
]

In [6]:
# Columns shared by the three result tables; each maps to a list that the
# evaluation loop extends with one value per fitted model.
_SCORE_COLUMNS = ("model_name", "f1_score", "precision", "recall", "accuracy", "roc_auc")

# One independent accumulator per task (the lists are never shared).
scores_multi_class = {column: [] for column in _SCORE_COLUMNS}
scores_not_functional = {column: [] for column in _SCORE_COLUMNS}
scores_needs_repair = {column: [] for column in _SCORE_COLUMNS}

# Boolean row selectors: the "not_functional" task drops every sample whose
# label is 1, the "needs_repair" task drops every sample whose label is 2.
mask_train_not_functional = (y != 1)
mask_test_not_functional = (y_test != 1)

mask_train_needs_repair = (y != 2)
mask_test_needs_repair = (y_test != 2)

# (train selector, test selector, score accumulator) for each task.
# Ellipsis as a selector keeps every row, i.e. the full label set.
l = [
    (Ellipsis, Ellipsis, scores_multi_class),
    (mask_train_not_functional, mask_test_not_functional, scores_not_functional),
    (mask_train_needs_repair, mask_test_needs_repair, scores_needs_repair),
]

In [7]:
# (column name, scorer) pairs evaluated for every fitted model.
# Every scorer takes (y_true, y_pred, y_prob) so the evaluation loop can call
# them uniformly. The label-based scorers pass both label vectors through
# one_vs_all (imported from utils) before scoring; roc_auc scores y_prob
# against the transformed true labels and ignores y_pred.
# zero_division=0 is passed to f1/precision/recall alike, so a model that
# never predicts the positive class scores 0 without an
# UndefinedMetricWarning.
metrics = [
    ("f1_score", lambda y_true, y_pred, y_prob: f1_score(one_vs_all(y_true), one_vs_all(y_pred), zero_division=0)),
    ("precision", lambda y_true, y_pred, y_prob: precision_score(one_vs_all(y_true), one_vs_all(y_pred), zero_division=0)),
    ("recall", lambda y_true, y_pred, y_prob: recall_score(one_vs_all(y_true), one_vs_all(y_pred), zero_division=0)),
    ("accuracy", lambda y_true, y_pred, y_prob: accuracy_score(one_vs_all(y_true), one_vs_all(y_pred))),
    ("roc_auc", lambda y_true, y_pred, y_prob: roc_auc_score(one_vs_all(y_true), y_prob))
]

In [8]:
# Fit every model on every task and record its test-set scores.
# product() pairs each model with each (train mask, test mask, scores) triple.
experiments = product(models, l)
n_runs = len(models) * len(l)

for model, (train_mask, test_mask, scores) in tqdm(experiments, total=n_runs):
    # Train only on the rows selected for this task.
    X_fit, y_fit = X[train_mask], y[train_mask]
    model = model.fit(X_fit, y_fit)

    # Score on the matching subset of the test set.
    y_prob, y_pred = get_prob_and_pred(model, X_test[test_mask])

    scores["model_name"].append(model.__class__.__name__)
    for metric_name, metric_fn in metrics:
        metric_value = metric_fn(y_test[test_mask], y_pred, y_prob)
        scores[metric_name].append(metric_value)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 30/30 [02:06<00:00,  4.21s/it]


In [9]:
# Results of the "not_functional" task, best ROC AUC first.
df = (
    pd.DataFrame(scores_not_functional)
    .sort_values(by="roc_auc", ascending=False)
)
df

Unnamed: 0,model_name,f1_score,precision,recall,accuracy,roc_auc
7,RandomForestClassifier,0.767889,0.79688,0.740934,0.813301,0.882236
8,KNeighborsClassifier,0.736232,0.776384,0.700029,0.790931,0.84919
3,DecisionTreeClassifier,0.733759,0.733546,0.733972,0.777993,0.771762
5,LogisticRegression,0.608239,0.731648,0.520453,0.720556,0.768484
9,SGDClassifier,0.617553,0.68931,0.559327,0.711245,0.758575
4,GaussianNB,0.541792,0.771582,0.417464,0.705683,0.739196
6,SVC,0.070825,0.397626,0.038874,0.574849,0.508017
0,----RANDOM----,0.588376,0.416808,1.0,0.416808,0.5
1,----ALL 0's----,0.0,0.0,0.0,0.583192,0.5
2,----ALL 1's----,0.588376,0.416808,1.0,0.416808,0.5


In [11]:
# Results of the "needs_repair" task, best ROC AUC first.
df = (
    pd.DataFrame(scores_needs_repair)
    .sort_values(by="roc_auc", ascending=False)
)
df

Unnamed: 0,model_name,f1_score,precision,recall,accuracy,roc_auc
7,RandomForestClassifier,0.47521,0.592075,0.396875,0.897309,0.851267
8,KNeighborsClassifier,0.444444,0.559242,0.36875,0.892001,0.798524
5,LogisticRegression,0.033183,0.478261,0.017188,0.882665,0.721013
3,DecisionTreeClassifier,0.448788,0.449139,0.448437,0.87095,0.712797
4,GaussianNB,0.227538,0.129364,0.94375,0.249314,0.67129
9,SGDClassifier,0.0,0.0,0.0,0.882848,0.670003
6,SVC,0.0,0.0,0.0,0.882848,0.528296
0,----RANDOM----,0.209733,0.117152,1.0,0.117152,0.5
1,----ALL 0's----,0.0,0.0,0.0,0.882848,0.5
2,----ALL 1's----,0.209733,0.117152,1.0,0.117152,0.5


In [13]:
# Results of the task that uses every sample, best ROC AUC first.
df = (
    pd.DataFrame(scores_multi_class)
    .sort_values(by="roc_auc", ascending=False)
)
df

Unnamed: 0,model_name,f1_score,precision,recall,accuracy,roc_auc
7,RandomForestClassifier,0.757848,0.771885,0.744311,0.781818,0.860095
8,KNeighborsClassifier,0.727296,0.757351,0.699535,0.759371,0.82746
3,DecisionTreeClassifier,0.728175,0.720757,0.735747,0.748036,0.750074
5,LogisticRegression,0.601363,0.716266,0.518229,0.684848,0.745619
9,SGDClassifier,0.566902,0.763147,0.450942,0.683951,0.742013
4,GaussianNB,0.628784,0.474247,0.932713,0.494837,0.626764
6,SVC,0.0,0.0,0.0,0.541302,0.510319
0,----RANDOM----,0.628914,0.458698,1.0,0.458698,0.5
1,----ALL 0's----,0.0,0.0,0.0,0.541302,0.5
2,----ALL 1's----,0.628914,0.458698,1.0,0.458698,0.5
