In [1]:
import pyedflib
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import KFold
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 
%matplotlib notebook

In [155]:
data = pd.read_csv("dataset_window_31metrics.csv")

In [156]:
data = data.drop('Unnamed: 0', axis=1)
data = data.set_index('name')

In [165]:
data = data.fillna(0)

In [166]:
data['target'].value_counts()

0    3314
1     141
Name: target, dtype: int64

In [167]:
X, y = data.drop('target', axis=1), data.target

In [179]:
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train, X_val, y_train, y_val \
    = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train) # 0.25 x 0.8 = 0.2

In [180]:
names = ["Nearest Neighbors", "Linear SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes"]

In [181]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB()]


In [182]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)

        score_test = clf.score(X_test, y_test)

        print(f"{name} is fitted")

Nearest Neighbors is fitted
Linear SVM is fitted
Gaussian Process is fitted
Decision Tree is fitted
Random Forest is fitted
AdaBoost is fitted
Naive Bayes is fitted


In [178]:
for name, clf in zip(names, classifiers):
    pred = clf.predict(X_test)
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_test, pred).ravel()
    print(f"{name} - tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}, recall = {sklearn.metrics.recall_score(y_test, pred)}")

Nearest Neighbors - tn: 657, fp: 6, fn: 21, tp: 7, recall = 0.25
Linear SVM - tn: 642, fp: 21, fn: 21, tp: 7, recall = 0.25
Gaussian Process - tn: 663, fp: 0, fn: 26, tp: 2, recall = 0.07142857142857142
Decision Tree - tn: 654, fp: 9, fn: 21, tp: 7, recall = 0.25
Random Forest - tn: 663, fp: 0, fn: 28, tp: 0, recall = 0.0
AdaBoost - tn: 657, fp: 6, fn: 22, tp: 6, recall = 0.21428571428571427
Naive Bayes - tn: 447, fp: 216, fn: 6, tp: 22, recall = 0.7857142857142857


In [194]:
from catboost import CatBoostClassifier, Pool

train_dataset = Pool(data=X_train,
                     label=y_train,
                     )

eval_dataset = Pool(data=X_val,
                    label=y_val,
                    )

clf = CatBoostClassifier(loss_function='CrossEntropy')

In [196]:
clf.fit(train_dataset,
          use_best_model=True,
          eval_set=eval_dataset)

0:	learn: 0.6459916	test: 0.6468713	best: 0.6468713 (0)	total: 58.5ms	remaining: 58.4s
1:	learn: 0.6056263	test: 0.6068727	best: 0.6068727 (1)	total: 101ms	remaining: 50.2s
2:	learn: 0.5669794	test: 0.5686547	best: 0.5686547 (2)	total: 141ms	remaining: 46.9s
3:	learn: 0.5303330	test: 0.5323411	best: 0.5323411 (3)	total: 179ms	remaining: 44.6s
4:	learn: 0.4976814	test: 0.4997763	best: 0.4997763 (4)	total: 216ms	remaining: 42.9s
5:	learn: 0.4690741	test: 0.4713824	best: 0.4713824 (5)	total: 255ms	remaining: 42.3s
6:	learn: 0.4414951	test: 0.4444354	best: 0.4444354 (6)	total: 294ms	remaining: 41.7s
7:	learn: 0.4151169	test: 0.4191366	best: 0.4191366 (7)	total: 334ms	remaining: 41.4s
8:	learn: 0.3922745	test: 0.3965389	best: 0.3965389 (8)	total: 372ms	remaining: 41s
9:	learn: 0.3723471	test: 0.3765025	best: 0.3765025 (9)	total: 409ms	remaining: 40.5s
10:	learn: 0.3533708	test: 0.3580518	best: 0.3580518 (10)	total: 444ms	remaining: 39.9s
11:	learn: 0.3362826	test: 0.3413570	best: 0.3413570 

<catboost.core.CatBoostClassifier at 0x7f0139c63d50>

In [201]:
y_pred = clf.predict(Pool(data=X_test))
tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_test, pred).ravel()
print(f"Catboost - tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}, recall = {sklearn.metrics.recall_score(y_test, pred)}")

Catboost - tn: 447, fp: 216, fn: 6, tp: 22, recall = 0.7857142857142857


In [202]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=10, stratify=y)

In [207]:
from sklearn.model_selection import KFold
n_fold = 5
cv = KFold(n_splits=n_fold, shuffle=True, random_state=10)
rec_, f1_ = [], []

params = {
            'iterations':1000,
            'learning_rate':0.1,
            'depth':6,
            'eval_metric':'CrossEntropy'
}

for fold, (train_index, val_index) in enumerate(cv.split(X)):
    X_train = X.iloc[train_index,:]
    X_val = X.iloc[val_index,:]

    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    clf = CatBoostClassifier(**params)

    train_dataset = Pool(data=X_train,
                     label=y_train,
                     )

    eval_dataset = Pool(data=X_val,
                    label=y_val,
                    )

    clf.fit(train_dataset,
              use_best_model=True,
              verbose = 0,
              eval_set=eval_dataset)

    y_pred = clf.predict(Pool(data=X_test))

    rec_.append(sklearn.metrics.recall_score(y_test, y_pred))
    f1_.append(sklearn.metrics.f1_score(y_test, y_pred))

    print(f"fold: {fold}, f1: {sklearn.metrics.f1_score(y_test, y_pred)}")
    print(f"fold: {fold}, recall: {sklearn.metrics.recall_score(y_test, y_pred)}")

print('CV mean recall:  {0:.4f}, std: {1:.4f}.'.format(np.mean(rec_), np.std(rec_)))
print('CV mean f1: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_), np.std(f1_)))

fold: 0, f1: 0.7499999999999999
fold: 0, recall: 0.6
fold: 1, f1: 0.8571428571428572
fold: 1, recall: 0.7714285714285715
fold: 2, f1: 0.7719298245614035
fold: 2, recall: 0.6285714285714286
fold: 3, f1: 0.888888888888889
fold: 3, recall: 0.8
fold: 4, f1: 0.8524590163934427
fold: 4, recall: 0.7428571428571429
CV mean recall:  0.7086, std: 0.0796.
CV mean f1: 0.8241, std: 0.0535.
