In [1]:
import pyedflib
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 
%matplotlib notebook

### Dataset preprocessing

In [2]:
# Load the feature table from dataset.csv (path is relative to the working directory).
data = pd.read_csv("dataset.csv")

In [3]:
# Clean the loaded table in one chain: missing values become 0, the
# 'Unnamed: 0' column is removed, and the 'name' column becomes the index.
data = (
    data
    .fillna(0)
    .drop('Unnamed: 0', axis=1)
    .set_index('name')
)

In [4]:
# Feature matrix: every column except the label. Label vector: the 'target' column.
X = data.drop(columns='target')
y = data['target']

In [5]:
# Show the cleaned frame (indexed by 'name', features plus the 'target' column).
data

Unnamed: 0_level_0,FZ_f1,FZ_f2,FZ_f3,FZ_f4,FZ_f5,FP1-F3_f1,FP1-F3_f2,FP1-F3_f3,FP1-F3_f4,FP1-F3_f5,...,FP2-F4_f2,FP2-F4_f3,FP2-F4_f4,FP2-F4_f5,PZ-OZ_f1,PZ-OZ_f2,PZ-OZ_f3,PZ-OZ_f4,PZ-OZ_f5,target
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chb01/chb01_01.edf,0.0,0.0,0.0,0.0,0.0,0.282966,0.419375,2.110119,2.110119,0.537364,...,0.399762,2.130556,2.130556,0.538337,0.0,0.0,0.0,0.0,0.0,0
chb01/chb01_02.edf,0.0,0.0,0.0,0.0,0.0,0.321758,0.326767,2.094710,2.094710,0.541912,...,0.304273,2.127468,2.127468,0.542834,0.0,0.0,0.0,0.0,0.0,0
chb01/chb01_03.edf,0.0,0.0,0.0,0.0,0.0,0.256891,0.149394,1.942853,1.942853,0.549902,...,0.131128,1.984159,1.984159,0.551019,0.0,0.0,0.0,0.0,0.0,1
chb01/chb01_04.edf,0.0,0.0,0.0,0.0,0.0,0.303344,0.240111,2.010897,2.010897,0.544291,...,0.216625,2.024955,2.024955,0.544914,0.0,0.0,0.0,0.0,0.0,1
chb01/chb01_05.edf,0.0,0.0,0.0,0.0,0.0,0.278327,0.416439,2.018175,2.018175,0.537289,...,0.404853,2.047666,2.047666,0.537862,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chb24/chb24_18.edf,0.0,0.0,0.0,0.0,0.0,0.340151,0.361201,1.863158,1.863158,0.539235,...,0.380705,1.875336,1.875336,0.536165,0.0,0.0,0.0,0.0,0.0,0
chb24/chb24_19.edf,0.0,0.0,0.0,0.0,0.0,0.330748,0.401785,1.860644,1.860644,0.537920,...,0.421383,1.853779,1.853779,0.535353,0.0,0.0,0.0,0.0,0.0,0
chb24/chb24_20.edf,0.0,0.0,0.0,0.0,0.0,0.346129,0.426923,1.915160,1.915160,0.537012,...,0.435893,1.896212,1.896212,0.535176,0.0,0.0,0.0,0.0,0.0,0
chb24/chb24_21.edf,0.0,0.0,0.0,0.0,0.0,0.350716,0.403931,1.971527,1.971527,0.537573,...,0.404639,1.955397,1.955397,0.536030,0.0,0.0,0.0,0.0,0.0,1


In [6]:
# Step 1: hold out 20% of the rows as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: carve the validation set out of the remaining rows.
# 0.25 of the remaining 80% equals 20% of the full data, giving a 60/20/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
    stratify=y_train,
)

In [7]:
# Display labels for the candidate models; the order must match `classifiers`,
# because the two lists are paired with zip() later on.
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    "Naive Bayes",
]

In [8]:
# Candidate models, listed in the same order as the `names` labels.
# random_state is pinned on the estimators that draw random numbers while
# fitting (decision tree, random forest, AdaBoost) so that a fresh
# "Restart & Run All" reproduces the same scores. The seed matches the one
# used for the train/validation/test split.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(kernel="linear", C=0.025),
    GaussianProcessClassifier(kernel=1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
]


In [9]:
# Fit every candidate on the training split and record its score on three
# data sets. Each result list collects [score, classifier name] pairs.
clf_score_val = []
clf_score_test = []
clf_score_all = []

with warnings.catch_warnings():
    # Suppress every warning raised while fitting/scoring inside this block.
    warnings.simplefilter("ignore")
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)

        score_val = clf.score(X_val, y_val)      # validation split
        score_test = clf.score(X_test, y_test)   # test split
        score_all = clf.score(X, y)              # full dataset, training rows included

        clf_score_val.append([score_val, name])
        clf_score_test.append([score_test, name])
        clf_score_all.append([score_all, name])

In [10]:
# Validation-split scores: one [score, classifier name] pair per model.
clf_score_val

[[0.781021897810219, 'Nearest Neighbors'],
 [0.7956204379562044, 'Linear SVM'],
 [0.7956204379562044, 'Gaussian Process'],
 [0.7883211678832117, 'Decision Tree'],
 [0.7883211678832117, 'Random Forest'],
 [0.781021897810219, 'AdaBoost'],
 [0.8029197080291971, 'Naive Bayes']]

In [11]:
# Test-split scores: one [score, classifier name] pair per model.
clf_score_test

[[0.7898550724637681, 'Nearest Neighbors'],
 [0.7971014492753623, 'Linear SVM'],
 [0.7971014492753623, 'Gaussian Process'],
 [0.7536231884057971, 'Decision Tree'],
 [0.8043478260869565, 'Random Forest'],
 [0.7753623188405797, 'AdaBoost'],
 [0.8043478260869565, 'Naive Bayes']]

In [12]:
# Scores on the full dataset (X, y), which includes the rows the models were fitted on.
clf_score_all

[[0.8352769679300291, 'Nearest Neighbors'],
 [0.7959183673469388, 'Linear SVM'],
 [0.7944606413994169, 'Gaussian Process'],
 [0.8396501457725948, 'Decision Tree'],
 [0.7988338192419825, 'Random Forest'],
 [0.8892128279883382, 'AdaBoost'],
 [0.7988338192419825, 'Naive Bayes']]

In [13]:
# See which classifier finds the most true positives: for every model, count
# how many of the rows whose true target is 1 it also predicts as 1.
# The rows come from the whole dataset, so training rows are included.
# The positive-row feature frame is identical for every model, so it is built
# once here instead of being re-filtered on each loop iteration.
positive_features = data[data.target == 1].drop('target', axis=1)
for name, clf in zip(names, classifiers):
    # Summing the predictions counts the predicted 1s (assumes 0/1 labels).
    tp = clf.predict(positive_features).sum()
    print(f"{name} - {tp}")

Nearest Neighbors - 62
Linear SVM - 1
Gaussian Process - 0
Decision Tree - 50
Random Forest - 5
AdaBoost - 90
Naive Bayes - 3


**На данном этапе классификатор AdaBoost можно считать лучшим: его доля правильных ответов (accuracy) на test — 0.78, recall на всех данных (включая обучающую выборку) — 0.64.**