In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import scipy
import sklearn
# Alternative matplotlib style, currently disabled:
# plt.style.use('fivethirtyeight')
# Seaborn appearance used by the plots in this notebook.
sns.set_style("whitegrid")
sns.set_context("notebook")
# Directory (relative path) from which the input CSV is read.
DATA_PATH = '../data/'

# Number of shuffle-split iterations (n_splits) used by every validation loop.
VAL_SPLITS = 4

In [41]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    classes : array-like
        Display names indexed by label value (``classes[k]`` names label
        ``k``); only the entries for labels present in the data are shown.
        Any sequence is accepted, not only a NumPy array.
    normalize : bool, default False
        If True, each row is divided by its total, so cells hold the
        fraction of that true class. Rows without any true sample stay 0.
    title : str, optional
        Axes title; a default that depends on ``normalize`` is used when
        the value is falsy.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    # Imported here so the function does not rely on names that the import
    # cells shown in this notebook never define.
    from sklearn.metrics import confusion_matrix
    from sklearn.utils.multiclass import unique_labels

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Labels that actually occur; passing them explicitly keeps the matrix
    # rows/columns and the tick names in the same order.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # np.asarray lets a plain list of names be fancy-indexed by label.
    classes = np.asarray(classes)[labels]
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label seen only in y_pred has an all-zero row; skip the division
        # there instead of producing NaN (which would also break `thresh`).
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show every tick and label it with the matching class name.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels and anchor them so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value on top of it; text turns white on dark cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [46]:
# Read the training table, then discard its two leading columns.
df = pd.read_csv(os.path.join(DATA_PATH,'df_train.csv'))
df = df.drop(columns=df.columns[:2])
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V24,V25,V26,V27,V28,Class,TimeScaled,TimeSin,TimeCos,AmountBC
0,-0.829392,1.118573,0.926038,1.163686,0.009824,0.527347,0.17337,0.723997,-0.638939,-0.162923,...,-0.298908,-0.060301,-0.217935,0.291312,0.120779,0,0.460069,-0.480989,0.876727,3.195062
1,-2.814527,1.613321,0.654307,0.581821,0.399491,0.73004,0.456233,-2.464347,0.654797,2.248682,...,-0.329526,-0.307374,-0.440007,-2.135657,0.011041,0,0.266395,-0.204567,-0.978853,3.125269
2,2.105028,-0.7004,-1.338043,-0.596395,-0.395217,-0.75505,-0.276951,-0.291562,-0.965418,1.107179,...,-0.278137,-0.040685,0.789267,-0.066054,-0.069956,0,0.762303,-0.153992,-0.988072,3.421235
3,2.205839,-1.023897,-1.270137,-0.950174,-0.868712,-0.975492,-0.475464,-0.280564,0.503713,0.448173,...,-0.041177,0.089158,1.105794,-0.066285,-0.079881,0,0.87974,-0.998227,0.059524,1.072145
4,2.02709,-0.778666,-1.552755,-0.558679,0.020939,-0.026071,-0.20781,-0.124288,-0.635953,0.817757,...,0.033477,-0.157992,-0.606327,-0.003931,-0.039868,0,0.821649,-0.783558,-0.621319,3.97149


In [79]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [49]:
# Repeated stratified hold-out validation (15% held out per split) of a
# logistic regression on four hand-picked columns, scored with F1.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
clf = LogisticRegression(solver='sag',random_state=0)

metrics = []
metrics_train = []

# The column subset does not change between splits, so build it once.
df_ = df[['Class','V12','AmountBC','V16','V9']]
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()
# enumerate() supplies the split counter `i` used in the progress message;
# without it the cell only ran if `i` had leaked from another cell.
for i,(idx_t, idx_v) in enumerate(val_split.split(X,y)):
    X_train = X[idx_t]
    y_train = y[idx_t]
    X_val = X[idx_v]
    y_val = y[idx_v]

    clf.fit(X_train,y_train)

    # F1 on the held-out rows.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val,y_pred)
    metrics.append(metric)

    # F1 on the training rows, for comparison with the validation score.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train,y_t_pred)
    metrics_train.append(metric_train)
    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))
# Mean and sample standard deviation (ddof=1) across the splits.
metric_mean = np.mean(metrics)
metric_std = np.std(metrics, ddof=1)
metric_t_mean = np.mean(metrics_train)
metric_t_std = np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

Metric value (Train): 0.61 ± 0.01
Metric value(Val): 0.63 ± 0.06


In [None]:
# Repeated stratified hold-out validation (15% held out per split) of a
# logistic regression on four hand-picked columns, scored with F1.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
clf = LogisticRegression(solver='sag',random_state=0)

metrics = []
metrics_train = []

# The column subset does not change between splits, so build it once.
df_ = df[['Class','V12','AmountBC','V16','V9']]
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()
# enumerate() defines the split counter `i` that the progress message needs;
# the plain loop left `i` undefined on a fresh kernel.
for i,(idx_t, idx_v) in enumerate(val_split.split(X,y)):
    X_train = X[idx_t]
    y_train = y[idx_t]
    X_val = X[idx_v]
    y_val = y[idx_v]

    clf.fit(X_train,y_train)

    # F1 on the held-out rows.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val,y_pred)
    metrics.append(metric)

    # F1 on the training rows, for comparison with the validation score.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train,y_t_pred)
    metrics_train.append(metric_train)
    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))
# Mean and sample standard deviation (ddof=1) across the splits.
metric_mean = np.mean(metrics)
metric_std = np.std(metrics, ddof=1)
metric_t_mean = np.mean(metrics_train)
metric_t_std = np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

In [51]:
# Random forest (100 trees) on every non-target column, evaluated with
# repeated stratified hold-out splits and the F1 score.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=0)

metrics, metrics_train = [], []

df_ = df
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, y_train = X[idx_t], y[idx_t]
    X_val, y_val = X[idx_v], y[idx_v]

    # fit() hands back the estimator, so the validation prediction is chained.
    y_pred = clf.fit(X_train, y_train).predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Same score on the rows the model was fitted on.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Aggregate per-split scores: mean and sample standard deviation.
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 1.00 ± 0.00
Metric value(Val): 0.83 ± 0.04


In [52]:
# List every feature column of `df` next to the current `clf` importance, in percent.
for name, value in zip(df.columns.drop('Class'), clf.feature_importances_ * 100):
    print(name, value)

V1 1.4445690344130526
V2 1.0407789352493997
V3 1.8334328539683125
V4 2.4508095633053926
V5 1.3080906158696157
V6 1.266587100952753
V7 2.172020109878126
V8 1.0004153158890368
V9 2.9169683492456517
V10 7.467995669410439
V11 7.067423274885819
V12 9.889335640881278
V13 1.1742516564243106
V14 11.261043868424617
V15 1.1506599918831715
V16 9.190379372644717
V17 16.83700394567183
V18 2.4739896718587397
V19 1.0512658982656427
V20 1.4378303692438361
V21 1.839861052533947
V22 1.1349676912907947
V23 0.7643447010475918
V24 0.9921365053866255
V25 0.9097270921540689
V26 2.3268793192105965
V27 1.509509331853052
V28 0.848657603382417
TimeScaled 1.1273378696609109
TimeSin 1.9413620726002765
TimeCos 1.1139122446028416
AmountBC 1.0564532779111413


In [55]:
# AdaBoost on every non-target column, evaluated with repeated stratified
# hold-out splits (15% held out) and the F1 score.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
# Seeded like the other estimators in this notebook so a re-run is repeatable.
clf = AdaBoostClassifier(random_state=0)

metrics = []
metrics_train = []
df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()
for i,(idx_t, idx_v) in enumerate(val_split.split(X,y)):
    X_train = X[idx_t]
    y_train = y[idx_t]
    X_val = X[idx_v]
    y_val = y[idx_v]

    clf.fit(X_train,y_train)

    # F1 on the held-out rows.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val,y_pred)
    metrics.append(metric)

    # F1 on the training rows, for comparison with the validation score.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train,y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))
# Mean and sample standard deviation (ddof=1) across the splits.
metric_mean = np.mean(metrics)
metric_std = np.std(metrics, ddof=1)
metric_t_mean = np.mean(metrics_train)
metric_t_std = np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.74 ± 0.00
Metric value(Val): 0.72 ± 0.04


In [58]:
# List every feature column of `df_` next to the current `clf` importance, in percent.
for name, val in zip(df_.columns.drop('Class'), clf.feature_importances_ * 100):
    print(name, val)

V1 2.0
V2 2.0
V3 4.0
V4 10.0
V5 2.0
V6 2.0
V7 2.0
V8 4.0
V9 0.0
V10 2.0
V11 4.0
V12 4.0
V13 2.0
V14 8.0
V15 2.0
V16 2.0
V17 6.0
V18 6.0
V19 6.0
V20 4.0
V21 0.0
V22 4.0
V23 0.0
V24 2.0
V25 0.0
V26 2.0
V27 2.0
V28 4.0
TimeScaled 2.0
TimeSin 4.0
TimeCos 0.0
AmountBC 6.0


In [72]:
# Class-weighted ('balanced') logistic regression on a seven-column subset;
# features are standardised with statistics taken from the training rows only.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = LogisticRegression(solver='lbfgs', random_state=0, class_weight='balanced')
scaler = StandardScaler()
metrics, metrics_train = [], []

df_ = df[['Class','V4','V14','V16','V17','V18','TimeSin','AmountBC']]
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    y_train, y_val = y[idx_t], y[idx_v]
    # Fit the scaler on the training rows, then reuse it for validation rows.
    X_train = scaler.fit_transform(X[idx_t])
    X_val = scaler.transform(X[idx_v])

    clf.fit(X_train, y_train)

    # Validation F1 for this split.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Training F1 for this split.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Summarise the splits: mean and sample standard deviation.
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.08 ± 0.01
Metric value(Val): 0.08 ± 0.00


In [73]:
# Unweighted logistic regression (lbfgs) on the seven-column subset, with
# per-split standardisation and F1 scoring.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = LogisticRegression(solver='lbfgs', random_state=0)
scaler = StandardScaler()

metrics, metrics_train = [], []

df_ = df[['Class','V4','V14','V16','V17','V18','TimeSin','AmountBC']]
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    y_train, y_val = y[idx_t], y[idx_v]

    # Scaling statistics come from the training rows of this split only.
    X_train = scaler.fit_transform(X[idx_t])
    X_val = scaler.transform(X[idx_v])

    # fit() returns the estimator, so predicting on validation is chained.
    y_pred = clf.fit(X_train, y_train).predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Mean and sample standard deviation over the splits.
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.67 ± 0.01
Metric value(Val): 0.66 ± 0.05


In [77]:
# 3-nearest-neighbours classifier on the seven-column subset; inputs are
# standardised per split before fitting.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = KNeighborsClassifier(n_neighbors=3)
scaler = StandardScaler()

metrics, metrics_train = [], []

df_ = df[['Class','V4','V14','V16','V17','V18','TimeSin','AmountBC']]
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, X_val = X[idx_t], X[idx_v]
    y_train, y_val = y[idx_t], y[idx_v]

    # Learn mean/variance from the training rows and apply to both parts.
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    clf.fit(X_train, y_train)

    # Held-out F1.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Training F1.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Aggregate: mean and sample standard deviation (ddof=1).
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.87 ± 0.01
Metric value(Val): 0.81 ± 0.07


In [78]:
# Random forest (100 trees) restricted to the seven-column subset, scored
# with F1 over repeated stratified hold-out splits.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

metrics, metrics_train = [], []

df_ = df[['Class','V4','V14','V16','V17','V18','TimeSin','AmountBC']]
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, y_train = X[idx_t], y[idx_t]
    X_val, y_val = X[idx_v], y[idx_v]

    clf.fit(X_train, y_train)

    # Score the held-out rows first, then the rows used for fitting.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Mean and sample standard deviation of the per-split scores.
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 1.00 ± 0.00
Metric value(Val): 0.80 ± 0.06


In [62]:
# AdaBoost on the seven-column subset, evaluated with repeated stratified
# hold-out splits (15% held out) and the F1 score.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS,test_size=0.15,random_state=0)
# Seeded like the other estimators in this notebook so a re-run is repeatable.
clf = AdaBoostClassifier(random_state=0)

metrics = []
metrics_train = []
df_ = df[['Class','V4','V14','V16','V17','V18','TimeSin','AmountBC']]
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()
for i,(idx_t, idx_v) in enumerate(val_split.split(X,y)):
    X_train = X[idx_t]
    y_train = y[idx_t]
    X_val = X[idx_v]
    y_val = y[idx_v]

    clf.fit(X_train,y_train)

    # F1 on the held-out rows.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val,y_pred)
    metrics.append(metric)

    # F1 on the training rows, for comparison with the validation score.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train,y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))
# Mean and sample standard deviation (ddof=1) across the splits.
metric_mean = np.mean(metrics)
metric_std = np.std(metrics, ddof=1)
metric_t_mean = np.mean(metrics_train)
metric_t_std = np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.74 ± 0.01
Metric value(Val): 0.73 ± 0.05


In [80]:
# Extra-trees ensemble (50 trees) on the seven-column subset, scored with F1
# over repeated stratified hold-out splits.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1, random_state=0)

metrics, metrics_train = [], []

df_ = df[['Class','V4','V14','V16','V17','V18','TimeSin','AmountBC']]
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, X_val = X[idx_t], X[idx_v]
    y_train, y_val = y[idx_t], y[idx_v]

    # fit() returns the estimator, so the validation prediction is chained.
    y_pred = clf.fit(X_train, y_train).predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Training-set score for the same fitted model.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Per-split scores reduced to mean and sample standard deviation.
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 1.00 ± 0.00
Metric value(Val): 0.81 ± 0.06


In [81]:
# Extra-trees ensemble (50 trees) on every non-target column, scored with F1
# over repeated stratified hold-out splits.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1, random_state=0)

metrics, metrics_train = [], []

df_ = df
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, y_train = X[idx_t], y[idx_t]
    X_val, y_val = X[idx_v], y[idx_v]

    clf.fit(X_train, y_train)

    # Held-out F1.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Training F1.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Mean and sample standard deviation across splits.
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 1.00 ± 0.00
Metric value(Val): 0.82 ± 0.04


In [84]:
# Extra-trees ensemble with 150 trees capped at depth 10, on every
# non-target column; scored with F1 over repeated stratified hold-out splits.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = ExtraTreesClassifier(n_estimators=150, n_jobs=-1, random_state=0, max_depth=10)

metrics, metrics_train = [], []

df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, X_val = X[idx_t], X[idx_v]
    y_train, y_val = y[idx_t], y[idx_v]

    # fit() returns the estimator, so the validation prediction is chained.
    y_pred = clf.fit(X_train, y_train).predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Score the fitted model on its own training rows as well.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Reduce the per-split scores to mean and sample standard deviation.
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.79 ± 0.01
Metric value(Val): 0.72 ± 0.06


In [85]:
# Random forest with 150 trees capped at depth 3, on every non-target
# column; scored with F1 over repeated stratified hold-out splits.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=0, max_depth=3)

metrics, metrics_train = [], []

df_ = df
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, y_train = X[idx_t], y[idx_t]
    X_val, y_val = X[idx_v], y[idx_v]

    clf.fit(X_train, y_train)

    # Validation score for this split.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Training score for this split.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Mean and sample standard deviation of the collected scores.
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.73 ± 0.02
Metric value(Val): 0.71 ± 0.06


In [86]:
from sklearn.neural_network import MLPClassifier

In [88]:
# Multi-layer perceptron with hidden layers (16, 8, 2) and alpha=1e-5 on
# every non-target column; inputs are standardised per split.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(16, 8, 2),
    random_state=0,
)
scaler = StandardScaler()

metrics, metrics_train = [], []

df_ = df
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    y_train, y_val = y[idx_t], y[idx_v]

    # Scaling statistics are taken from the training rows only.
    X_train = scaler.fit_transform(X[idx_t])
    X_val = scaler.transform(X[idx_v])

    clf.fit(X_train, y_train)

    # Held-out F1.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    # Training F1.
    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Mean and sample standard deviation across the splits.
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.96 ± 0.01
Metric value(Val): 0.78 ± 0.05


In [90]:
# Smaller multi-layer perceptron, hidden layers (8, 4, 2) with alpha=1e-3,
# on every non-target column; inputs are standardised per split.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-3,
    hidden_layer_sizes=(8, 4, 2),
    random_state=0,
)
scaler = StandardScaler()

metrics, metrics_train = [], []

df_ = df
X = df_.drop(columns='Class').to_numpy()
y = df_['Class'].to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, X_val = X[idx_t], X[idx_v]
    y_train, y_val = y[idx_t], y[idx_v]

    # Fit the scaler on training rows, then transform both parts with it.
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # fit() returns the estimator, so the validation prediction is chained.
    y_pred = clf.fit(X_train, y_train).predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Mean and sample standard deviation of the per-split scores.
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.74 ± 0.25
Metric value(Val): 0.75 ± 0.19


In [93]:
# Multi-layer perceptron, hidden layers (16, 8, 2) with alpha=1e-5, on every
# non-target column; evaluated with standardised inputs and the F1 score.
val_split = StratifiedShuffleSplit(n_splits=VAL_SPLITS, test_size=0.15, random_state=0)
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(16, 8, 2),
    random_state=0,
)
scaler = StandardScaler()

metrics, metrics_train = [], []

df_ = df
y = df_['Class'].to_numpy()
X = df_.drop(columns='Class').to_numpy()

for i, (idx_t, idx_v) in enumerate(val_split.split(X, y)):
    X_train, y_train = X[idx_t], y[idx_t]
    X_val, y_val = X[idx_v], y[idx_v]

    # The scaler is refitted on each split's training rows.
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    clf.fit(X_train, y_train)

    # Validation score, then training score, for this split.
    y_pred = clf.predict(X_val)
    metric = f1_score(y_val, y_pred)
    metrics.append(metric)

    y_t_pred = clf.predict(X_train)
    metric_train = f1_score(y_train, y_t_pred)
    metrics_train.append(metric_train)

    print('{}-fold / {} completed!'.format(i+1,VAL_SPLITS))

# Summarise: mean and sample standard deviation (ddof=1).
metric_mean, metric_std = np.mean(metrics), np.std(metrics, ddof=1)
metric_t_mean, metric_t_std = np.mean(metrics_train), np.std(metrics_train, ddof=1)
print('Metric value (Train): {:.2f} ± {:.2f}'.format(metric_t_mean,metric_t_std))
print('Metric value(Val): {:.2f} ± {:.2f}'.format(metric_mean,metric_std))

1-fold / 4 completed!
2-fold / 4 completed!
3-fold / 4 completed!
4-fold / 4 completed!
Metric value (Train): 0.97 ± 0.01
Metric value(Val): 0.79 ± 0.04
