In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))

import yaml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna

from src.data.prepare_data import prepare_data
from src.models.utils import train_splits, imbalanced_sampling, set_global_seed
from src.models.tuner import HyperParamSearch
from src.models.model import Classifier
from src.models.evaluation import Evaluation

In [2]:
# Parse the project configuration, which lives one directory above the notebook.
with open("../config.yml") as config_file:
    config = yaml.safe_load(config_file)
del config_file  # keep the closed file handle out of the notebook namespace

# Pass the configured seed to the project's seeding helper (intended to make
# runs reproducible).
set_global_seed(seed=config["general"]["seed"])

In [3]:
# Read the CSV named in the config (path is given relative to the parent
# directory) and pass it through the project's preparation step.
data_path = "../" + config["data_loader"]["path"]
df = prepare_data(df=pd.read_csv(data_path))

# Preview the first rows of the prepared frame.
display(df.head())

# Relative frequency of each class in the label column.
df["label"].value_counts(normalize=True)

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,label
0,-1.99658,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,-1.99658,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342474,0
2,-1.996558,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160684,0
3,-1.996558,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,-1.996537,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


label
0    0.998273
1    0.001727
Name: proportion, dtype: float64

**Train, Validation and Test Sets**
- Split the whole dataset into train, validation and test sets using stratified sampling
- Apply oversampling to the train set, as the number of positive instances is small

In [4]:
# Hold out the test set first. Features are selected by dropping the label
# column by name rather than by position, so the split keeps working if the
# column order of `df` ever changes. Stratifying on the label keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="label"),
    df["label"],
    test_size=config["train_test_split"]["test_size"],
    random_state=config["general"]["seed"],
    shuffle=True,
    stratify=df["label"],
)

# Carve the validation set out of the remaining training data. The same
# configured fraction is reused, so the validation share is relative to the
# post-test remainder, not to the whole dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=config["train_test_split"]["test_size"],
    random_state=config["general"]["seed"],
    shuffle=True,
    stratify=y_train,
)

In [5]:
# Resample the training split with the "over" method. The untouched split
# stays in X_train / y_train; the resampled copy carries the `_rs` suffix.
X_train_rs, y_train_rs = imbalanced_sampling(
    X_train=X_train,
    y_train=y_train,
    method="over",
)

# Class proportions before and after resampling, printed back to back.
class_shares = [
    labels.value_counts(normalize=True) for labels in (y_train, y_train_rs)
]
print(*class_shares)

label
0    0.998273
1    0.001727
Name: proportion, dtype: float64 label
0    0.5
1    0.5
Name: proportion, dtype: float64


**Optimization & Evaluation**
- Hyper-parameters - search for the hyper-parameters that optimize the scoring metric for the given algorithm on the validation set
- Evaluation - retrieve the best hyper-parameters, rebuild the full training set (train + validation) with oversampling applied, and evaluate the results on the test set

In [6]:
# Name of the algorithm to tune and evaluate. It is passed to HyperParamSearch
# and Classifier, and is also the key used to look up the fixed parameters
# under config["optimization"]["param_grid"].
ALGORITHM = "NeuralNetworkClassifier"

In [7]:
# Hyper-parameter search driver for the chosen algorithm, configured from the
# "optimization" section of the config.
tuner = HyperParamSearch(algorithm=ALGORITHM, config=config["optimization"])


def objective(trial) -> float:
    """Score a single Optuna trial.

    Hands the trial to ``tuner.fit`` together with the resampled training
    split and the validation split, and returns whatever ``tuner.fit``
    returns as the trial's value.
    """
    score = tuner.fit(
        trial=trial,
        X=X_train_rs,
        y=y_train_rs,
        X_val=X_val,
        y_val=y_val,
    )
    return score


# Seeded TPE sampler; the study maximizes the objective over 10 trials.
# NOTE(review): in the recorded run several trials scored 0.0017, which matches
# the positive-class share (0.001727) -- possibly degenerate models; confirm the
# scoring metric is appropriate for this class imbalance.
sampler = optuna.samplers.TPESampler(seed=config["general"]["seed"])
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=10)

# Report the winning trial (text layout kept exactly as before).
best = study.best_trial
print(
    "Best trial:\n",
    f"- params: {best.params}\n",
    f"- score: {best.value}",
    "\n" * 2,
)

[I 2025-06-13 16:44:51,823] A new study created in memory with name: no-name-f75bfc74-871a-480f-ac07-31b36fedf174


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[I 2025-06-13 16:46:24,086] Trial 0 finished with value: 0.99557 and parameters: {'n_layers': 11, 'units_0': 245, 'activation_0': 'sigmoid', 'units_1': 67, 'activation_1': 'sigmoid', 'units_2': 226, 'activation_2': 'relu', 'units_3': 36, 'activation_3': 'sigmoid', 'units_4': 79, 'activation_4': 'relu', 'units_5': 100, 'activation_5': 'sigmoid', 'units_6': 97, 'activation_6': 'sigmoid', 'units_7': 97, 'activation_7': 'relu', 'units_8': 208, 'activation_8': 'relu', 'units_9': 165, 'activation_9': 'relu', 'units_10': 70, 'activation_10': 'relu', 'batch_size': 125, 'learning_rate': 0.008103133746352966}. Best is trial 0 with value: 0.99557.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[I 2025-06-13 16:48:13,405] Trial 1 finished with value: 0.99756 and parameters: {'n_layers': 10, 'units_0': 53, 'activation_0': 'sigmoid', 'units_1': 59, 'activation_1': 'sigmoid', 'units_2': 236, 'activation_2': 'relu', 'units_3': 102, 'activation_3': 'relu', 'units_4': 73, 'activation_4': 'sigmoid', 'units_5': 243, 'activation_5': 'sigmoid', 'units_6': 239, 'activation_6': 'relu', 'units_7': 42, 'activation_7': 'relu', 'units_8': 93, 'activation_8': 'sigmoid', 'units_9': 95, 'activation_9': 'sigmoid', 'batch_size': 106, 'learning_rate': 0.0008380513724297313}. Best is trial 1 with value: 0.99756.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


[I 2025-06-13 16:51:20,425] Trial 2 finished with value: 0.0017 and parameters: {'n_layers': 16, 'units_0': 205, 'activation_0': 'sigmoid', 'units_1': 215, 'activation_1': 'relu', 'units_2': 205, 'activation_2': 'relu', 'units_3': 58, 'activation_3': 'sigmoid', 'units_4': 106, 'activation_4': 'relu', 'units_5': 105, 'activation_5': 'sigmoid', 'units_6': 231, 'activation_6': 'sigmoid', 'units_7': 192, 'activation_7': 'sigmoid', 'units_8': 205, 'activation_8': 'relu', 'units_9': 128, 'activation_9': 'relu', 'units_10': 39, 'activation_10': 'sigmoid', 'units_11': 146, 'activation_11': 'sigmoid', 'units_12': 124, 'activation_12': 'sigmoid', 'units_13': 49, 'activation_13': 'sigmoid', 'units_14': 241, 'activation_14': 'sigmoid', 'units_15': 228, 'activation_15': 'sigmoid', 'batch_size': 116, 'learning_rate': 0.005439488194964943}. Best is trial 1 with value: 0.99756.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


[I 2025-06-13 16:58:47,132] Trial 3 finished with value: 0.0017 and parameters: {'n_layers': 15, 'units_0': 233, 'activation_0': 'sigmoid', 'units_1': 83, 'activation_1': 'relu', 'units_2': 225, 'activation_2': 'relu', 'units_3': 125, 'activation_3': 'sigmoid', 'units_4': 107, 'activation_4': 'sigmoid', 'units_5': 148, 'activation_5': 'sigmoid', 'units_6': 250, 'activation_6': 'sigmoid', 'units_7': 143, 'activation_7': 'sigmoid', 'units_8': 40, 'activation_8': 'sigmoid', 'units_9': 43, 'activation_9': 'relu', 'units_10': 85, 'activation_10': 'relu', 'units_11': 253, 'activation_11': 'relu', 'units_12': 203, 'activation_12': 'relu', 'units_13': 114, 'activation_13': 'relu', 'units_14': 152, 'activation_14': 'relu', 'batch_size': 52, 'learning_rate': 0.0019465332529585572}. Best is trial 1 with value: 0.99756.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[I 2025-06-13 17:00:31,203] Trial 4 finished with value: 0.99867 and parameters: {'n_layers': 8, 'units_0': 164, 'activation_0': 'sigmoid', 'units_1': 147, 'activation_1': 'relu', 'units_2': 71, 'activation_2': 'sigmoid', 'units_3': 242, 'activation_3': 'relu', 'units_4': 57, 'activation_4': 'sigmoid', 'units_5': 90, 'activation_5': 'relu', 'units_6': 156, 'activation_6': 'sigmoid', 'units_7': 52, 'activation_7': 'relu', 'batch_size': 87, 'learning_rate': 0.003456394931382137}. Best is trial 4 with value: 0.99867.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[I 2025-06-13 17:03:29,936] Trial 5 finished with value: 0.0017 and parameters: {'n_layers': 11, 'units_0': 195, 'activation_0': 'sigmoid', 'units_1': 207, 'activation_1': 'sigmoid', 'units_2': 68, 'activation_2': 'sigmoid', 'units_3': 34, 'activation_3': 'relu', 'units_4': 33, 'activation_4': 'relu', 'units_5': 187, 'activation_5': 'sigmoid', 'units_6': 192, 'activation_6': 'relu', 'units_7': 199, 'activation_7': 'relu', 'units_8': 179, 'activation_8': 'sigmoid', 'units_9': 114, 'activation_9': 'sigmoid', 'units_10': 250, 'activation_10': 'relu', 'batch_size': 87, 'learning_rate': 0.007968631905062319}. Best is trial 4 with value: 0.99867.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[I 2025-06-13 17:07:08,391] Trial 6 finished with value: 0.0017 and parameters: {'n_layers': 12, 'units_0': 161, 'activation_0': 'sigmoid', 'units_1': 194, 'activation_1': 'sigmoid', 'units_2': 177, 'activation_2': 'relu', 'units_3': 246, 'activation_3': 'sigmoid', 'units_4': 35, 'activation_4': 'sigmoid', 'units_5': 249, 'activation_5': 'sigmoid', 'units_6': 98, 'activation_6': 'relu', 'units_7': 103, 'activation_7': 'relu', 'units_8': 242, 'activation_8': 'sigmoid', 'units_9': 53, 'activation_9': 'relu', 'units_10': 63, 'activation_10': 'relu', 'units_11': 198, 'activation_11': 'relu', 'batch_size': 56, 'learning_rate': 0.003006559258218484}. Best is trial 4 with value: 0.99867.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


[I 2025-06-13 17:13:02,175] Trial 7 finished with value: 0.0017 and parameters: {'n_layers': 15, 'units_0': 214, 'activation_0': 'relu', 'units_1': 147, 'activation_1': 'relu', 'units_2': 178, 'activation_2': 'relu', 'units_3': 232, 'activation_3': 'relu', 'units_4': 53, 'activation_4': 'sigmoid', 'units_5': 136, 'activation_5': 'sigmoid', 'units_6': 164, 'activation_6': 'relu', 'units_7': 217, 'activation_7': 'sigmoid', 'units_8': 149, 'activation_8': 'sigmoid', 'units_9': 172, 'activation_9': 'sigmoid', 'units_10': 151, 'activation_10': 'relu', 'units_11': 195, 'activation_11': 'sigmoid', 'units_12': 104, 'activation_12': 'sigmoid', 'units_13': 130, 'activation_13': 'sigmoid', 'units_14': 248, 'activation_14': 'sigmoid', 'batch_size': 62, 'learning_rate': 0.0018156137687013734}. Best is trial 4 with value: 0.99867.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


[I 2025-06-13 17:18:23,810] Trial 8 finished with value: 0.99941 and parameters: {'n_layers': 9, 'units_0': 88, 'activation_0': 'relu', 'units_1': 180, 'activation_1': 'relu', 'units_2': 198, 'activation_2': 'relu', 'units_3': 126, 'activation_3': 'relu', 'units_4': 202, 'activation_4': 'relu', 'units_5': 42, 'activation_5': 'relu', 'units_6': 190, 'activation_6': 'sigmoid', 'units_7': 142, 'activation_7': 'sigmoid', 'units_8': 129, 'activation_8': 'relu', 'batch_size': 87, 'learning_rate': 0.0005485096967432408}. Best is trial 8 with value: 0.99941.


[1m423/423[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


[I 2025-06-13 17:23:09,881] Trial 9 finished with value: 0.0017 and parameters: {'n_layers': 11, 'units_0': 172, 'activation_0': 'relu', 'units_1': 180, 'activation_1': 'sigmoid', 'units_2': 176, 'activation_2': 'relu', 'units_3': 243, 'activation_3': 'sigmoid', 'units_4': 176, 'activation_4': 'relu', 'units_5': 243, 'activation_5': 'relu', 'units_6': 235, 'activation_6': 'sigmoid', 'units_7': 54, 'activation_7': 'relu', 'units_8': 185, 'activation_8': 'relu', 'units_9': 222, 'activation_9': 'relu', 'units_10': 95, 'activation_10': 'relu', 'batch_size': 87, 'learning_rate': 0.008786972933917824}. Best is trial 8 with value: 0.99941.


Best trial:
 - params: {'n_layers': 9, 'units_0': 88, 'activation_0': 'relu', 'units_1': 180, 'activation_1': 'relu', 'units_2': 198, 'activation_2': 'relu', 'units_3': 126, 'activation_3': 'relu', 'units_4': 202, 'activation_4': 'relu', 'units_5': 42, 'activation_5': 'relu', 'units_6': 190, 'activation_6': 'sigmoid', 'units_7': 142, 'activation_7': 'sigmoid', 'units_8': 129, 'activation_8': 'relu', 'batch_size': 87, 'learning_rate': 0.0005485096967432408}
 - score: 0.99941 




In [8]:
# Recombine the train and validation splits for the final fit.
# New names are used on purpose: rebinding X_train / y_train here would make
# the cell non-idempotent (a second run would append X_val again) and would
# leak validation rows into the tuning cell if that cell were re-run later.
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])

# Resample the recombined training data the same way as during tuning.
X_train_full_rs, y_train_full_rs = imbalanced_sampling(
    method="over",
    X_train=X_train_full,
    y_train=y_train_full,
)

# Fixed parameters from the config, overridden by the best trial's values
# wherever both define the same key (right-hand side of `|` wins).
hyperparams = (
    config["optimization"]["param_grid"][ALGORITHM]["fixed"]
    | study.best_trial.params
)

clf = Classifier(algorithm=ALGORITHM, **hyperparams)
clf.fit(X=X_train_full_rs, y=y_train_full_rs)

# Evaluate on the resampled training data and on the untouched test set.
# The result is bound to `evaluation` so the builtin `eval` is not shadowed.
evaluation = Evaluation(clf=clf, threshold=0.5)
display(
    evaluation.fit(
        train=(X_train_full_rs, y_train_full_rs),
        test=(X_test, y_test),
    )
)

[1m16882/16882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 3ms/step
[1m446/446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


Unnamed: 0_level_0,accuracy,precision,recall,f1_score
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train,0.99976,0.99952,1.0,0.99976
test,0.9986,0.5641,0.88,0.6875
