# Monk Notebook

In [1]:
from __future__ import annotations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from model.network import NeuralNetwork
from model.trainer import Trainer
from model.losses import Loss
from utils import DataLoader
from utils import load_monk, plot_curves
from utils.model_selection_helpers import count_parameters
from model.activations import sigmoid
from model.losses import mse
from utils.grid_search import grid_search_monk
from IPython.display import clear_output

np.random.seed(8) #reproducibility

In [2]:
# All the configurations to test for the training
# Neural Network architectures
INPUT_NEURONS = 17
OUTPUT_NEURONS = 1

HIDDEN_LAYER_SIZES = [4, 8]
HIDDEN_LAYERS_COUNTS = [1, 2]
INTERNAL_ACTIVATIONS = ['tanh', 'leaky relu', 'relu']
OUTPUT_ACTIVATIONS_AND_LOSS = [('sigmoid', 'mse'), ('identity', 'binary cross entropy sigmoid')]
NEURAL_NETWORK_CONFIGURATIONS = []

for hidden_layers_count in HIDDEN_LAYERS_COUNTS:
    for hidden_layer_size in HIDDEN_LAYER_SIZES:
        for internal_activation in INTERNAL_ACTIVATIONS:
            for output_activation, loss_function in OUTPUT_ACTIVATIONS_AND_LOSS:
                architecture = [INPUT_NEURONS] + [hidden_layer_size] * hidden_layers_count + [OUTPUT_NEURONS]
                activations = [internal_activation] * hidden_layers_count + [output_activation]
                NEURAL_NETWORK_CONFIGURATIONS.append((architecture, activations, loss_function))

# Training parameters
ETA_CONFIGURATIONS = [0.25, 0.1]
LAMBDA_CONFIGURATIONS = [0, 1e-1, 1e-2, 1e-3]
ALPHA_CONFIGURATIONS = [0, 0.5, 0.9]
BATCH_SIZES = [32, -1]

# Cross-validation parameters
K_FOLDS= 5
EPOCHS = 500
EARLY_STOPPING_PATIENCE = 50

# All possible configurations are tuples (NEURAL_NETWORK_ARCHITECTURE, NEURAL_NETWORK_ACTIVATION, LOSS_F, ETA, LAMBDA, ALPHA, BATCH_SIZE)
CONFIGURATIONS = []

for NEURAL_NETWORK_ARCHITECTURE, NEURAL_NETWORK_ACTIVATION, LOSS_F in NEURAL_NETWORK_CONFIGURATIONS:
  for ETA in ETA_CONFIGURATIONS:
    for LAMBDA in LAMBDA_CONFIGURATIONS:
      for ALPHA in ALPHA_CONFIGURATIONS:
        for BATCH_SIZE in BATCH_SIZES:
            config = (NEURAL_NETWORK_ARCHITECTURE, NEURAL_NETWORK_ACTIVATION, LOSS_F, ETA, LAMBDA, ALPHA, BATCH_SIZE)
            CONFIGURATIONS.append(config)
LEN_CONFIGURATIONS = len(CONFIGURATIONS)

print(f"Total configurations: {LEN_CONFIGURATIONS}")

Total configurations: 1152


In [3]:
# CONFIG_DICTIONARY is the id of each configuration of CONFIGURATIONS
# CONFIG_DICTIONARY_INSTABILITY_VAL is the sumatory of the relative value of when val loss raises
# CONFIG_DICTIONARY_INSTABILITY_TRAIN is the sumatory of the relative value of when train loss raises
# CONFIG_DICTIONARY_EPOCHS stores the epochs needed for each configuration to train
# CONFIG_DICTIONARY_TRAIN_LOSS_DIFF stores the sumatory of when train loss < val loss, so the model is overfitting

def print_top_25(CONFIG_DICTIONARY, CONFIG_DICTIONARY_INSTABILITY_VAL, CONFIG_DICTIONARY_INSTABILITY_TRAIN, CONFIG_DICTIONARY_EPOCHS, CONFIG_DICTIONARY_TRAIN_LOSS_DIFF, min_acc = 100):
    # print the top 25 configurations, sorting criterias: 1 avg accuracy (no longer valued if > than min_acc), 2 n parameters (the less the better), 3 val instability coeff (the less the better)
    TOP_25_CONFIGS_INDEXES = sorted(CONFIG_DICTIONARY, key=lambda i: (-min(min_acc, CONFIG_DICTIONARY[i]), count_parameters(CONFIGURATIONS[i][0]), (CONFIG_DICTIONARY_INSTABILITY_VAL[i]) / K_FOLDS))[:25]
    print("Top 5 configurations:")
    for i in TOP_25_CONFIGS_INDEXES:
        ACCURACY = CONFIG_DICTIONARY[i] * 100 / K_FOLDS
        print(f'''Config index: {CONFIGURATIONS[i]}, Mean Accuracy: {ACCURACY}%,
            training instability coeff validation: {CONFIG_DICTIONARY_INSTABILITY_VAL[i] / K_FOLDS}, 
            training instability coeff train: {CONFIG_DICTIONARY_INSTABILITY_TRAIN[i] / K_FOLDS}, 
            training loss-val loss diff: {CONFIG_DICTIONARY_TRAIN_LOSS_DIFF[i] / K_FOLDS}, 
            Mean Epochs: {CONFIG_DICTIONARY_EPOCHS[i] // K_FOLDS}''')

## Monk 1

In [4]:
# Here we load the dataset and create k folds
PATH_TRAIN = 'data/monk/monks-1.train'
PATH_TEST = 'data/monk/monks-1.test'
X_train_full, y_train_full, X_test, y_test = load_monk(PATH_TRAIN, PATH_TEST)
monk_dataset_1 = DataLoader(X_train_full, y_train_full)
k_fold = monk_dataset_1.k_fold(k = K_FOLDS)

After one-hot encoding: X train full shape: (124, 17), X test shape: (432, 17), y train full shape: (124, 1), y test shape: (432, 1)


In [5]:
grid_s_outputs = grid_search_monk(LEN_CONFIGURATIONS, CONFIGURATIONS, k_fold, EPOCHS, EARLY_STOPPING_PATIENCE)
clear_output(wait=False)
CONFIG_DICTIONARY, CONFIG_DICTIONARY_EPOCHS, CONFIG_DICTIONARY_INSTABILITY_TRAIN, CONFIG_DICTIONARY_INSTABILITY_VAL, CONFIG_DICTIONARY_TRAIN_LOSS_DIFF = grid_s_outputs
print_top_25(CONFIG_DICTIONARY, CONFIG_DICTIONARY_INSTABILITY_VAL, CONFIG_DICTIONARY_INSTABILITY_TRAIN, CONFIG_DICTIONARY_EPOCHS, CONFIG_DICTIONARY_TRAIN_LOSS_DIFF)

Top 5 configurations:
Config index: ([17, 4, 1], ['relu', 'identity'], 'binary cross entropy sigmoid', 0.25, 0, 0.5, 32), Mean Accuracy: 100.0%,
            training instability coeff validation: 6.6846563167656425, 
            training instability coeff train: 30.11696759260483, 
            training loss-val loss diff: 17.468853644296047, 
            Mean Epochs: 404
Config index: ([17, 4, 1], ['relu', 'identity'], 'binary cross entropy sigmoid', 0.1, 0.001, 0.9, 32), Mean Accuracy: 100.0%,
            training instability coeff validation: 7.251121972242489, 
            training instability coeff train: 18.125499760799777, 
            training loss-val loss diff: 8.502494598628806, 
            Mean Epochs: 182
Config index: ([17, 4, 1], ['tanh', 'identity'], 'binary cross entropy sigmoid', 0.25, 0.001, 0.9, 32), Mean Accuracy: 100.0%,
            training instability coeff validation: 9.700461563209013, 
            training instability coeff train: 12.256099612245993, 
       

## Monk 2

In [6]:
# Here we load the dataset and create k folds
PATH_TRAIN = 'data/monk/monks-2.train'
PATH_TEST = 'data/monk/monks-2.test'
X_train_full, y_train_full, X_test, y_test = load_monk(PATH_TRAIN, PATH_TEST)
monk_dataset_2 = DataLoader(X_train_full, y_train_full)
k_fold = monk_dataset_2.k_fold(k = K_FOLDS)

After one-hot encoding: X train full shape: (169, 17), X test shape: (432, 17), y train full shape: (169, 1), y test shape: (432, 1)


In [7]:
grid_s_outputs = grid_search_monk(LEN_CONFIGURATIONS, CONFIGURATIONS, k_fold, EPOCHS = 500, EARLY_STOPPING_PATIENCE = EARLY_STOPPING_PATIENCE)
clear_output(wait=False)
CONFIG_DICTIONARY, CONFIG_DICTIONARY_EPOCHS, CONFIG_DICTIONARY_INSTABILITY_TRAIN, CONFIG_DICTIONARY_INSTABILITY_VAL, CONFIG_DICTIONARY_TRAIN_LOSS_DIFF = grid_s_outputs
print_top_25(CONFIG_DICTIONARY, CONFIG_DICTIONARY_INSTABILITY_VAL, CONFIG_DICTIONARY_INSTABILITY_TRAIN, CONFIG_DICTIONARY_EPOCHS, CONFIG_DICTIONARY_TRAIN_LOSS_DIFF)

Top 5 configurations:
Config index: ([17, 4, 1], ['leaky relu', 'identity'], 'binary cross entropy sigmoid', 0.25, 0, 0.9, -1), Mean Accuracy: 100.0%,
            training instability coeff validation: 0.21730799802185136, 
            training instability coeff train: 0.05647236271957169, 
            training loss-val loss diff: 5.611568195799458, 
            Mean Epochs: 500
Config index: ([17, 4, 1], ['tanh', 'sigmoid'], 'mse', 0.1, 0, 0.9, 32), Mean Accuracy: 100.0%,
            training instability coeff validation: 0.30486541264563805, 
            training instability coeff train: 11.582679251901965, 
            training loss-val loss diff: 2.942008558362141, 
            Mean Epochs: 500
Config index: ([17, 4, 1], ['tanh', 'identity'], 'binary cross entropy sigmoid', 0.25, 0, 0.5, 32), Mean Accuracy: 100.0%,
            training instability coeff validation: 0.6576087876998861, 
            training instability coeff train: 8.968732956666083, 
            training loss-val l

## Monk 3

In [8]:
# Here we load the dataset and create k folds
PATH_TRAIN = 'data/monk/monks-3.train'
PATH_TEST = 'data/monk/monks-3.test'
X_train_full, y_train_full, X_test, y_test = load_monk(PATH_TRAIN, PATH_TEST)
monk_dataset_3 = DataLoader(X_train_full, y_train_full)
k_fold = monk_dataset_3.k_fold(k = K_FOLDS)

After one-hot encoding: X train full shape: (122, 17), X test shape: (432, 17), y train full shape: (122, 1), y test shape: (432, 1)


In [9]:
grid_s_outputs = grid_search_monk(LEN_CONFIGURATIONS, CONFIGURATIONS, k_fold, EPOCHS = 500, EARLY_STOPPING_PATIENCE = EARLY_STOPPING_PATIENCE)
clear_output(wait=False)
CONFIG_DICTIONARY, CONFIG_DICTIONARY_EPOCHS, CONFIG_DICTIONARY_INSTABILITY_TRAIN, CONFIG_DICTIONARY_INSTABILITY_VAL, CONFIG_DICTIONARY_TRAIN_LOSS_DIFF = grid_s_outputs
print_top_25(CONFIG_DICTIONARY, CONFIG_DICTIONARY_INSTABILITY_VAL, CONFIG_DICTIONARY_INSTABILITY_TRAIN, CONFIG_DICTIONARY_EPOCHS, CONFIG_DICTIONARY_TRAIN_LOSS_DIFF, min_acc=93.5)

Top 5 configurations:
Config index: ([17, 8, 8, 1], ['relu', 'relu', 'identity'], 'binary cross entropy sigmoid', 0.1, 0.001, 0.5, -1), Mean Accuracy: 94.36666666666667%,
            training instability coeff validation: 0.18952201637035826, 
            training instability coeff train: 0.006681477735887299, 
            training loss-val loss diff: 12.55048642910361, 
            Mean Epochs: 222
Config index: ([17, 4, 1], ['relu', 'sigmoid'], 'mse', 0.25, 0.001, 0.5, 32), Mean Accuracy: 94.33333333333334%,
            training instability coeff validation: 3.08172329759867, 
            training instability coeff train: 13.04649501724607, 
            training loss-val loss diff: 2.8616062336733754, 
            Mean Epochs: 99
Config index: ([17, 4, 1], ['leaky relu', 'identity'], 'binary cross entropy sigmoid', 0.25, 0.001, 0, 32), Mean Accuracy: 94.33333333333334%,
            training instability coeff validation: 25.700225784586856, 
            training instability coeff trai

## Monk 3 no reg

In [12]:
#lets choose an overfitting config here
# print the top 25 configurations, sorting criterias: 1 avg accuracy (no longer valued if > than min_acc), 2 n parameters (the less the better), 3 val instability coeff (the less the better)
TOP_25_CONFIGS_INDEXES = sorted(CONFIG_DICTIONARY, key=lambda i: (-count_parameters(CONFIGURATIONS[i][0]), -CONFIG_DICTIONARY[i], (CONFIG_DICTIONARY_INSTABILITY_VAL[i]) / K_FOLDS))[:100]
print("Top 5 configurations:")
for i in TOP_25_CONFIGS_INDEXES:
    ACCURACY = CONFIG_DICTIONARY[i] * 100 / K_FOLDS
    print(f'''Config index: {CONFIGURATIONS[i]}, Mean Accuracy: {ACCURACY}%,
        training instability coeff validation: {CONFIG_DICTIONARY_INSTABILITY_VAL[i] / K_FOLDS}, 
        training instability coeff train: {CONFIG_DICTIONARY_INSTABILITY_TRAIN[i] / K_FOLDS}, 
        training loss-val loss diff: {CONFIG_DICTIONARY_TRAIN_LOSS_DIFF[i] / K_FOLDS}, 
        Mean Epochs: {CONFIG_DICTIONARY_EPOCHS[i] // K_FOLDS}''')

Top 5 configurations:
Config index: ([17, 8, 8, 1], ['relu', 'relu', 'identity'], 'binary cross entropy sigmoid', 0.1, 0.001, 0.5, -1), Mean Accuracy: 94.36666666666667%,
        training instability coeff validation: 0.18952201637035826, 
        training instability coeff train: 0.006681477735887299, 
        training loss-val loss diff: 12.55048642910361, 
        Mean Epochs: 222
Config index: ([17, 8, 8, 1], ['relu', 'relu', 'identity'], 'binary cross entropy sigmoid', 0.25, 0, 0.5, -1), Mean Accuracy: 94.3%,
        training instability coeff validation: 0.5101067216534323, 
        training instability coeff train: 0.0035634011571540245, 
        training loss-val loss diff: 14.215801300638915, 
        Mean Epochs: 178
Config index: ([17, 8, 8, 1], ['relu', 'relu', 'identity'], 'binary cross entropy sigmoid', 0.1, 0.01, 0.9, -1), Mean Accuracy: 93.53333333333333%,
        training instability coeff validation: 0.4881507742889616, 
        training instability coeff train: 0.029