# ML Cup Notebook

In [1]:
from __future__ import annotations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from model.network import NeuralNetwork
from model.trainer import Trainer
from model.losses import Loss, mee
from utils import DataLoader
from utils import plot_curves
from utils import StandardScaler
from utils.model_selection_helpers import instability_coeff, tran_val_diff, count_parameters
from utils.grid_search import grid_search_mlcup
import copy
from IPython.display import clear_output

# Seed NumPy's global RNG so that every run of the notebook is repeatable.
# The value is arbitrary (a lucky number), not one picked to flatter the results.
SEED = 8
np.random.seed(SEED)

In [2]:
# Load Data
# Lines starting with '#' in the CSV are treated as comments; the file has no header row.
PATH = 'data/ML CUP/ML-CUP25-TR.csv'
df = pd.read_csv(PATH, header=None, comment='#')

# Column 0 is skipped (presumably a row id - TODO confirm); the last 4 columns
# are the targets and everything in between is an input feature.
dataset = np.array(df)
X, y = dataset[:, 1:-4], dataset[:, -4:]
print(f"X.shape: {X.shape}, y.shape: {y.shape} ")

# Hold-out split: the second pair returned by train_val_split is kept aside as the internal test set.
train_test_dataset = DataLoader(X, y)
X_train, y_train, X_test, y_test = train_test_dataset.train_val_split(
    portion=0.8,
    shuffle=True,
)
print(f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}, X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")

# Only the training portion is wrapped again; the k-fold splits used later come from this loader.
train_val_dataset = DataLoader(X_train, y_train)


X.shape: (500, 12), y.shape: (500, 4) 
X_train.shape: (400, 12), y_train.shape: (400, 4), X_test.shape: (100, 12), y_test.shape: (100, 4)


In [3]:
# Search space explored by the grid search.

# --- Network topologies ---
INPUT_NEURONS = 12
OUTPUT_NEURONS = 4

HIDDEN_LAYER_SIZES = [16, 32]
HIDDEN_LAYERS_COUNTS = [1, 2, 3]
INTERNAL_ACTIVATIONS = ['tanh', 'leaky relu', 'relu']
OUTPUT_ACTIVATIONS_AND_LOSS = [('identity', 'mse')]

# One (architecture, activations, loss) triple per combination of
# depth x width x hidden activation x (output activation, loss).
NEURAL_NETWORK_CONFIGURATIONS = [
    (
        [INPUT_NEURONS] + [width] * depth + [OUTPUT_NEURONS],
        [hidden_activation] * depth + [out_activation],
        loss_name,
    )
    for depth in HIDDEN_LAYERS_COUNTS
    for width in HIDDEN_LAYER_SIZES
    for hidden_activation in INTERNAL_ACTIVATIONS
    for out_activation, loss_name in OUTPUT_ACTIVATIONS_AND_LOSS
]

# --- Optimiser hyper-parameters ---
ETA_CONFIGURATIONS = [0.01, 0.001]
# Kept small because the regularisation strength is independent of eta.
LAMBDA_CONFIGURATIONS = [0, 1e-4, 1e-6]
ALPHA_CONFIGURATIONS = [0, 0.5, 0.9]
# The batch size is not tuned, for time constraints.
BATCH_SIZES = [32]

# --- Cross-validation settings ---
K_FOLDS = 5
EPOCHS = 500
EARLY_STOPPING_PATIENCE = 100

# Every configuration is a tuple
# (NEURAL_NETWORK_ARCHITECTURE, NEURAL_NETWORK_ACTIVATION, LOSS_F, ETA, LAMBDA, ALPHA, BATCH_SIZE).
# The architecture/activation lists are shared between the tuples built from the same topology.
CONFIGURATIONS = [
    (nn_architecture, nn_activations, nn_loss, step_size, l2_strength, momentum, minibatch)
    for nn_architecture, nn_activations, nn_loss in NEURAL_NETWORK_CONFIGURATIONS
    for step_size in ETA_CONFIGURATIONS
    for l2_strength in LAMBDA_CONFIGURATIONS
    for momentum in ALPHA_CONFIGURATIONS
    for minibatch in BATCH_SIZES
]
LEN_CONFIGURATIONS = len(CONFIGURATIONS)

# Independent copy of the grid used to train networks with a single output neuron.
CONFIGURATIONS_OUT_1 = copy.deepcopy(CONFIGURATIONS)
for single_output_config in CONFIGURATIONS_OUT_1:
    # index 0 of the tuple is the architecture list; its last entry is the output layer width
    single_output_config[0][-1] = 1

print(f"Total configurations: {LEN_CONFIGURATIONS}")

Total configurations: 324


In [4]:
def print_top_25(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
    configurations=None,
    k_folds=None,
):
    """Print the 25 configurations with the lowest accumulated MEE.

    All six dictionaries are indexed by configuration index. The scalar ones
    are divided by the number of folds before printing, so they are presumably
    per-fold sums produced by the grid search (TODO confirm against
    ``grid_search_mlcup``).

    Args:
        CONFIG_DICTIONARY: index -> accumulated MEE; the ranking key. NaN
            entries (e.g. exploded gradients) are skipped.
        CONFIG_DICTIONARY_EPOCHS: index -> accumulated number of epochs.
        CONFIG_DICTIONARY_INSTABILITY_TRAIN: index -> accumulated training
            instability coefficient.
        CONFIG_DICTIONARY_INSTABILITY_VAL: index -> accumulated validation
            instability coefficient.
        CONFIG_DICTIONARY_TRAIN_LOSS_DIFF: index -> accumulated train/validation
            loss difference.
        CONFIG_DICTIONARY_TEST_LOSS: index -> collection of losses; its mean and
            standard deviation are printed as-is.
        configurations: sequence of configuration tuples used to label each row.
            Defaults to the notebook-level ``CONFIGURATIONS``. Pass the grid that
            was actually searched (e.g. ``CONFIGURATIONS_OUT_1`` for the
            single-output networks); otherwise the printed architectures show
            the wrong output layer.
        k_folds: number of folds used to average the accumulated values.
            Defaults to the notebook-level ``K_FOLDS``.

    Returns:
        None. The report is written to stdout.
    """
    # Resolve the defaults at call time so that later edits to the notebook globals are honoured.
    if configurations is None:
        configurations = CONFIGURATIONS
    if k_folds is None:
        k_folds = K_FOLDS

    # sometimes gradient explodes and mee is nan: those runs cannot be ranked
    valid_id = [i for i in CONFIG_DICTIONARY if not np.isnan(CONFIG_DICTIONARY[i])]
    # Ranking on the accumulated value is equivalent to ranking on the mean (same divisor for all).
    TOP_25_CONFIGS_INDEXES = sorted(valid_id, key=lambda i: CONFIG_DICTIONARY[i])[:25]
    print("Top 25 configurations:")
    for i in TOP_25_CONFIGS_INDEXES:
        MEE = CONFIG_DICTIONARY[i] / k_folds
        print(f'''Config index: {configurations[i]}, Avg Epochs: {CONFIG_DICTIONARY_EPOCHS[i] // k_folds}, Mean MEE: {MEE}%,
            training instability coeff validation: {CONFIG_DICTIONARY_INSTABILITY_VAL[i] / k_folds}, 
            training instability coeff train: {CONFIG_DICTIONARY_INSTABILITY_TRAIN[i] / k_folds}, 
            training loss-val loss diff: {CONFIG_DICTIONARY_TRAIN_LOSS_DIFF[i] / k_folds}, 
            Mean Epochs: {CONFIG_DICTIONARY_EPOCHS[i] // k_folds},
            Mean Test Loss (not rescaled): {np.mean(CONFIG_DICTIONARY_TEST_LOSS[i])},
            Std Test Loss (not rescaled): {np.std(CONFIG_DICTIONARY_TEST_LOSS[i])}
            ''')

## 1 Neural Network for regression (all 4 outputs)

In [5]:
# K-fold grid search over the full grid, with one network predicting all 4 targets at once.
k_fold = train_val_dataset.k_fold(k=K_FOLDS)
grid_s_outputs = grid_search_mlcup(
    LEN_CONFIGURATIONS,
    CONFIGURATIONS,
    k_fold,
    EPOCHS,
    EARLY_STOPPING_PATIENCE,
)
(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
) = grid_s_outputs

# Drop whatever the search printed while running, then show only the ranking.
clear_output(wait=False)
print_top_25(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
)

Top 25 configurations:
Config index: ([12, 16, 16, 16, 4], ['tanh', 'tanh', 'tanh', 'identity'], 'mse', 0.001, 1e-06, 0.9, 32), Avg Epochs: 197, Mean MEE: 21.086128127322805%,
            training instability coeff validation: 6.718068383147505, 
            training instability coeff train: 2.8847336568257016, 
            training loss-val loss diff: 21.176896658767543, 
            Mean Epochs: 197,
            Mean Test Loss (not rescaled): 0.4620858954590549,
            Std Test Loss (not rescaled): 0.0462047391932604
            
Config index: ([12, 16, 16, 16, 4], ['tanh', 'tanh', 'tanh', 'identity'], 'mse', 0.001, 0, 0.9, 32), Avg Epochs: 205, Mean MEE: 21.352102145430102%,
            training instability coeff validation: 7.037332852119496, 
            training instability coeff train: 3.149126223942718, 
            training loss-val loss diff: 24.474168238208758, 
            Mean Epochs: 205,
            Mean Test Loss (not rescaled): 0.4615695825677557,
            Std 

## 4 Neural Networks for regression (one per output)

### y 0 pred

In [6]:
# Grid search for a single-output network trained on target column 0 only.
y_0 = y_train[:, 0].reshape(-1, 1)  # keep the target 2-D: one column per sample row
train_val_dataset_y0 = DataLoader(X_train, y_0)
k_fold = train_val_dataset_y0.k_fold(k=K_FOLDS)
grid_s_outputs = grid_search_mlcup(
    LEN_CONFIGURATIONS,
    CONFIGURATIONS_OUT_1,
    k_fold,
    EPOCHS,
    EARLY_STOPPING_PATIENCE,
)
# NOTE: the CONFIG_DICTIONARY* names are reused by every search cell, so earlier results are overwritten.
(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
) = grid_s_outputs

clear_output(wait=False)
# NOTE(review): called like this, print_top_25 labels rows with CONFIGURATIONS, so the printed
# architectures end in 4 output neurons although CONFIGURATIONS_OUT_1 (1 output) was searched.
print_top_25(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
)

Top 25 configurations:
Config index: ([12, 32, 32, 32, 4], ['leaky relu', 'leaky relu', 'leaky relu', 'identity'], 'mse', 0.001, 0.0001, 0.5, 32), Avg Epochs: 212, Mean MEE: 6.434529320250364%,
            training instability coeff validation: 22.871468077768174, 
            training instability coeff train: 12.718210606646071, 
            training loss-val loss diff: 24.689440075237535, 
            Mean Epochs: 212,
            Mean Test Loss (not rescaled): 0.3625352423985309,
            Std Test Loss (not rescaled): 0.07036548671648388
            
Config index: ([12, 16, 16, 16, 4], ['relu', 'relu', 'relu', 'identity'], 'mse', 0.001, 0, 0.5, 32), Avg Epochs: 239, Mean MEE: 6.717121662200228%,
            training instability coeff validation: 20.67136142488769, 
            training instability coeff train: 12.633494600274535, 
            training loss-val loss diff: 37.4645956417088, 
            Mean Epochs: 239,
            Mean Test Loss (not rescaled): 0.4476795334180775

### y 1 pred

In [7]:
# Grid search for a single-output network trained on target column 1 only.
y_1 = y_train[:, 1].reshape(-1, 1)  # keep the target 2-D: one column per sample row
train_val_dataset_y1 = DataLoader(X_train, y_1)
k_fold = train_val_dataset_y1.k_fold(k=K_FOLDS)
grid_s_outputs = grid_search_mlcup(
    LEN_CONFIGURATIONS,
    CONFIGURATIONS_OUT_1,
    k_fold,
    EPOCHS,
    EARLY_STOPPING_PATIENCE,
)
# NOTE: the CONFIG_DICTIONARY* names are reused by every search cell, so earlier results are overwritten.
(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
) = grid_s_outputs

clear_output(wait=False)
# NOTE(review): called like this, print_top_25 labels rows with CONFIGURATIONS, so the printed
# architectures end in 4 output neurons although CONFIGURATIONS_OUT_1 (1 output) was searched.
print_top_25(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
)

Top 25 configurations:
Config index: ([12, 32, 32, 4], ['leaky relu', 'leaky relu', 'identity'], 'mse', 0.001, 0.0001, 0.5, 32), Avg Epochs: 219, Mean MEE: 8.621673292476268%,
            training instability coeff validation: 10.560687992944192, 
            training instability coeff train: 6.311110659627067, 
            training loss-val loss diff: 39.42954287484198, 
            Mean Epochs: 219,
            Mean Test Loss (not rescaled): 0.61161008125937,
            Std Test Loss (not rescaled): 0.0638485486973228
            
Config index: ([12, 32, 32, 32, 4], ['leaky relu', 'leaky relu', 'leaky relu', 'identity'], 'mse', 0.001, 0.0001, 0.5, 32), Avg Epochs: 208, Mean MEE: 8.758414418513059%,
            training instability coeff validation: 15.164325449359746, 
            training instability coeff train: 8.646168375077913, 
            training loss-val loss diff: 43.215754699151184, 
            Mean Epochs: 208,
            Mean Test Loss (not rescaled): 0.67577798708628

### y 2 pred

In [8]:
# Grid search for a single-output network trained on target column 2 only.
y_2 = y_train[:, 2].reshape(-1, 1)  # keep the target 2-D: one column per sample row
train_val_dataset_y2 = DataLoader(X_train, y_2)
k_fold = train_val_dataset_y2.k_fold(k=K_FOLDS)
grid_s_outputs = grid_search_mlcup(
    LEN_CONFIGURATIONS,
    CONFIGURATIONS_OUT_1,
    k_fold,
    EPOCHS,
    EARLY_STOPPING_PATIENCE,
)
# NOTE: the CONFIG_DICTIONARY* names are reused by every search cell, so earlier results are overwritten.
(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
) = grid_s_outputs

clear_output(wait=False)
# NOTE(review): called like this, print_top_25 labels rows with CONFIGURATIONS, so the printed
# architectures end in 4 output neurons although CONFIGURATIONS_OUT_1 (1 output) was searched.
print_top_25(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
)

Top 25 configurations:
Config index: ([12, 32, 32, 32, 4], ['relu', 'relu', 'relu', 'identity'], 'mse', 0.001, 1e-06, 0, 32), Avg Epochs: 115, Mean MEE: 11.022137589743087%,
            training instability coeff validation: 3.772454670724213, 
            training instability coeff train: 1.6783012481148898, 
            training loss-val loss diff: 7.637669452043786, 
            Mean Epochs: 115,
            Mean Test Loss (not rescaled): 0.35686997136876564,
            Std Test Loss (not rescaled): 0.03272337265108414
            
Config index: ([12, 16, 16, 16, 4], ['leaky relu', 'leaky relu', 'leaky relu', 'identity'], 'mse', 0.001, 1e-06, 0, 32), Avg Epochs: 105, Mean MEE: 11.040052322362566%,
            training instability coeff validation: 3.922135115007657, 
            training instability coeff train: 1.3364672059263487, 
            training loss-val loss diff: 5.496870464886455, 
            Mean Epochs: 105,
            Mean Test Loss (not rescaled): 0.352294414752904

### y 3 pred

In [9]:
# Grid search for a single-output network trained on target column 3 only.
y_3 = y_train[:, 3].reshape(-1, 1)  # keep the target 2-D: one column per sample row
train_val_dataset_y3 = DataLoader(X_train, y_3)
k_fold = train_val_dataset_y3.k_fold(k=K_FOLDS)
grid_s_outputs = grid_search_mlcup(
    LEN_CONFIGURATIONS,
    CONFIGURATIONS_OUT_1,
    k_fold,
    EPOCHS,
    EARLY_STOPPING_PATIENCE,
)
# NOTE: the CONFIG_DICTIONARY* names are reused by every search cell, so earlier results are overwritten.
(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
) = grid_s_outputs

clear_output(wait=False)
# NOTE(review): called like this, print_top_25 labels rows with CONFIGURATIONS, so the printed
# architectures end in 4 output neurons although CONFIGURATIONS_OUT_1 (1 output) was searched.
print_top_25(
    CONFIG_DICTIONARY,
    CONFIG_DICTIONARY_EPOCHS,
    CONFIG_DICTIONARY_INSTABILITY_TRAIN,
    CONFIG_DICTIONARY_INSTABILITY_VAL,
    CONFIG_DICTIONARY_TRAIN_LOSS_DIFF,
    CONFIG_DICTIONARY_TEST_LOSS,
)

Top 25 configurations:
Config index: ([12, 16, 16, 4], ['leaky relu', 'leaky relu', 'identity'], 'mse', 0.001, 0.0001, 0.5, 32), Avg Epochs: 103, Mean MEE: 10.897085615507248%,
            training instability coeff validation: 2.9299197014922265, 
            training instability coeff train: 0.7752807578057503, 
            training loss-val loss diff: 6.272541554142586, 
            Mean Epochs: 103,
            Mean Test Loss (not rescaled): 0.36555703122854216,
            Std Test Loss (not rescaled): 0.04404277889744408
            
Config index: ([12, 32, 32, 4], ['relu', 'relu', 'identity'], 'mse', 0.001, 0.0001, 0.9, 32), Avg Epochs: 111, Mean MEE: 10.912943742680628%,
            training instability coeff validation: 6.964066342245479, 
            training instability coeff train: 2.384784501962328, 
            training loss-val loss diff: 8.00948539593516, 
            Mean Epochs: 111,
            Mean Test Loss (not rescaled): 0.3747243531921858,
            Std Test L

## Final decision
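We can clearly see that the best 4-output NN performs worse than the 4 single-output NNs combined:
its mean MEE is ≈ 21.1, whereas the single-output NNs reach ≈ 18.9, i.e. $21.1 > 18.9$.
The value 18.9 is the square root of the sum of the squared per-output MEEs: $\sqrt{6.43^2 + 8.62^2 + 11.02^2 + 10.90^2} \approx 18.9$.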