# ML Cup Dataset
In this notebook we perform model training, selection and assessment over the ML Cup Dataset.

In [None]:
from exclusiveAI.components.Validation.HoldOut import parallel_hold_out
from exclusiveAI.components.Validation.KFoldCrossValidation import validate
from exclusiveAI.ConfiguratorGen import ConfiguratorGen
from exclusiveAI.datasets.mlcup import read_cup_training_dataset, read_cup_test_dataset
from exclusiveAI.Composer import Composer
import pandas as pd
import numpy as np
from exclusiveAI.utils import plot_history
import os, json

## Read Dataset 
Load the pre-split training and internal test sets (data and labels) from a JSON file, together with the row indices of each split for convenience.

In [None]:
file_path = "MLCup/Data/training_data_split.json"
# Read the stored train/test split, which is serialised as a single JSON object.
with open(file_path, 'r') as jsonfile:
    data_dict = json.load(jsonfile)

# Turn every stored entry into a NumPy array, one variable per JSON key.
training_data, training_labels, test_data, test_labels, train_idx, test_idx = (
    np.array(data_dict[key])
    for key in (
        'training_data', 'training_labels',
        'test_data', 'test_labels',
        'train_idx', 'test_idx',
    )
)

## Grid Search
After a coarse-grained grid search, we run a fine-grained grid search over combinations of the best models' hyperparameters.

In [None]:
def read_json_files(my_dir_path):
    """Collect the records stored in every ``.json`` file of a directory.

    Each file must contain a JSON object with at least two values; the second
    value must itself be an object with a ``'0'`` key, and that ``'0'`` entry
    is turned into a DataFrame (one per file).

    Args:
        my_dir_path: Directory scanned (non-recursively) for ``.json`` files.

    Returns:
        pandas.DataFrame: The per-file frames stacked row-wise with a fresh
        integer index, or an empty DataFrame when no ``.json`` file is found.
    """
    frames = []
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # (and anything derived from it, such as ``unique()``) reproducible.
    for file_name in sorted(os.listdir(my_dir_path)):
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(my_dir_path, file_name), 'r') as f:
            payload = json.load(f)
        # Second top-level value, then its '0' entry.
        records = list(payload.values())[1]['0']
        frames.append(pd.DataFrame(records))

    if not frames:
        # pd.concat raises on an empty list; keep the "nothing found" result empty.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True, axis=0)
# Training settings reused by the hold-out search below and by the later
# K-fold validation, assessment and final-retraining cells.
batch_size = 200
epochs = 1000

# Cache file for the grid-search results: when it already exists the whole
# search below is skipped and the stored results are loaded instead.
final_path = 'MLCup_models_configs1520.json'

if not os.path.exists(final_path):
    dir_path = "MLCup/"
    
    # Results of earlier runs; only the 'initializers' column is used below.
    all_json_data = read_json_files(dir_path)
    # Hyperparameter grid of the fine-grained search. Most values are fixed by
    # hand; the commented-out lines derived them from the earlier results.
    # regularizations = all_json_data['regularization'].unique().tolist()
    regularizations = [1e-6, 1e-7, 1e-8]
    learning_rates = [0.001, 0.0005]
    momentums = [0.5]
    num_of_layers = [2]
    # num_of_units = set([unit1 for unit in all_json_data['num_of_units'] for unit1 in unit])
    num_of_units = [15, 20]
    # Every distinct initializer that appears in the earlier results.
    initializers = all_json_data['initializers'].unique().tolist()
    activations = ['tanh', 'sigmoid']

    # Non-random generation (random=False) with MSE loss, SGD, a linear output
    # activation and outputs=3.
    # NOTE(review): presumably this enumerates the full grid and sizes the
    # network from `training_data.shape` — confirm against ConfiguratorGen.
    myConfigurator = ConfiguratorGen(random=False, learning_rates=learning_rates, regularizations=regularizations,
                                     loss_function=['mse'], optimizer=['sgd'],
                                     activation_functions=activations,
                                     number_of_units=num_of_units, number_of_layers=num_of_layers,
                                     momentums=momentums, initializers=initializers,
                                     input_shapes=training_data.shape,
                                     verbose=False, nesterov=False, outputs=3,
                                     callbacks=["earlystopping"], output_activation='linear', show_line=False,
                                     ).get_configs()
    # Size of the object returned by get_configs().
    print(len(myConfigurator))
    
    configs=[]
    # NOTE(review): presumably guards the parallel hold-out run so that worker
    # processes re-importing this code do not start the search again — confirm.
    if __name__ == '__main__':
        # Hold-out evaluation of the generated configurations in regression mode.
        configs.append(
            parallel_hold_out(myConfigurator, training=training_data, training_target=training_labels, epochs=epochs, return_models_history=True,
                              batch_size=batch_size, num_models=100, number_of_initializations=2, regression=True,
                              ))

        # Wrap the single result in a DataFrame so it can be serialised.
        configs = pd.DataFrame(configs)
        # Save as json
        configs.to_json(final_path)
# Always reload from disk, so a fresh search and a cached one yield the same
# structure: one list item per top-level JSON key, each being that key's '0'
# entry. `configs[1]` is what the next cell iterates over as configurations.
with open(final_path, 'r') as f:
    configs = [data['0'] for data in json.load(f).values()]

## KFold Cross Validation 
We perform model selection using the K-fold cross validation technique.

In [None]:
my_configs = []
# Replace the callbacks of every grid-search configuration before validating.
for config in configs[1]:
    config['callbacks'] = ['earlystopping_1e-2_20']
if __name__ == '__main__':
    my_configs.append(
        validate(configs[1], x=training_data, y_true=training_labels, epochs=epochs, return_models_history=True,
                 batch_size=batch_size, max_configs=100, number_of_initializations=2, n_splits=4,
                 # The hold-out search, the assessment model and the final
                 # validation all run in regression mode; this call was the
                 # only one that did not request it.
                 regression=True,
                 ))

# NOTE: `configs` is rebound from the grid-search results to the validation
# result, so this cell is not idempotent; later cells rely on the new value.
configs=my_configs[0]
# First entry of the first result element: the history plotted and reported later.
old_history = configs[0][0]
# First entry of the second result element: the configuration assessed next.
my_config=configs[1][0]

## Model Assessment 
We perform model assessment over the internal test set.

In [None]:
# Build the selected configuration in regression mode and fit it on the training split.
model = Composer(config=my_config).compose(regression=True)
model.train(
    inputs=training_data,
    input_label=training_labels,
    epochs=epochs,
    batch_size=batch_size,
    name=my_config['model_name'],
    disable_line=False,
)

# Evaluate on the internal test split.
# NOTE(review): presumably the values come back in the order of `metrics` — confirm.
test_val = model.evaluate(input=test_data, input_label=test_labels, metrics=['mse', 'mee'])

# (column name, value) pairs in output-column order.
# NOTE(review): some column names do not match their content and are kept only
# for compatibility: 'Old_Best_Epochs' holds this model's `best_epoch`, the
# 'Old_History*_Last' columns hold minima rather than last values, and 'Model'
# holds a freshly composed network, not the trained `model` — confirm intent.
summary_fields = [
    ('Score', model.get_last()['mse']),
    ('History_Std', np.std(np.array(model.history['mee']))),
    ('Mee', model.get_last()['mee']),
    ('Test_Score', test_val[0]),
    ('Test_Mee', test_val[1]),
    ('Trained_Epochs', model.curr_epoch),
    ('Old_Best_Epochs', model.best_epoch),
    ('Old_History_Last', np.min(old_history['mee'])),
    ('Old_History_val_Last', np.min(old_history['val_mee'])),
    ('Old_History', old_history['mee']),
    ('Old_History_val', old_history['val_mee']),
    ('History', model.history['mee']),
    ('Model', Composer(config=my_config).compose()),
    ('Config', my_config),
    ('Num_Layers', my_config['num_layers']),
    ('Num_of_Units', my_config['num_of_units']),
    ('Name', my_config['model_name']),
]
results = [tuple(value for _, value in summary_fields)]
# One row, one column per reported quantity.
df = pd.DataFrame(results, columns=[label for label, _ in summary_fields])

## Filtering and Ordering Results
Sort the results DataFrame by `Num_Layers`, `Score`, `Test_Score` and `History_Std`, and collect each model's training history by name.

In [None]:
# Order the results by number of layers, then training score, test score and
# the spread of the training history.
sort_keys = ['Num_Layers', 'Score', 'Test_Score', 'History_Std']
df_sorted = df.sort_values(by=sort_keys)

# Map each model name to the history stored in its 'History' column.
histories = dict(zip(df_sorted['Name'], df_sorted['History']))
df_sorted

In [None]:
# Histories of the assessed model(s), keyed by model name.
plot_history(histories)
# History taken from the first entry of the K-fold validation result (`configs[0][0]`).
plot_history(old_history)

In [None]:
def find_least_difference_row(my_df):
    """Return the row whose 'History' has the smallest mean-normalised step.

    For every row the consecutive differences of ``row['History']`` are
    computed, shifted and scaled by the history mean as
    ``(diff - mean) / mean``, and reduced to their minimum. The row with the
    lowest such minimum is returned; on ties the first row wins.

    Note that, despite the name, this favours the row containing the most
    negative normalised step rather than the flattest curve.

    Args:
        my_df: DataFrame with a 'History' column holding numeric sequences.

    Returns:
        pandas.Series | None: The selected row, or ``None`` when the frame is
        empty or no history has at least two points.
    """
    min_diff = float('inf')
    selected_row = None

    for _, row in my_df.iterrows():
        history = np.array(row['History'])
        if history.size < 2:
            # No consecutive difference exists: np.diff would be empty and
            # np.min would raise ValueError, so such rows are skipped.
            continue

        history_mean = np.mean(history)
        differences = (np.diff(history) - history_mean) / history_mean
        # differences =  (np.diff(array) / np.mean(array))
        min_consecutive_difference = np.min(differences)

        if min_consecutive_difference < min_diff:
            min_diff = min_consecutive_difference
            selected_row = row

    return selected_row

# Pick a row from the sorted results, report it and persist it as CSV.
result_row = find_least_difference_row(df_sorted)
print("Selected row:", result_row, sep="\n")
print(result_row['Config'])
result_row.to_csv('MLCup_FirstTry.csv')

## Plot

In [None]:
import matplotlib.pyplot as plt
def plot_history2(name, lines: dict, fig_size=(10, 6)):
    """Plot several labelled curves on one figure and save it as EPS and PNG.

    Args:
        name: Output path without extension; '.eps' and '.png' are appended.
        lines: Mapping from legend label to the sequence of values to plot.
        fig_size: Figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=fig_size)
    for label, series in lines.items():
        plt.plot(series, label=label)
    plt.legend(fontsize=12)
    # Same figure written once per format, EPS first.
    for extension in ('eps', 'png'):
        plt.savefig(name + '.' + extension, format=extension)

# Training curve stored in the selected row.
plot_history2(
    name='MLCup_train',
    lines={result_row["Name"]: result_row['History']},
    fig_size=(10, 8),
)

# Earlier training/validation histories of the same configuration.
# NOTE: each series is sorted in descending order before plotting
# (-np.sort(-x)), so these curves are not in chronological epoch order.
plot_history2(
    name='MLCup_HoldOut',
    lines={
        'HoldOutTrain' + result_row["Name"]: -np.sort(-np.array(result_row['Old_History'])),
        'HoldOutTrainVal' + result_row["Name"]: -np.sort(-np.array(result_row['Old_History_val'])),
    },
    fig_size=(10, 8),
)


## Final Retraining
We perform the final model selection with K-fold cross validation over the entire dataset, then retrain the selected model on all of it.

In [None]:
# Load the complete ML Cup training set (inputs and labels).
full_dataset, full_dataset_labels = read_cup_training_dataset('../exclusiveAI/datasets')
# 4-fold validation of `configs[1]` (as rebound by the K-fold cell) on the full
# dataset, in regression mode, keeping a single configuration (max_configs=1).
results = validate(configs[1], x=full_dataset, y_true=full_dataset_labels, max_configs=1, n_splits=4, number_of_initializations=2, epochs=epochs, batch_size=batch_size, regression=True, return_models_history=True)
# NOTE(review): the K-fold cell reads its configuration as `configs[1][0]`
# from the same function's result; confirm that with max_configs=1 the second
# element is a single configuration rather than a list.
model_config = results[1]

# Compose the chosen configuration, train it on the full dataset and save it.
model = Composer(config=model_config).compose(regression=True)
history = model.train(inputs=full_dataset, input_label=full_dataset_labels, epochs=epochs, batch_size=batch_size)
model.save('MLCup_model.h5')

In [None]:
# histories = results[0]
# history_cv = {key: histories[key] for key in histories if 'mee' in  key}
# plot_history({key: history[key] for key in history if 'mee' in key})

In [None]:
from exclusiveAI.components.NeuralNetwork import NeuralNetwork
# Reload the model saved by the final-retraining cell.
final_model = NeuralNetwork.load('MLCup_model.h5')
# Evaluate it on the internal test split (arguments passed positionally here;
# the assessment cell passes them as `input=` / `input_label=`).
# NOTE(review): the final model was trained on `full_dataset`; if that set
# contains the internal test rows, this score is optimistic — confirm.
res = final_model.evaluate(test_data, test_labels, metrics=['mse', 'mee'])
print(res)

## Blind Test

In [None]:
# Load the blind-test inputs.
test_dataset = read_cup_test_dataset('../exclusiveAI/datasets')
# Only the last expression of a cell is rendered, so end on the data just
# loaded (the cell previously ended on the unrelated internal `test_labels`).
test_dataset

In [None]:
# Predict the blind-test inputs with the reloaded final model.
predictions = final_model.predict(test_dataset)

# One row per prediction, written with its row index, no header, 16 decimals.
df = pd.DataFrame(data=predictions)
df.to_csv(path_or_buf="./final_prediction.csv", index=True, header=False, float_format='%.16f')