# All in 1: train and evaluate every (layers, heads) configuration for one OpenML task

In [1]:
def create_parameters(task_id, Layers, Heads, Emedding_dim, batch_size):
    """Bundle the experiment settings into a single dictionary.

    The returned mapping uses the argument names as keys ("task_id",
    "Layers", "Heads", "Emedding_dim", "batch_size"), in that order, and
    stores the given objects as-is (no copies are made).
    """
    keys = ("task_id", "Layers", "Heads", "Emedding_dim", "batch_size")
    values = (task_id, Layers, Heads, Emedding_dim, batch_size)
    return dict(zip(keys, values))

In [2]:
import sys
project_path = "/home/diego/Git/thesis-tabtrans"
sys.path.append(project_path) #import folders from the project_path

import os
import numpy as np
import torch
import torch.nn as nn
from utils import training, callback, evaluating, attention, data, plots 
from sklearn import datasets, model_selection
import skorch
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import openml
from sklearn import datasets, model_selection
from skorch.callbacks import Checkpoint, EarlyStopping, LoadInitState, EpochScoring, Checkpoint, TrainEndCheckpoint
import csv

In [3]:
def new_folder(project_path, new_folder_name):
    """Make sure the folder ``new_folder_name`` exists inside ``project_path``.

    Parameters
    ----------
    project_path : str or os.PathLike
        Directory that should contain the new folder.
    new_folder_name : str
        Name (or relative sub-path) of the folder to create.

    Raises
    ------
    FileExistsError
        If the target path already exists but is not a directory.

    Calling this for a folder that already exists is a no-op. Missing
    intermediate directories are created as well.
    """
    # Create the full path for the new folder
    new_folder_path = os.path.join(project_path, new_folder_name)

    # exist_ok=True replaces the separate "exists?" check, so there is no window
    # in which another process can create the folder between check and creation.
    os.makedirs(new_folder_path, exist_ok=True)

In [4]:
def export_to_csv(results_table, columns_names, folder_path):
    """Write the results table to ``<folder_path>/results.csv``.

    Parameters
    ----------
    results_table : iterable of iterables
        Data rows; each inner iterable becomes one CSV line.
    columns_names : iterable
        Header row written before the data rows.
    folder_path : str or os.PathLike
        Existing directory that receives ``results.csv``. An existing file
        with that name is overwritten.
    """
    # os.path.join (instead of string concatenation) matches how the rest of
    # the notebook builds paths and also accepts pathlib.Path objects.
    file_path = os.path.join(folder_path, "results.csv")

    # newline='' is what the csv module expects; the explicit encoding keeps
    # the file independent of the platform's default locale.
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)

        # Header first, then all data rows
        csv_writer.writerow(columns_names)
        csv_writer.writerows(results_table)

In [5]:
def _split_model_inputs(X, n_numerical):
    """Split one feature matrix into the two keyword inputs passed to the network.

    Columns ``[:n_numerical]`` are cast to float32 and the remaining columns to
    int32, mirroring the casts used for both ``fit`` and ``predict_proba``.
    """
    return {
        "x_numerical": X[:, :n_numerical].astype(np.float32),
        "x_categorical": X[:, n_numerical:].astype(np.int32),
    }


def _warn_if_not_finite(split_name, x_numerical):
    """Emit a RuntimeWarning when the numerical features contain NaN or inf."""
    import warnings

    if not np.isfinite(x_numerical).all():
        warnings.warn(
            f"{split_name} numerical features contain NaN/inf values; the loss and the "
            "predicted probabilities will most likely become NaN. Impute or drop the "
            "affected entries before training.",
            RuntimeWarning,
            stacklevel=3,
        )


def _ensure_subfolder(parent, name):
    """Create ``parent/name`` if needed (via ``new_folder``) and return its path."""
    new_folder(parent, name)
    return os.path.join(parent, name)


def model_creation(parameters, project_path, random_state=None):
    """Train and evaluate one model per (n_layers, n_heads) combination.

    Parameters
    ----------
    parameters : dict
        Must provide the keys "task_id", "Layers", "Heads", "Emedding_dim" and
        "batch_size". "Layers" and "Heads" are iterables; every pair of their
        values is run as one numbered experiment.
    project_path : str
        Base directory. Outputs go to
        ``<project_path>/data_models/<dataset_name>/``.
    random_state : optional
        Forwarded to ``model_selection.train_test_split`` so the
        train/validation split can be reproduced. The default (None) keeps the
        split unseeded.

    Side effects
    ------------
    * writes ``validation_indices`` (one integer index per line) in the dataset folder
    * for each experiment creates ``experiment_<i>/checkpoints`` and
      ``experiment_<i>/plots`` and saves ``figure1.png`` / ``figure2.png``
    * prints the test scores of each experiment
    * writes ``results.csv`` with one row per experiment

    Returns
    -------
    None
    """
    # extract parameters
    task_id = parameters["task_id"]
    layers = parameters["Layers"]
    heads = parameters["Heads"]
    embed_dim = parameters["Emedding_dim"]
    batch_size = parameters["batch_size"]

    columns_names = ["dataset_name", "experiment_num", "n_layers", "n_heads", "embed_dim", "batch_size", "balanced_accuracy", "accuracy", "log_loss"]
    results_table = []

    # fixed hyper-parameters of the model
    ff_pw_size = 30  # value taken from the paper
    attn_dropout = 0.3  # paper value
    ff_dropout = 0.1  # paper value
    aggregator = "cls"
    aggregator_parameters = None
    decoder_hidden_units = [128, 64]  # paper value
    decoder_activation_fn = nn.ReLU()
    need_weights = False
    numerical_passthrough = False

    # get the dataset_name
    dataset_name = data.get_dataset_name(task_id)

    X_train, X_test, y_train, y_test, n_instances, n_labels, n_numerical, n_categories = data.import_data(task_id, "task")

    # Hold out 33.3% of the training rows for validation.
    # NOTE(review): the previous comment said the hold-out is 1/9 of train
    # (10% of the total), which does not match test_size=0.333 - confirm the
    # intended fraction.
    train_indices, val_indices = model_selection.train_test_split(
        np.arange(X_train.shape[0]), test_size=0.333, random_state=random_state
    )

    # The casts do not depend on the experiment, so do them once up front.
    train_inputs = _split_model_inputs(X_train, n_numerical)
    test_inputs = _split_model_inputs(X_test, n_numerical)
    train_targets = y_train.astype(np.int64)
    test_targets = y_test.astype(np.int64)

    # Diagnostic only: surface non-finite inputs before spending time on training.
    _warn_if_not_finite("training", train_inputs["x_numerical"])
    _warn_if_not_finite("test", test_inputs["x_numerical"])

    # folder layout: <project_path>/data_models/<dataset_name>/
    path_of_data_models = _ensure_subfolder(project_path, "data_models")
    path_of_dataset = _ensure_subfolder(path_of_data_models, dataset_name)

    # Persist the validation indices; fmt="%d" stores them as integers rather
    # than in np.savetxt's default float scientific notation.
    np.savetxt(os.path.join(path_of_dataset, "validation_indices"), val_indices, fmt="%d")

    combinations = [(n_layers, n_heads) for n_layers in layers for n_heads in heads]

    for experiment_num, (n_layers, n_heads) in enumerate(combinations, start=1):
        # experiment folder: holds the checkpoints and the plots of this run
        path_of_experiment = _ensure_subfolder(path_of_dataset, f"experiment_{experiment_num}")
        path_of_checkpoint = _ensure_subfolder(path_of_experiment, "checkpoints")
        path_of_plots = _ensure_subfolder(path_of_experiment, "plots")

        # module
        module = training.build_module(
            n_categories, # List of number of categories
            n_numerical, # Number of numerical features
            n_heads, # Number of heads per layer
            ff_pw_size, # Size of the MLP inside each transformer encoder layer
            n_layers, # Number of transformer encoder layers
            n_labels, # Number of output neurons
            embed_dim,
            attn_dropout,
            ff_dropout,
            aggregator, # The aggregator for output vectors before decoder
            rnn_aggregator_parameters=aggregator_parameters,
            decoder_hidden_units=decoder_hidden_units,
            decoder_activation_fn=decoder_activation_fn,
            need_weights=need_weights,
            numerical_passthrough=numerical_passthrough
        )

        # MODEL
        model = skorch.NeuralNetClassifier(
            module=module,
            criterion=torch.nn.CrossEntropyLoss,
            optimizer=torch.optim.AdamW,
            device="cuda" if torch.cuda.is_available() else "cpu",
            batch_size=batch_size,
            max_epochs=100,
            # fixed split: the same train/validation rows for every experiment
            train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
            callbacks=[
                ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
                ("duration", skorch.callbacks.EpochTimer()),
                EpochScoring(scoring='accuracy', name='train_acc', on_train=True),
                Checkpoint(monitor='valid_acc_best', dirname=path_of_checkpoint, load_best=True),
                EarlyStopping(patience=15)
            ],
            optimizer__lr=1e-4,
            optimizer__weight_decay=1e-4
        )

        # TRAINING
        model = model.fit(X=train_inputs, y=train_targets)

        # TESTING
        predictions = model.predict_proba(X=test_inputs)

        print("Test results:\n")
        # Score once (with the int64 targets) and reuse the result for both the
        # printout and the results table.
        scores = evaluating.get_default_scores(test_targets, predictions, multiclass=True)
        print(scores)

        # save the results in a list
        results_table.append([
            dataset_name,
            experiment_num,
            n_layers,
            n_heads,
            embed_dim,
            batch_size,
            scores["balanced_accuracy"],
            scores["accuracy"],
            scores["log_loss"],
        ])

        # create and save the plots
        # NOTE(review): the figures are not closed after saving; if these are
        # matplotlib figures, closing them would cap memory use on large grids - confirm.
        fig_1, fig_2 = plots.model_plots(model, f"Experiment {experiment_num}")
        fig_1.savefig(os.path.join(path_of_plots, 'figure1.png'))
        fig_2.savefig(os.path.join(path_of_plots, 'figure2.png'))

    export_to_csv(results_table, columns_names, path_of_dataset)


            


    


In [6]:
# --- Experiment configuration ---
task_id = 233092  # OpenML task to train on

# Hyper-parameter grid: every (layers, heads) pair is one experiment
Layers, Heads = [4], [4]
total_combs = len(Layers) * len(Heads)

# The embedding size is set one by one to avoid the out of memory error
Emedding_dim = 128
batch_size = 64

In [7]:
# Collect the configuration values above into the dict consumed by model_creation
parameters = create_parameters(
    task_id=task_id,
    Layers=Layers,
    Heads=Heads,
    Emedding_dim=Emedding_dim,
    batch_size=batch_size,
)

In [8]:
# Reuse the `project_path` already defined in the import cell instead of
# hard-coding the same absolute path a second time, so the output folder and
# the `sys.path` entry cannot drift apart.
model_creation(parameters, project_path)

INFO:openml.datasets.dataset:pickle write arrhythmia




  epoch    balanced_accuracy    train_acc    train_loss    valid_acc    valid_loss    cp     dur
-------  -------------------  -----------  ------------  -----------  ------------  ----  ------
      1               [36m0.1000[0m       [32m0.5104[0m           nan       [31m0.5702[0m           nan     +  0.6535
      2               0.1000       0.5228           nan       0.5702           nan        0.4788
      3               0.1000       0.5228           nan       0.5702           nan        0.4762
      4               0.1000       0.5228           nan       0.5702           nan        0.4763
      5               0.1000       0.5228           nan       0.5702           nan        0.4760
      6               0.1000       0.5228           nan       0.5702           nan        0.4767
      7               0.1000       0.5228           nan       0.5702           nan        0.4768
      8               0.1000       0.5228           nan       0.5702           nan        0.4755
   

ValueError: Input contains NaN.