In [2]:
import sys
import os

# Absolute path to the root of the project checkout.
# NOTE(review): machine-specific path; it has to be adjusted when the notebook is run elsewhere.
project_path = "/home/diego-ngz/Git/thesis-tabtrans"

# Put the project root on the module search path; presumably this is what makes
# the `utils` package imported below resolvable (confirm against the repo layout).
sys.path.append(project_path)

from utils import data, tabtrans_file, plots,attention, training, attention_file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import pickle
import plotly.express as px
from skorch.callbacks import TrainEndCheckpoint, EarlyStopping, LoadInitState, Checkpoint
import skorch
import torch.nn as nn
import torch
from scipy.stats import entropy

/home/diego-ngz/Git/thesis-tabtrans


In [3]:
'''
Steps to follow:
1. Load the data.
2. Split the data into training and test sets.
3. Split a validation set off the training set.
4. Import the best hyperparameters.
5. Train the model and, at every epoch, check the validation error, the training error, and the attention matrix.
6. Stop training once the entropy has not decreased for 10 consecutive epochs.
'''

'\nSteps to follow:\n1. Load the data.\n2. Split the data into training and test sets.\n3. Split a validation set off the training set.\n4. Import the best hyperparameters.\n5. Train the model and, at every epoch, check the validation error, the training error, and the attention matrix.\n6. Stop training once the entropy has not decreased for 10 consecutive epochs.\n'

In [4]:
# Identifier of the dataset this notebook works on.
df_id = 2

# Load the train/test arrays together with the index sets and the dataset dimensions.
#   train_indices: the rows used for training the model
#   val_indices:   the rows used for validation (20% of the training set)
(
    X_train,
    X_test,
    y_train,
    y_test,
    train_indices,
    val_indices,
    n_instances,
    n_labels,
    n_numerical,
    n_categories,
) = data.import_data(df_id)

# Number of columns of the training array.
n_features = X_train.shape[1]

name_df = data.get_dataset_name(df_id)

# Root folder of everything stored for this dataset.
path_of_datset = f'{project_path}/Final_models_4/{name_df}'

# Folder holding the hyperparameter-selection results.
path_to_hyperparameters = f'{path_of_datset}/tabtrans/hyperparameter_selection'

# Folder of the final tabtrans model.
path_to_final_tabtrans = f'{path_of_datset}/tabtrans/final_tabtrans_cv'

# The hyperparameter results are read from the sub-folder named after `sample`.
sample = 100
path_of_hyper_size = f'{path_to_hyperparameters}/{sample}'
path_of_hyper_results = f'{path_of_hyper_size}/results.csv'


INFO:openml.datasets.dataset:pickle write anneal


In [5]:

# Hand-set model settings (the other hyperparameters are imported from the
# hyperparameter-selection results).

# Encoder settings.
ff_pw_size = 30      # value taken from the paper
attn_dropout = 0.3   # value taken from the paper
ff_dropout = 0.1     # value taken from the paper

# Aggregation of the output vectors before the decoder.
aggregator = "cls"
aggregator_parameters = None

# Decoder settings.
decoder_hidden_units = [128, 64]   # value taken from the paper: [128, 64]
decoder_activation_fn = nn.ReLU()

# Remaining switches handed to the module builder.
need_weights = False
numerical_passthrough = False


In [6]:
# Read the selected hyperparameters from the results file.
hyperparameters = data.import_hyperparameters(path_of_hyper_results, cv = True)

# Cast every value that is used below to a plain int, in one pass.
n_layers, n_heads, embedding_size, batch_size, epochs = (
    int(hyperparameters[key])
    for key in ("n_layers", "n_heads", "embedding_size", "batch_size", "max_epochs_mean")
)

In [7]:
# Build the network module from the dataset dimensions, the imported
# hyperparameters and the hand-set settings defined in the cells above.
# The first ten arguments are passed by position, so their order matters.
module = training.build_module(
    n_categories, # List with the number of categories
    n_numerical, # Number of numerical features
    n_heads, # Number of attention heads per layer
    ff_pw_size, # Size of the MLP inside each transformer encoder layer
    n_layers, # Number of transformer encoder layers
    n_labels, # Number of output neurons
    embedding_size,
    attn_dropout, 
    ff_dropout, 
    aggregator, # The aggregator applied to the output vectors before the decoder
    rnn_aggregator_parameters=aggregator_parameters,
    decoder_hidden_units=decoder_hidden_units,
    decoder_activation_fn=decoder_activation_fn,
    need_weights=need_weights,
    numerical_passthrough=numerical_passthrough
)



In [8]:
# Train the model in consecutive stages and store a train-end checkpoint after
# each stage, in a directory named after the total number of epochs trained so far.
path_to_checkpoint = f"{path_of_datset}/entropy" #create the path to save the checkpoint
os.makedirs(path_to_checkpoint, exist_ok = True)

if epochs < 1:
    raise ValueError(f"epochs must be at least 1, got {epochs}")

# Checkpoint schedule: four evenly spaced milestones plus the final epoch.
# With fewer than 5 epochs `epochs // 5` is 0, which would fit for 0 epochs and
# send several checkpoints to the same `epoch_0` directory; use a step of at
# least 1 and drop duplicated milestones instead. For epochs >= 5 the schedule
# is exactly [step, 2*step, 3*step, 4*step, epochs].
intervals_size = max(1, epochs // 5)
epochs_to_save = sorted({min(k * intervals_size, epochs) for k in range(1, 5)} | {epochs})


def _make_stage_classifier(stage_epochs, stage_callbacks):
    """Build the classifier for one training stage.

    Every stage uses the same settings; only the number of epochs to train
    and the callbacks (checkpoint to resume from / checkpoint to write) differ.

    Parameters
    ----------
    stage_epochs : int
        Number of epochs this stage trains for.
    stage_callbacks : list
        skorch callbacks attached to the classifier.

    Returns
    -------
    skorch.NeuralNetClassifier
        The configured, not yet fitted, classifier.
    """
    return skorch.NeuralNetClassifier(
        module = module,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.AdamW,
        device= "cuda", #cuda" if torch.cuda.is_available() else
        batch_size = batch_size,
        train_split = None,  # no internal validation split: fit on everything passed in
        max_epochs = stage_epochs,
        optimizer__lr=1e-4,
        optimizer__weight_decay=1e-4,
        callbacks=stage_callbacks
        )


# The inputs are the same for every stage, so convert them only once.
train_inputs = {
    "x_numerical": X_train[:, :n_numerical].astype(np.float32),
    "x_categorical": X_train[:, n_numerical:].astype(np.int32)
    }
train_targets = y_train.astype(np.int64)

# First stage: train from scratch up to the first milestone.
train_end_cp = TrainEndCheckpoint(dirname = f"{path_to_checkpoint}/epoch_{intervals_size}")

model = _make_stage_classifier(intervals_size, [train_end_cp])
model = model.fit(X=train_inputs, y=train_targets)

print("--------------------------------------------------------------------")
print(f"Model saved for {intervals_size} epochs")

# Remaining stages: resume from the previous checkpoint and train up to the next milestone.
for i in range(1,len(epochs_to_save)):

    epoch = epochs_to_save[i]

    # LoadInitState has to be created before train_end_cp is re-bound, so that
    # it points at the checkpoint written by the previous stage.
    load_state = LoadInitState(train_end_cp) #load the state of the past model
    train_end_cp = TrainEndCheckpoint(dirname = f"{path_to_checkpoint}/epoch_{epoch}")

    # Train only for the difference between two milestones, because the stage
    # starts where the previous one ended.
    model = _make_stage_classifier(epoch - epochs_to_save[i-1], [load_state, train_end_cp])
    model = model.fit(X=train_inputs, y=train_targets)

    print("--------------------------------------------------------------------")
    print(f"Model saved for {epoch} epochs")



  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.9362[0m  0.4392
      2        [36m0.7370[0m  0.2628
      3        [36m0.5795[0m  0.2636
      4        [36m0.4020[0m  0.2642
      5        [36m0.2729[0m  0.2641
      6        [36m0.1874[0m  0.2642
      7        [36m0.1438[0m  0.2622
      8        [36m0.1151[0m  0.2645
      9        [36m0.0985[0m  0.2635
     10        [36m0.0865[0m  0.2645
--------------------------------------------------------------------
Model saved for 10 epochs
  epoch    train_loss     dur
-------  ------------  ------
     11        [36m0.0693[0m  0.3684
     12        [36m0.0567[0m  0.3843
     13        [36m0.0423[0m  0.3850
     14        [36m0.0329[0m  0.3840
     15        0.0336  0.3853
     16        [36m0.0234[0m  0.3821
     17        0.0282  0.3852
     18        [36m0.0176[0m  0.3849
     19        0.0277  0.3853
     20        0.0177  0.3862
  epoch    train_loss     dur
-------  --

In [None]:
# Average normalised entropy of the attention matrix of the current model:
# the base-2 entropy of each row, divided by log2(n_features), averaged over rows.
# NOTE(review): the matrix is requested with n_features+1 while the
# normalisation uses n_features; confirm that this difference is intended.
epoch_avg_entropy = []

matrix = attention_file.attention_matrix(
    model,
    X_train[train_indices],
    y_train[train_indices],
    n_numerical,
    n_layers,
    n_heads,
    n_features+1,
)

row_entropy_bits = np.apply_along_axis(entropy, 1, matrix, base=2)
entropy_per_row = row_entropy_bits / np.log2(n_features)
average_entropy = entropy_per_row.mean()

epoch_avg_entropy.append(average_entropy)

In [None]:
# Resume from the current train-end checkpoint and train for 3 more epochs.
# NOTE(review): the epoch count (3) and the target directory (epoch_4) are
# hard-coded here instead of being derived from the checkpoint schedule.
# LoadInitState must be created first, while train_end_cp still refers to the
# checkpoint that is to be loaded.
load_state = LoadInitState(train_end_cp)
train_end_cp = TrainEndCheckpoint(dirname = f"{path_to_checkpoint}/epoch_{4}")

classifier_settings = dict(
    module = module,
    criterion = torch.nn.CrossEntropyLoss,
    optimizer = torch.optim.AdamW,
    device = "cuda",
    batch_size = batch_size,
    train_split = None,
    max_epochs = 3,
    optimizer__lr = 1e-4,
    optimizer__weight_decay = 1e-4,
    callbacks = [load_state, train_end_cp],
)
model = skorch.NeuralNetClassifier(**classifier_settings)

# Numerical columns are the first n_numerical ones; the rest are categorical.
x_numerical = X_train[:, :n_numerical].astype(np.float32)
x_categorical = X_train[:, n_numerical:].astype(np.int32)

model = model.fit(
    X={"x_numerical": x_numerical, "x_categorical": x_categorical},
    y=y_train.astype(np.int64),
)


In [None]:
# Classifier that starts from the state loaded by `load_state`, trains for a
# single epoch and validates on the predefined (train_indices, val_indices) split.
validation_split = skorch.dataset.ValidSplit(((train_indices, val_indices),))

model = skorch.NeuralNetClassifier(
    module=module,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.AdamW,
    optimizer__lr=1e-4,
    optimizer__weight_decay=1e-4,
    batch_size=batch_size,
    max_epochs=1,
    device="cuda",
    train_split=validation_split,
    callbacks=[load_state],
)

In [None]:
# Fit the classifier: the first n_numerical columns are fed as the numerical
# input, the remaining columns as the categorical input.
numerical_part = X_train[:, :n_numerical]
categorical_part = X_train[:, n_numerical:]

model = model.fit(
    X={
        "x_numerical": numerical_part.astype(np.float32),
        "x_categorical": categorical_part.astype(np.int32),
    },
    y=y_train.astype(np.int64),
)

In [None]:
# Disabled draft of the epoch-by-epoch training loop: it is kept as a bare
# string literal, so none of it is executed.
# NOTE(review): as written it relies on `epoch_counter`, on `max_epochs` and on a
# bare `EpochScoring`; none of these is defined or imported in the cells before
# this one, so they have to be provided before the loop is re-enabled.
'''
while epoch_counter <= max_epochs:
    load_state = LoadInitState(train_end_cp) #load the state of the past model
    train_end_cp = TrainEndCheckpoint(dirname = path_to_checkpoint)
    
    #Train the first model    
    model = skorch.NeuralNetClassifier(
        module=module,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.AdamW,
        device = "cuda", #if torch.cuda.is_available() else "cpu",
        batch_size = batch_size,
        max_epochs = 1,
        train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
        callbacks=[load_state, train_end_cp,
            ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
            ("duration", skorch.callbacks.EpochTimer()),
            EpochScoring(scoring='accuracy', name='train_acc', on_train=True),
            #Checkpoint(dirname = path_of_checkpoint, load_best = True), 
            EarlyStopping(patience=10)

        ],
        optimizer__lr=1e-4,
        optimizer__weight_decay=1e-4
    )

    #Trainning 
    model = model.fit(X={
        "x_numerical": X_train[:, :n_numerical].astype(np.float32),
        "x_categorical": X_train[:, n_numerical:].astype(np.int32)
        }, 
        y=y_train.astype(np.int64)
        )
    
    matrix = attention_file.attention_matrix(model, X_train[train_indices], y_train[train_indices], n_numerical, n_layers, n_heads, n_features+1)
    entropy_per_row = np.apply_along_axis(entropy, 1, matrix, base=2) / np.log2(n_features)
    average_entropy = np.mean(entropy_per_row)
    epoch_avg_entropy.append(average_entropy)
    
    epoch_counter += 1
    
'''

In [None]:
# Train the model for one epoch with a validation split, then record the
# average normalised attention entropy of the trained model.

# Only referenced by the disabled loop that is kept as a string further down in this cell.
max_epochs = 5

path_to_checkpoint = f"{path_of_datset}/entropy" #create the path to save the checkpoint
os.makedirs(path_to_checkpoint, exist_ok = True)

#first let's define the train end checkpoint
train_end_cp = TrainEndCheckpoint(dirname = path_to_checkpoint)

# One entry per trained epoch.
average_entropy = []


model = skorch.NeuralNetClassifier(
    module=module,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.AdamW,
    device = "cuda", #if torch.cuda.is_available() else "cpu",
    batch_size = batch_size,
    max_epochs = 1,
    train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
    callbacks=[train_end_cp,
        ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
        ("duration", skorch.callbacks.EpochTimer()),
        # EpochScoring is not imported by name in this notebook, so it is
        # reached through the skorch package, like the scorer two lines above.
        skorch.callbacks.EpochScoring(scoring='accuracy', name='train_acc', on_train=True),
        #Checkpoint(dirname = path_of_checkpoint, load_best = True), 
        EarlyStopping(patience=10)

    ],
    optimizer__lr=1e-4,
    optimizer__weight_decay=1e-4
)

#Training
model = model.fit(X={
    "x_numerical": X_train[:, :n_numerical].astype(np.float32),
    "x_categorical": X_train[:, n_numerical:].astype(np.int32)
    }, 
    y=y_train.astype(np.int64)
    )

# Compute the value to record. Appending the bare name `entropy` would store
# the scipy.stats.entropy function itself, not a number. The computation is
# the same one used by the earlier entropy cell of this notebook.
# NOTE(review): the disabled loop below called
# attention_file.entropy_attention_matrix instead; confirm which one is intended.
attention = attention_file.attention_matrix(model, X_train[train_indices], y_train[train_indices], n_numerical, n_layers, n_heads, n_features+1)
attention_entropy = np.mean(np.apply_along_axis(entropy, 1, attention, base=2) / np.log2(n_features))
average_entropy.append(attention_entropy)

print(average_entropy)
# Disabled draft of the epoch-by-epoch loop: kept as a bare string literal, so
# it is not executed.
# NOTE(review): it assigns to `entropy`, which would shadow the
# scipy.stats.entropy function imported in the first cell, and it uses a bare
# `EpochScoring`, which the import cell does not bring into scope.
''' 
epoch_counter = 1

while epoch_counter <= max_epochs:
    load_state = LoadInitState(train_end_cp) #load the state of the past model
    
    #Train the first model    
    model = skorch.NeuralNetClassifier(
        module=module,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.AdamW,
        device = "cuda", #if torch.cuda.is_available() else "cpu",
        batch_size = batch_size,
        max_epochs = 1,
        train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
        callbacks=[
            load_state,
            train_end_cp,
            ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
            ("duration", skorch.callbacks.EpochTimer()),
            EpochScoring(scoring='accuracy', name='train_acc', on_train=True),
        ],
        optimizer__lr=1e-4,
        optimizer__weight_decay=1e-4
    )

    model = model.fit(X={
        "x_numerical": X_train[:, :n_numerical].astype(np.float32),
        "x_categorical": X_train[:, n_numerical:].astype(np.int32)
        }, 
        y=y_train.astype(np.int64)
        )
    
    entropy = attention_file.entropy_attention_matrix(model, X_train, y_train, n_numerical, n_layers, n_heads, n_features)
    average_entropy.append(entropy)
    
    epoch_counter += 1
'''

In [None]:
# Turn the per-epoch records of the training history into parallel lists.
# NOTE(review): this re-binds `epochs`, which earlier in the notebook holds the
# integer number of epochs read from the hyperparameters.
history_rows = list(model.history)

epochs = [row["epoch"] for row in history_rows]

train_acc = [row['train_acc'] for row in history_rows]
val_acc = [row['valid_acc'] for row in history_rows]

train_loss = [row["train_loss"] for row in history_rows]
val_loss = [row["valid_loss"] for row in history_rows]