# FT Replicate 1

In [1]:
import sys
sys.path.append('/home/diego/Git/thesis-tabtrans')

import numpy as np
import torch
import torch.nn as nn
from utils import training, callback, evaluating, attention, data
from sklearn import datasets, model_selection
import skorch
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
#####################################################
# Configuration
#####################################################

dataset = "anneal"
aggregator = "cls"

print(f"Using -- Dataset:{dataset} Aggregator:{aggregator}")

Using -- Dataset:anneal Aggregator:cls


In [4]:
task_id = 233090 #anneal dataset

# Load the Iris dataset
X_train, X_test, y_train, y_test, train_indices, val_indices, n_instances, n_labels, n_numerical, n_categories = data.import_data(task_id, 100) #this function returns a dictionary with the dataset's data and metadata

INFO:openml.datasets.dataset:pickle write anneal


In [7]:
"""
Transformer hyperparameters definition.

n_heads: Number of heads oneach Transformer Encoder.
embed_dim: The embeddings' dimension.
n_layers: Number of stacked Transformer Encoders.
ff_pw_size: Position-wise Feed Forward network hidden layer size.
attn_dropout: Dropout applied in the Multi-head self-attention mechanism.
ff_dropout: Position-wise Feed Forward network dropout.
aggregator: Aggregator to use. Must be in {concatenate, cls, max, mean, sum, rnn}
aggregator_parameters: If the aggregator is different from rnn, set to None. Otherwise, a dictionary expecting:
                        cell: Cellused in the RNN. Must be one of {GRU, LSTM}
                        output_size: Recurrent neural network hidden size 
                        num_layers: Number of stacked layers in the RNN
                        dropout: Dropout applied to the RNN
    }
decoder_hidden_units: List of hidden layer's sizes of the decoder MLP.
decoder_activation_fn: Activation function used in the hidden layers
need_weights: Set True if you require the attention cubes. During training is recommended to set it to False.
numerical_passthrough: False if numerical features will be processed by the Multi-head self-attention mechanism.
"""


n_heads = 4 # In average 4 works better
embed_dim = 128 # In average 256 works better
n_layers = 2
ff_pw_size = 30  #this value because of the paper 
attn_dropout = 0.3 #paper
ff_dropout = 0.1 #paper value
aggregator = "cls"
aggregator_parameters = None
decoder_hidden_units = [128,64] #paper value
decoder_activation_fn = nn.ReLU()
need_weights = False
numerical_passthrough = False

epochs = 50


In [6]:
"""
Building PyTorch module.

We provide a wrapper function for building the PyTorch module.
The function is utils.training.build_module.
"""
#module
module = training.build_module(
    n_categories, # List of number of categories
    n_numerical, # Number of numerical features
    n_heads, # Number of heads per layer
    ff_pw_size, # Size of the MLP inside each transformer encoder layer
    n_layers, # Number of transformer encoder layers    
    n_labels, # Number of output neurons
    embed_dim,
    attn_dropout, 
    ff_dropout, 
    aggregator, # The aggregator for output vectors before decoder
    rnn_aggregator_parameters=aggregator_parameters,
    decoder_hidden_units=decoder_hidden_units,
    decoder_activation_fn=decoder_activation_fn,
    need_weights=need_weights,
    numerical_passthrough=numerical_passthrough
) 



In [11]:
"""
Wrapping module in skorch.

The PyTorch module can be used for a custom training.

However, in this example we use the skorch library,
which avoid the implementation of a custom training loop.
"""

model = skorch.NeuralNetClassifier(
            module = module,
            criterion=torch.nn.CrossEntropyLoss,
            optimizer=torch.optim.AdamW,
            device= "cuda", #cuda" if torch.cuda.is_available() else
            batch_size=32,
            train_split=None,
            max_epochs= epochs,
            optimizer__lr=1e-4,
            optimizer__weight_decay=1e-4
        )

'''
#train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
callbacks=[
                ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
                ("accuracy", skorch.callbacks.EpochScoring("accuracy", lower_is_better=False)),
                ("duration", skorch.callbacks.EpochTimer())
            ]
'''   

'\n#train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),\ncallbacks=[\n                ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),\n                ("accuracy", skorch.callbacks.EpochScoring("accuracy", lower_is_better=False)),\n                ("duration", skorch.callbacks.EpochTimer())\n            ]\n'

In [12]:
"""
Training and validation
"""

model = model.fit(X={
        "x_numerical": X_train[:, :n_numerical].astype(np.float32),
        "x_categorical": X_train[:, n_numerical:].astype(np.int32)
        }, 
        y=y_train.astype(np.int64)
    )

  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.0829[0m  0.1258


      2        [36m0.0825[0m  0.1214
      3        [36m0.0709[0m  0.1164
      4        0.0855  0.1188
      5        0.0777  0.1164
      6        0.0905  0.1166
      7        0.0853  0.1186
      8        0.0753  0.1166
      9        0.0854  0.1175
     10        0.0792  0.1187
     11        0.0840  0.1173
     12        0.0849  0.1175
     13        0.0713  0.1167
     14        0.0832  0.1173
     15        [36m0.0627[0m  0.1167
     16        0.0856  0.1166
     17        0.0789  0.1171
     18        [36m0.0585[0m  0.1175
     19        [36m0.0553[0m  0.1182
     20        0.0581  0.1160
     21        0.0553  0.1174
     22        0.0601  0.1170
     23        0.0589  0.1170
     24        0.0564  0.1162
     25        0.0582  0.1187
     26        0.0585  0.1160
     27        [36m0.0463[0m  0.1173
     28        0.0622  0.1167
     29        0.0517  0.1160
     30        [36m0.0400[0m  0.1169
     31        0.0514  0.1166
     32        0.0433  0.1159
     33

In [15]:
predictions = model.predict_proba(X={
    "x_numerical": X_train[:, :n_numerical].astype(np.float32),
    "x_categorical": X_train[:, n_numerical:].astype(np.int32)
    }
    )

In [18]:
print("Test results in validation:\n")
print(evaluating.get_default_scores(y_train.astype(np.int64), predictions, multiclass = True))


Test results in validation:

{'balanced_accuracy': 0.9990859232175503, 'accuracy': 0.9971910112359551, 'log_loss': 0.008496357153436617}
