# Usage example

This notebook walks through a complete pipeline for training, validating, and testing a Transformer for tabular data using the developed architecture.

We also include an example of how to extract the attention cubes and compute the cumulative attention.

In [1]:
import numpy as np
import plotly.express as px
import skorch
import torch
import torch.nn as nn
from sklearn import datasets, model_selection

from utils import training, callback, evaluating, attention

## Data simulation

First, we simulate a dataset containing numerical and categorical features. This step stands in for importing and preprocessing your own dataset. Note that the categorical variables are simulated as if they were ordinal-encoded, which is required by the architecture.

In [2]:
# Dataset metadata definition.
#
# These constants describe the dataset that is simulated below; replace them
# with the values of your own dataset when adapting the notebook.

# Number of instances (rows) in the dataset.
n_instances = 100

# Number of numerical features in the dataset.
n_numerical = 3

# Number of categories of each categorical column (one entry per column).
n_categorical = [2, 5, 8]

# Number of classification labels.
n_labels = 3

In [3]:
"""
Data simulation.

It is important that the numerical columns be the first columns: the cells
below separate the two groups with X[:, :n_numerical] and X[:, n_numerical:].
"""

# Seed NumPy's global generator so that "Restart & Run All" always simulates the
# same dataset. Code that falls back on NumPy's global state when no
# random_state is given (e.g. scikit-learn's train_test_split) becomes
# repeatable as well. The seed value itself is arbitrary.
SEED = 42
np.random.seed(SEED)

# Numerical features: uniform samples in [0, 1).
X_numerical = np.random.rand(n_instances, n_numerical)

# Categorical features: one ordinal-encoded column per entry of n_categorical,
# with codes drawn from {0, ..., n_cat - 1}.
X_categorical = [
    np.random.randint(0, n_cat, size=(n_instances, 1), dtype=np.int32)
    for n_cat in n_categorical
]

# Concatenate once instead of re-allocating X on every loop iteration.
# NumPy promotes the result to a single float dtype, so the categorical codes
# are stored as floats here and are cast back to integers before being fed
# to the model.
X = np.concatenate([X_numerical, *X_categorical], axis=1)

y = np.random.randint(0, n_labels, size=n_instances, dtype=np.int32)

print("Example of simulated features:", X[0])
print("\nExample of simulated labels:", y[0])

Example of simulated features: [0.37432988 0.1293632  0.81723612 1.         0.         6.        ]

Example of simulated labels: 2


In [4]:
# Data splitting.
#
# 1. Hold out 10% of the simulated instances as the test set.
# 2. Split the row positions of the remaining training data (again 90/10) into
#    training and validation indices; these index arrays are consumed later by
#    the skorch validation split.

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.1,
)

train_indices, val_indices = model_selection.train_test_split(
    np.arange(len(X_train)),
    test_size=0.1,
)

## Training Transformer for tabular data

In [5]:
# Transformer hyperparameters definition.

# --- Encoder -----------------------------------------------------------------
n_heads = 4          # Number of heads on each Transformer Encoder.
embed_dim = 32       # The embeddings' dimension.
n_layers = 2         # Number of stacked Transformer Encoders.
ff_pw_size = 128     # Position-wise Feed Forward network hidden layer size.
attn_dropout = 0.1   # Dropout applied in the Multi-head self-attention mechanism.
ff_dropout = 0.2     # Position-wise Feed Forward network dropout.

# --- Aggregator --------------------------------------------------------------
# Aggregator to use. Must be in {concatenate, cls, max, mean, sum, rnn}.
aggregator = "rnn"

# If the aggregator is different from rnn, set to None. Otherwise, a dictionary
# expecting:
#   cell:        Cell used in the RNN. Must be one of {GRU, LSTM}.
#   output_size: Recurrent neural network hidden size.
#   num_layers:  Number of stacked layers in the RNN.
#   dropout:     Dropout applied to the RNN.
aggregator_parameters = dict(
    cell="GRU",
    output_size=8,
    num_layers=2,
    dropout=0.1,
)

# --- Decoder -----------------------------------------------------------------
decoder_hidden_units = [16, 4, 4]   # Hidden layer sizes of the decoder MLP.
decoder_activation_fn = nn.ReLU()   # Activation function used in the hidden layers.

# --- Behaviour flags ---------------------------------------------------------
# Set True if you require the attention cubes. During training it is
# recommended to set it to False.
need_weights = False

# False if numerical features will be processed by the Multi-head
# self-attention mechanism.
numerical_passthrough = False

In [6]:
"""
Building the PyTorch module.

We provide a wrapper function for building the PyTorch module.
The function is utils.training.build_module.
"""

# Seed PyTorch's global generator before the module is created so that every
# later draw from it is repeatable across "Restart & Run All" runs.
# NOTE(review): presumably this covers the weight initialisation performed by
# build_module and the dropout masks used during training - confirm.
# The seed value itself is arbitrary.
TORCH_SEED = 42
torch.manual_seed(TORCH_SEED)

module = training.build_module(
    n_categorical, # List of number of categories
    n_numerical, # Number of numerical features
    n_heads, # Number of heads per layer
    ff_pw_size, # Size of the MLP inside each transformer encoder layer
    n_layers, # Number of transformer encoder layers
    n_labels, # Number of output neurons
    embed_dim, # The embeddings' dimension
    attn_dropout, # Dropout of the multi-head self-attention mechanism
    ff_dropout, # Dropout of the position-wise feed forward network
    aggregator, # The aggregator for output vectors before decoder
    rnn_aggregator_parameters=aggregator_parameters,
    decoder_hidden_units=decoder_hidden_units,
    decoder_activation_fn=decoder_activation_fn,
    need_weights=need_weights,
    numerical_passthrough=numerical_passthrough
)

# Show the resulting architecture.
print(module)

TabularTransformer(
  (numerical_encoder): NumericalEncoder()
  (transformer_encoder): TTransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TTransformerEncoderLayer(
        (pre_norm_1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (pre_norm_2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (self_attn): MultiheadAttentionContainer(
          (in_proj_container): InProjContainer(
            (query_proj): Linear(in_features=32, out_features=32, bias=True)
            (key_proj): Linear(in_features=32, out_features=32, bias=True)
            (value_proj): Linear(in_features=32, out_features=32, bias=True)
          )
          (attention_layer): ScaledDotProduct()
          (out_proj): Linear(in_features=32, out_features=32, bias=True)
        )
        (ff_network): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.2, inplace=False)
          (3): Linear(in_features=1



In [7]:
# Wrapping the module in skorch.
#
# The PyTorch module can be used with a custom training loop. In this example
# we use the skorch library instead, which avoids having to implement one.

# Train on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Predefined train/validation split built from the index arrays computed in
# the data splitting cell.
predefined_split = skorch.dataset.ValidSplit(((train_indices, val_indices),))

# Per-epoch metrics (higher is better for both) plus the epoch duration.
epoch_callbacks = [
    (metric, skorch.callbacks.EpochScoring(metric, lower_is_better=False))
    for metric in ("balanced_accuracy", "accuracy")
]
epoch_callbacks.append(("duration", skorch.callbacks.EpochTimer()))

model = skorch.NeuralNetClassifier(
    module=module,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.AdamW,
    optimizer__lr=1e-4,
    optimizer__weight_decay=1e-4,
    device=device,
    batch_size=8,
    max_epochs=5,
    train_split=predefined_split,
    callbacks=epoch_callbacks,
)
    

In [8]:
# Training and validation.
#
# The feature matrix is split back into its numerical block (first n_numerical
# columns, cast to float32) and its categorical block (remaining columns, cast
# to int32), and both are passed to the model as a dictionary.
train_inputs = {
    "x_numerical": X_train[:, :n_numerical].astype(np.float32),
    "x_categorical": X_train[:, n_numerical:].astype(np.int32),
}
train_targets = y_train.astype(np.int64)

model = model.fit(X=train_inputs, y=train_targets)

  epoch    accuracy    balanced_accuracy    train_loss    valid_acc    valid_loss     dur
-------  ----------  -------------------  ------------  -----------  ------------  ------
      1      0.2222               0.3333        1.1290       0.2222        1.1684  0.2985
      2      0.2222               0.3333        1.1279       0.2222        1.1682  0.1452
      3      0.2222               0.3333        1.1270       0.2222        1.1682  0.1358
      4      0.2222               0.3333        1.1267       0.2222        1.1683  0.1279
      5      0.2222               0.3333        1.1264       0.2222        1.1684  0.1295


In [9]:
# Testing.
#
# The test features are split into numerical / categorical blocks exactly as
# for training, then class probabilities are predicted and scored.
test_inputs = dict(
    x_numerical=X_test[:, :n_numerical].astype(np.float32),
    x_categorical=X_test[:, n_numerical:].astype(np.int32),
)
predictions = model.predict_proba(X=test_inputs)

print("Test results:\n")
# Last expression of the cell: the returned scores are displayed as its output.
evaluating.get_default_scores(y_test, predictions, multiclass=True)

Test results:



{'balanced_accuracy': 0.3333333333333333,
 'accuracy': 0.4,
 'log_loss': 1.0904202886059424}

## Extracting attention cubes

Once the Transformer has been trained, you can extract the attention cubes. From the attention cubes you can then compute the cumulative attention.

In [10]:
"""
Enabling and extracting the attention cubes.

To enable the recovery of the attention cubes, the only requirement is to
set need_weights=True on the PyTorch module. When the cubes are required the
new output will be:

    - predictions: The predictions for the given instances
    - layer outputs: The output of each encoder layer
    - weights: The attention cube of each encoder

In skorch, the trained PyTorch module is saved in the variable .module_.

When using skorch, the only way to recover multiple outputs is by
using the forward/forward_iter method.
"""

model.module_.need_weights = True
n_features = n_numerical + len(n_categorical)

cumulative_attns = []

# The labels are not needed to extract attention, so only the test features
# are iterated. Each instance is forwarded on its own (batch size 1).
for X_inst in X_test:
    # X_inst[None, ...] adds the leading batch axis expected by the model.
    pred, layer_outputs, attn = model.forward(X={
        "x_numerical": X_inst[None, :n_numerical].astype(np.float32),
        "x_categorical": X_inst[None, n_numerical:].astype(np.int32)
        })

    # The attention cubes dimensions are:
    #
    #   (num. layers, batch size, num. heads, num. features, num. features)
    assert attn.shape == (n_layers, 1, n_heads, n_features, n_features)

    # To compute the cumulative attention we provide a function in:
    #
    #     utils.attention.compute_std_attentions(attention, aggregator)
    #
    # The function returns:
    #     The individual attention (non cumulative) of each layer.
    #         Shape: (num. layers, batch size, num. features)
    #     The cumulative attention at each layer.
    #         Shape: (num. layers, batch size, num. features)
    #
    # The last layer of the cumulative attention represents the cumulative
    # attention over all Transformer Encoders.
    ind_attn, cum_attn = attention.compute_std_attentions(attn, aggregator)

    assert ind_attn.shape == (n_layers, 1, n_features)
    assert cum_attn.shape == (n_layers, 1, n_features)

    # Keep the last layer's cumulative attention of the only instance in the batch.
    cumulative_attns.append(cum_attn[-1, 0])

# Stack the per-instance vectors: one row per test instance, one column per feature.
cumulative_attns = np.array(cumulative_attns)

## Cumulative attention visualization

In [13]:
"""
Cumulative attention heatmap.

Each row represents a test instance, while each column
represents a feature.
"""

# plotly.express (px) is imported in the notebook's import cell.
# Title and axis labels are set so the figure can be read on its own.
fig = px.imshow(
    cumulative_attns,
    labels={"x": "Feature", "y": "Test instance", "color": "Cumulative attention"},
    title="Cumulative attention per test instance and feature",
)
fig.show()