# FT Replicate 1

In [None]:
import sys
sys.path.append('/home/diego/Git/thesis-tabtrans')

import numpy as np
import torch
import torch.nn as nn
from utils import training, callback, evaluating, attention, data
from sklearn import datasets, model_selection
import skorch
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [None]:
#####################################################
# Configuration
#####################################################

# Run labels. In the cells visible here `dataset` is only used for the log
# line below; the data itself is fetched by numeric id in the loading cell.
dataset = "christine"
# Name of the aggregator this run is announced with.
aggregator = "cls"

# Log the configuration so the notebook output records what was run.
print(f"Using -- Dataset:{dataset} Aggregator:{aggregator}")

In [None]:
# OpenML dataset id handed to the loader; 41142 is kept as an alternative id.
# NOTE(review): the previous comments called this the Iris dataset while the
# configuration cell names "christine" - confirm which id/dataset is intended.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = 554 #41142

# Fetch the dataset through the project helper.
df = data.read_dataset_by_id(id) # per the original note: returns a dictionary with the dataset's data and metadata

In [None]:
# Show which entries the loaded dataset container exposes.
df.keys()

In [None]:
# Inspect the "categories" entry of the loaded dataset container.
df["categories"]

In [None]:
# Report how many numerical / categorical columns the metadata declares,
# then list the contents of both entries.
print(f"In total there are {len(df['numerical'])} numerical columns and {len(df['categorical'])} categorical columns \n")
print(f"The categorical columns are: {df['categorical']}")
print(f"The numerical columns are: {df['numerical']}")

In [None]:
# Confirm the container type returned by the loader.
print(type(df))

In [None]:
# Pull the feature table out of the dataset container.
df_pandas = df["features"]
# Bare expression that is not the last statement of the cell, so it has no effect.
df_pandas
# Presumably a pandas DataFrame (it is indexed by column lists later on) - the
# print below shows the actual type.
print(type(df_pandas))

In [None]:
# Column order convention for the model input: numerical block first,
# categorical block second.

# Column names of each group, converted to plain Python lists.
categorical_columns = df['categorical'].tolist()
numerical_columns = df['numerical'].tolist()

# Column subsets of the feature table, one per group. These two names are
# what the following cells consume.
numerical_features = df_pandas[numerical_columns]
categorical_features = df_pandas[categorical_columns]

# Sanity check: show the container types handled from here on
# (categorical subset, numerical subset, and the loader's "target" entry).
print(type(categorical_features))
print(type(numerical_features))
print(type(df["target"]))

In [None]:
# Assemble the model input table side by side: numerical columns on the left,
# categorical columns on the right.
df_ordered = pd.concat(
    (numerical_features, categorical_features),
    axis="columns",
)

In [None]:
# Integer-encode every categorical column of `df_ordered` in place.
# pd.factorize numbers the distinct values in order of first appearance; with
# its default settings a missing value receives the sentinel code -1.
for column_name in categorical_features.columns:
    codes, _ = pd.factorize(df_ordered[column_name])
    df_ordered[column_name] = codes

In [None]:
# Feature matrix as an array, taken from the ordered table built above.
X = df_ordered.values
# Class codes of the target. NOTE(review): presumably df["outputs"] is a
# pandas Categorical (it exposes .codes) - confirm against the loader.
y = df["outputs"].codes

# Show the resulting container types.
print(type(X))
print(type(y))  

In [None]:
"""
Dataset metadata definition.

    n_instances: Number of instances (rows) in your dataset.
    n_numerical: Number of numerical features in your dataset.
    n_categorical: List of the number of categories for each categorical column.
    n_labels: Number of classification labels.
    
"""

# Row count of the feature matrix.
n_instances = len(X)
# Read straight from the loader's metadata. The training cell also uses
# n_numerical as the column index separating numerical from categorical features.
n_numerical = df["n_numerical"]
n_categorical = df["n_categorical"]
# One label per key of the loader's "labels" entry.
n_labels = len(df["labels"].keys())

In [None]:
#####################################################
# Train / validation / test partitioning
#####################################################

from sklearn import datasets, model_selection

# Hold out 10% of all rows as the test set.
# NOTE(review): no random_state is passed, so the partition differs between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.1,
)

# Pick the validation rows by position inside X_train: 1/9 of the 90% left
# after the test hold-out equals 10% of the full dataset.
train_row_positions = np.arange(X_train.shape[0])
train_indices, val_indices = model_selection.train_test_split(
    train_row_positions,
    test_size=1 / 9,
)

In [None]:
"""
Transformer hyperparameters definition.

n_heads: Number of heads on each Transformer Encoder.
embed_dim: The embeddings' dimension.
n_layers: Number of stacked Transformer Encoders.
ff_pw_size: Position-wise Feed Forward network hidden layer size.
attn_dropout: Dropout applied in the Multi-head self-attention mechanism.
ff_dropout: Position-wise Feed Forward network dropout.
aggregator: Aggregator to use. Must be in {concatenate, cls, max, mean, sum, rnn}.
            It is set once in the configuration cell at the top of the notebook
            and is deliberately NOT reassigned here, so the value announced by
            that cell is the value the model is built with.
aggregator_parameters: If the aggregator is different from rnn, set to None. Otherwise, a dictionary expecting:
                        cell: Cell used in the RNN. Must be one of {GRU, LSTM}
                        output_size: Recurrent neural network hidden size
                        num_layers: Number of stacked layers in the RNN
                        dropout: Dropout applied to the RNN
decoder_hidden_units: List of hidden layer's sizes of the decoder MLP.
decoder_activation_fn: Activation function used in the hidden layers
need_weights: Set True if you require the attention cubes. During training is recommended to set it to False.
numerical_passthrough: False if numerical features will be processed by the Multi-head self-attention mechanism.
"""

n_heads = 4  # On average 4 works better
# NOTE(review): the earlier note said 256 works better on average, yet this run
# uses 4 - confirm the small value is intentional.
embed_dim = 4
n_layers = 3
ff_pw_size = 30  # Value taken from the paper
attn_dropout = 0.3  # Value taken from the paper
ff_dropout = 0.1  # Value taken from the paper
aggregator_parameters = None  # Only needed when the aggregator is rnn
decoder_hidden_units = [128, 64]  # Value taken from the paper
decoder_activation_fn = nn.ReLU()
need_weights = False  # Attention cubes are not needed while training
numerical_passthrough = False  # Numerical features go through self-attention too


In [None]:
"""
Building PyTorch module.

We provide a wrapper function for building the PyTorch module.
The function is utils.training.build_module.
"""
# Everything up to `aggregator` is passed positionally, so the order below has
# to match the build_module signature; the remaining options go by keyword.
module = training.build_module(
    n_categorical, # List of number of categories
    n_numerical, # Number of numerical features
    n_heads, # Number of heads per layer
    ff_pw_size, # Size of the MLP inside each transformer encoder layer
    n_layers, # Number of transformer encoder layers
    n_labels, # Number of output neurons
    embed_dim, # Embeddings' dimension
    attn_dropout, # Dropout of the multi-head self-attention
    ff_dropout, # Dropout of the position-wise feed forward network
    aggregator, # The aggregator for output vectors before decoder
    rnn_aggregator_parameters=aggregator_parameters, # None unless the aggregator is rnn
    decoder_hidden_units=decoder_hidden_units, # Hidden layer sizes of the decoder MLP
    decoder_activation_fn=decoder_activation_fn, # Activation used in the decoder hidden layers
    need_weights=need_weights, # Whether attention cubes are requested
    numerical_passthrough=numerical_passthrough # False: numerical features go through self-attention
)

# Print the assembled module for inspection.
print(module)



In [None]:
"""
Wrapping module in skorch.

The PyTorch module can be used for a custom training.

However, in this example we use the skorch library,
which avoids the implementation of a custom training loop.
"""

model = skorch.NeuralNetClassifier(
    module=module,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.AdamW,
    # Use the GPU when one is usable and fall back to the CPU otherwise, so the
    # notebook also runs on machines without CUDA instead of failing.
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_size=32,
    max_epochs=100,
    # Hand skorch the precomputed (train, validation) index pair rather than
    # letting it draw its own validation split.
    train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
    callbacks=[
        # Per-epoch scores; higher is better for both metrics.
        ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
        ("accuracy", skorch.callbacks.EpochScoring("accuracy", lower_is_better=False)),
        # Per-epoch duration.
        ("duration", skorch.callbacks.EpochTimer()),
    ],
    # Double-underscore arguments are routed to the AdamW optimizer.
    optimizer__lr=1e-4,
    optimizer__weight_decay=1e-4,
)
    

In [None]:
"""
Training and validation
"""

# Split the training matrix at column `n_numerical`: numerical features on the
# left cast to float32, integer-coded categorical features on the right cast
# to int32. The dictionary keys are the input names handed to the network.
train_inputs = {
    "x_numerical": X_train[:, :n_numerical].astype(np.float32),
    "x_categorical": X_train[:, n_numerical:].astype(np.int32),
}
# Class targets as 64-bit integers.
train_targets = y_train.astype(np.int64)

# Rebind `model` to whatever fit() returns, as the original cell did.
model = model.fit(X=train_inputs, y=train_targets)