In [1]:
import sys
sys.path.append('/home/diego/Git/thesis-tabtrans')

import numpy as np
import torch
import torch.nn as nn
from utils import training, callback, evaluating, attention, data
from sklearn import datasets, model_selection, pipeline, metrics
import skorch
import pandas as pd
import openml
from skorch.callbacks import Checkpoint, EarlyStopping, LoadInitState, EpochScoring, Checkpoint, TrainEndCheckpoint

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler #to create one hot encoding for categorical variables
from sklearn.impute import KNNImputer

In [2]:
#task_id = 233090 #anneal
#task_id = 233093 #mfeat
#task_id = 233092 #arrhythmia
task_id = 233108 #cnae-9

task = openml.tasks.get_task(task_id)  
dataset_id = task.dataset_id
df = data.read_dataset_by_id(dataset_id) #this function returns a dictionary with the dataset's data and metadata

X = df["features"] #features
y = df["outputs"].codes #outputs

categorical_features = df['categorical'].tolist() #name of the categorical features
numerical_features = df['numerical'].tolist() #name of the numerical features

# Split the data into training and testing sets
seed = 11

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.20, random_state= seed, stratify=y)
train_indices, val_indices = model_selection.train_test_split(np.arange(X_train.shape[0]), test_size=1/3, stratify=y_train) #1/3 of train is equal to 20% of total


X_categorical = X_train[categorical_features]  # Categorical features
X_numerical = X_train[numerical_features]     # Numerical features


# Always processing using the imputer. If there were not nan, nothing will happen
imputer = pipeline.Pipeline([('imputer', KNNImputer(n_neighbors=10)), ('scaler', StandardScaler())])
imputer = imputer.fit(X_numerical.iloc[train_indices])
numerical_imputed = imputer.transform(X_numerical)
X_numerical = pd.DataFrame(numerical_imputed, columns=X_numerical.columns) # Convert NumPy array back to Pandas DataFrame


# Use ordinal encoder, not label encoder
# The nan values and non-existing categories are mapped to -1
le = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, encoded_missing_value=-1)
le = le.fit(X_categorical.iloc[train_indices])
# Adding a 1 ensures that -1->0, 0->1, 1->2 indexing correctly the architecture's embeddings table
categorical_imputed = le.transform(X_categorical) + 1
X_categorical = pd.DataFrame(categorical_imputed, columns=X_categorical.columns)
    
    
X_ordered = pd.concat([X_numerical, X_categorical], axis=1)
X_train = X_ordered.values

n_instances = X_ordered.shape[0]
n_numerical = X_numerical.shape[1]
n_categories = [X_categorical[col].nunique() for col in X_categorical.columns] #list that tells the number of categories for each categorical feature
n_labels = len(df["labels"].keys()) #number of labels


INFO:openml.datasets.dataset:pickle write cnae-9


In [3]:
# Using these hyperparameters take ~65 epochs to reach the 99% of balancede accuracy
n_layers = 4
n_heads = 4
embed_dim = 128 #The embedding size is set one by one to avoid the out of memory error
batch_size = 32 # 32, 64, 128, 256, 512, 1024
epochs = 150

# Using these hyperparameters take ~65 epochs to reach the 99% of balancede accuracy
#n_layers = 2
#n_heads = 4
#embed_dim = 128 #The embedding size is set one by one to avoid the out of memory error
#batch_size = 32 # I recommend to use this batch size
#epochs = 100

#parameters for the model
ff_pw_size = 30  #this value because of the paper 
attn_dropout = 0.3 #paper
ff_dropout = 0.1 #paper value
aggregator = "cls"
aggregator_parameters = None
decoder_hidden_units = [128,64] #paper value [128,64]
decoder_activation_fn = nn.ReLU()
need_weights = False
numerical_passthrough = False

In [4]:
"""
Building PyTorch module.

We provide a wrapper function for building the PyTorch module.
The function is utils.training.build_module.
"""
module = training.build_module(
    n_categories, # List of number of categories
    n_numerical, # Number of numerical features
    n_heads, # Number of heads per layer
    ff_pw_size, # Size of the MLP inside each transformer encoder layer
    n_layers, # Number of transformer encoder layers    
    n_labels, # Number of output neurons
    embed_dim,
    attn_dropout, 
    ff_dropout, 
    aggregator, # The aggregator for output vectors before decoder
    rnn_aggregator_parameters=aggregator_parameters,
    decoder_hidden_units=decoder_hidden_units,
    decoder_activation_fn=decoder_activation_fn,
    need_weights=need_weights,
    numerical_passthrough=numerical_passthrough
)






In [5]:
model = skorch.NeuralNetClassifier(
    module=module,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.AdamW,
    device = "cuda" if torch.cuda.is_available() else "cpu",
    batch_size = batch_size,
    max_epochs = epochs,
    train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
    callbacks=[
        ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
        ("duration", skorch.callbacks.EpochTimer()),
        EpochScoring(scoring='accuracy', name='train_acc', on_train=True), #        Checkpoint(monitor='valid_acc_best', dirname=path_of_checkpoint, load_best = True), 
        EarlyStopping(patience=15)

    ],
    optimizer__lr=1e-4,
    optimizer__weight_decay=1e-4
)

In [6]:
model = model.fit(X={
        "x_numerical": X_train[:, :n_numerical].astype(np.float32),
        "x_categorical": X_train[:, n_numerical:].astype(np.int32)
        }, 
        y=y_train.astype(np.int64)
    )

  epoch    balanced_accuracy    train_acc    train_loss    valid_acc    valid_loss     dur
-------  -------------------  -----------  ------------  -----------  ------------  ------
      1               [36m0.1111[0m       [32m0.1128[0m        [35m2.2073[0m       [31m0.1111[0m        [94m2.2004[0m  7.0588
      2               0.1111       [32m0.1024[0m        [35m2.1996[0m       0.1111        [94m2.1979[0m  6.9855
      3               0.1111       0.1215        [35m2.1978[0m       0.1111        [94m2.1975[0m  6.9846
      4               0.1111       [32m0.0990[0m        2.1990       0.1111        [94m2.1973[0m  6.9854
      5               0.1111       0.1354        [35m2.1978[0m       0.1111        [94m2.1972[0m  6.9854
      6               0.1111       [32m0.0920[0m        2.1984       0.1111        [94m2.1971[0m  6.9850
      7               0.1111       0.1076        [35m2.1973[0m       0.1111        [94m2.1970[0m  6.9849
      8            

In [7]:
preds = model.predict({
        "x_numerical": X_train[:, :n_numerical].astype(np.float32),
        "x_categorical": X_train[:, n_numerical:].astype(np.int32)
        })

In [8]:
metrics.balanced_accuracy_score(y_train[val_indices].astype(np.int64), preds[val_indices])

0.8576388888888888