In [1]:
import sys
sys.path.append('/home/diego/Git/thesis-tabtrans')

import numpy as np
import torch
import torch.nn as nn
from utils import training, callback, evaluating, attention, data
from sklearn import datasets, model_selection
import skorch
import pandas as pd
import openml
from skorch.callbacks import Checkpoint, EarlyStopping, LoadInitState, EpochScoring, Checkpoint, TrainEndCheckpoint

from sklearn.preprocessing import LabelEncoder,StandardScaler #to create one hot encoding for categorical variables
from sklearn.impute import KNNImputer


In [2]:
# Load the data and define every hyper-parameter used by the later cells.

# One seed for both data splits and for torch, so a fresh
# "Restart & Run All" reproduces the same partitions and the same run.
SEED = 11
torch.manual_seed(SEED)  # makes torch-side randomness in later cells repeatable

task_id = 233093
task = openml.tasks.get_task(task_id)
dataset_id = task.dataset_id  # the dataset is looked up from the task id
df = data.read_dataset_by_id(dataset_id)

X = df["features"].values.astype(np.float32)  # features
y = df["outputs"].codes  # outputs

categorical_features = df['categorical'].tolist()  # names of the categorical features
numerical_features = df['numerical'].tolist()  # names of the numerical features

# Hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.20, random_state=SEED, stratify=y
)
# Positions (into X_train) of the training and validation rows.
# NOTE(review): 1/3 of the 80% training portion is ~26.7% of the total, not
# 20%; use test_size=0.25 if a 60/20/20 split is what is wanted.
train_indices, val_indices = model_selection.train_test_split(
    np.arange(X_train.shape[0]),
    test_size=1/3,
    random_state=SEED,  # previously unseeded: the validation split changed on every run
    stratify=y_train,
)

# Optional normalization (disabled).
# NOTE(review): as written this reorders the rows of X_train (train rows first,
# then validation rows) without reordering y_train or updating
# train_indices / val_indices, so fix that before enabling it.
#
# X_train, X_val = X_train[train_indices], X_train[val_indices]
#
# scaler = StandardScaler()
#
# X_train = scaler.fit_transform(X_train)  # scaler returns a numpy array
# X_val = scaler.transform(X_val)          # scaler returns a numpy array
# X_test = scaler.transform(X_test)        # scaler returns a numpy array
#
# X_train = np.vstack((X_train, X_val))

n_instances = X_train.shape[0]
n_numerical = X_train.shape[1]  # every column is counted as numerical here
n_categories = []  # number of categories per categorical feature (none are declared)
n_labels = len(df["labels"].keys())  # number of labels

n_layers = 4
n_heads = 4
embed_dim = 128  # the embedding size is set one by one to avoid the out of memory error
batch_size = 64  # candidate values: 32, 64, 128, 256, 512, 1024
epochs = 100

# Parameters for the model.
ff_pw_size = 30  # value taken from the paper
attn_dropout = 0.3  # value taken from the paper
ff_dropout = 0.1  # value taken from the paper
aggregator = "cls"
aggregator_parameters = None
decoder_hidden_units = [128, 64]  # value taken from the paper
decoder_activation_fn = nn.ReLU()
need_weights = False
numerical_passthrough = False

INFO:openml.datasets.dataset:pickle write mfeat-factors


In [3]:
"""
Building PyTorch module.

We provide a wrapper function for building the PyTorch module.
The function is utils.training.build_module.
"""
module = training.build_module(
    # Inputs: category counts per categorical feature, then the numerical feature count.
    n_categories, n_numerical,
    # Encoder: heads per layer, size of the MLP inside each layer, number of layers.
    n_heads, ff_pw_size, n_layers,
    # Number of output neurons.
    n_labels,
    embed_dim, attn_dropout, ff_dropout,
    # How the output vectors are aggregated before the decoder.
    aggregator,
    rnn_aggregator_parameters=aggregator_parameters,
    decoder_hidden_units=decoder_hidden_units,
    decoder_activation_fn=decoder_activation_fn,
    need_weights=need_weights,
    numerical_passthrough=numerical_passthrough,
)






In [4]:
# Wrap the module in a skorch classifier: AdamW optimizer, cross-entropy
# criterion, and per-epoch scoring callbacks.
model = skorch.NeuralNetClassifier(
    module=module,
    criterion=nn.CrossEntropyLoss,
    optimizer=torch.optim.AdamW,
    optimizer__lr=1e-4,
    optimizer__weight_decay=1e-4,
    batch_size=batch_size,
    max_epochs=epochs,
    device="cpu" if not torch.cuda.is_available() else "cuda",
    # Hand skorch the precomputed (train_indices, val_indices) pair.
    train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
    callbacks=[
        ("balanced_accuracy", EpochScoring("balanced_accuracy", lower_is_better=False)),
        ("accuracy", EpochScoring("accuracy", lower_is_better=False)),
        ("duration", skorch.callbacks.EpochTimer()),
        # Accuracy measured on the training data, reported as `train_acc`.
        EpochScoring(scoring="accuracy", name="train_acc", on_train=True),
    ],
)

In [5]:
# Training and validation: fit on X_train / y_train. The train/validation
# partition is the one passed to the model as `train_split`.
# The columns are sliced at n_numerical: the left part is fed as float32
# numerical input, the remainder as int32 categorical input.
model = model.fit(
    X={
        "x_numerical": X_train[:, :n_numerical].astype(np.float32),
        "x_categorical": X_train[:, n_numerical:].astype(np.int32),
    },
    y=y_train.astype(np.int64),
)

  epoch    accuracy    balanced_accuracy    train_acc    train_loss    valid_acc    valid_loss     dur
-------  ----------  -------------------  -----------  ------------  -----------  ------------  ------
      1      [36m0.1011[0m               [32m0.1000[0m       [35m0.0901[0m        [31m2.3163[0m       [94m0.1011[0m        [36m2.3067[0m  1.5911
      2      0.1011               0.1000       [35m0.0891[0m        [31m2.3056[0m       0.1011        [36m2.3029[0m  1.5038
      3      0.0993               0.1000       0.0910        [31m2.3028[0m       0.0993        [36m2.3014[0m  1.5049
      4      [36m0.1030[0m               [32m0.1019[0m       0.1060        [31m2.3018[0m       [94m0.1030[0m        [36m2.3006[0m  1.5049
      5      0.1011               0.1000       0.1004        [31m2.3012[0m       0.1011        [36m2.2998[0m  1.5055
      6      [36m0.1292[0m               [32m0.1278[0m       0.1013        [31m2.3011[0m       [94m0.1292[0m 

'\nTraining and validation\n\n\nmodel = model.fit(X={\n        "x_numerical": X_train[:, :n_numerical].astype(np.float32),\n        "x_categorical": X_train[:, n_numerical:].astype(np.int32)\n        }, \n        y=y_train.astype(np.int64)\n    )\n    \n'

In [6]:
# Testing: class probabilities for the held-out test rows, using the same
# numerical / categorical column slicing as for training.
predictions = model.predict_proba(
    X=dict(
        x_numerical=X_test[:, :n_numerical].astype(np.float32),
        x_categorical=X_test[:, n_numerical:].astype(np.int32),
    )
)

In [7]:
# Report the default evaluation scores on the test set.
print("Test results:\n")
print(
    evaluating.get_default_scores(
        y_test.astype(np.int64),
        predictions,
        multiclass=True,
    )
)

Test results:

{'balanced_accuracy': 0.85, 'accuracy': 0.85, 'log_loss': 0.43292165051158443}
