In [None]:
import sys
sys.path.append('/home/diego/Git/thesis-tabtrans')

import numpy as np
import torch
import torch.nn as nn
from utils import training, callback, evaluating, attention, data
from sklearn import datasets, model_selection
import skorch
import pandas as pd
import openml
from skorch.callbacks import Checkpoint, EarlyStopping, LoadInitState, EpochScoring, Checkpoint, TrainEndCheckpoint

from sklearn.preprocessing import LabelEncoder #to create one hot encoding for categorical variables
from sklearn.impute import KNNImputer

In [None]:
# --- Resolve the OpenML task and fetch the dataset it points to ---
task_id = 233092
task = openml.tasks.get_task(task_id)
dataset_id = task.dataset_id

# The project helper returns a dictionary holding the dataset's data and metadata.
df = data.read_dataset_by_id(dataset_id)

# Names of the columns, grouped by feature type.
numerical_features = df['numerical'].tolist()
categorical_features = df['categorical'].tolist()

# Feature table and the codes of the output column
# (presumably a pandas Categorical, given `.codes` -- TODO confirm).
X = df["features"]
y = df["outputs"].codes


# Split the feature table into its categorical and numerical parts.
# Explicit copies keep later column assignments from targeting a slice of `X`.
X_categorical = X[categorical_features].copy()  # Categorical features
X_numerical = X[numerical_features].copy()      # Numerical features

# Fill missing numerical values with KNN imputation (10 neighbours).
if X_numerical.isnull().values.any():
    imputer = KNNImputer(n_neighbors=10)
    numerical_imputed = imputer.fit_transform(X_numerical)
    # fit_transform returns a bare NumPy array. Rebuild the DataFrame with the
    # ORIGINAL row index: without it the frame gets a fresh RangeIndex and the
    # column-wise concat with X_categorical would misalign rows (and introduce
    # NaNs) whenever X does not already use a default index.
    X_numerical = pd.DataFrame(
        numerical_imputed,
        columns=X_numerical.columns,
        index=X_numerical.index,
    )


# Categorical columns with at most one distinct value carry no information: drop them.
distinct_counts = X_categorical.nunique()
redundant_columns = [col for col, count in distinct_counts.items() if count <= 1]
X_categorical = X_categorical.drop(columns=redundant_columns)

# Keep the list of categorical feature names in sync with the filtered frame.
categorical_features = [col for col in categorical_features if col not in redundant_columns]




# Integer-encode every categorical column.
# Work on an explicit copy so the assignments below never write into a view of `X`.
X_categorical = X_categorical.copy()

# One fitted encoder per column is kept, so the code -> category mapping of every
# feature can be recovered later (a single shared encoder only remembers the
# classes of the last column it was fitted on).
label_encoders = {}
le = LabelEncoder()  # stays defined even when there are no categorical columns
for col in X_categorical.columns:
    le = LabelEncoder()
    # Values are cast to str before fitting, so every entry is encoded as a label.
    X_categorical[col] = le.fit_transform(X_categorical[col].astype(str))
    label_encoders[col] = le


# Numerical columns first, categorical columns after them.
X_ordered = pd.concat([X_numerical, X_categorical], axis=1)

n_instances = len(X_ordered)
n_numerical = len(X_numerical.columns)

# Number of distinct categories of each categorical feature, in column order.
# (The dataset metadata also exposes such counts under df["n_categorical"].)
n_categories = X_categorical.nunique().tolist()

# Number of labels.
n_labels = len(df["labels"].keys())

seed = 11

# Hold out 20% of the data as the test set, stratified by label.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_ordered, y, test_size=0.20, random_state=seed, stratify=y
)

X_train = X_train.values.astype(np.float32)
X_test = X_test.values.astype(np.float32)

# Split the training rows (by position) into train / validation indices.
# The validation part is 1/3 of the training portion, i.e. about 26.7% of the
# whole dataset (use test_size=0.25 here for exactly 20% of the total).
# random_state is fixed so the split is reproducible across runs.
train_indices, val_indices = model_selection.train_test_split(
    np.arange(X_train.shape[0]),
    test_size=1/3,
    random_state=seed,
    stratify=y_train,
)


In [None]:
# Sanity check: is any missing value left in the categorical features?
X_categorical.isna().to_numpy().any()

In [None]:
# ----- Transformer encoder -----
n_layers = 4
n_heads = 4          # on average 4 works better
embed_dim = 128      # on average 256 works better
ff_pw_size = 30      # value taken from the paper
attn_dropout = 0.3   # value taken from the paper
ff_dropout = 0.1     # value taken from the paper

# ----- Aggregation and decoder -----
aggregator = "cls"
aggregator_parameters = None
decoder_hidden_units = [128, 64]  # value taken from the paper
decoder_activation_fn = nn.ReLU()
numerical_passthrough = False
need_weights = False

# ----- Training loop -----
batch_size = 64
epochs = 5


In [None]:
"""
Build the PyTorch module.

The project ships a wrapper for this: utils.training.build_module.
The first ten arguments are passed positionally, the rest by keyword.
"""
module = training.build_module(
    n_categories,   # list with the number of categories of each categorical feature
    n_numerical,    # number of numerical features
    n_heads,        # number of heads per layer
    ff_pw_size,     # size of the MLP inside each transformer encoder layer
    n_layers,       # number of transformer encoder layers
    n_labels,       # number of output neurons
    embed_dim,
    attn_dropout,
    ff_dropout,
    aggregator,     # aggregator applied to the output vectors before the decoder
    numerical_passthrough=numerical_passthrough,
    need_weights=need_weights,
    decoder_activation_fn=decoder_activation_fn,
    decoder_hidden_units=decoder_hidden_units,
    rnn_aggregator_parameters=aggregator_parameters,
)




In [None]:
# skorch classifier wrapping the module: cross-entropy loss, AdamW optimizer.
model = skorch.NeuralNetClassifier(
    module=module,
    criterion=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.AdamW,
    optimizer__lr=1e-4,
    optimizer__weight_decay=1e-4,
    max_epochs=epochs,
    batch_size=batch_size,
    device="cpu",  # if torch.cuda.is_available() else "cpu",
    # Validation uses the train/validation index pair computed earlier.
    train_split=skorch.dataset.ValidSplit(((train_indices, val_indices),)),
    callbacks=[
        # Balanced accuracy, scored every epoch (higher is better).
        ("balanced_accuracy", EpochScoring("balanced_accuracy", lower_is_better=False)),
        # Wall-clock duration of every epoch.
        ("duration", skorch.callbacks.EpochTimer()),
        # Accuracy on the training data, reported as `train_acc`.
        # (A Checkpoint callback monitoring 'valid_acc_best' used to be listed here.)
        EpochScoring(scoring='accuracy', name='train_acc', on_train=True),
        EarlyStopping(patience=15),
    ],
)

In [None]:
# Training and validation.
# X_train holds the numerical columns first and the integer-encoded categorical
# columns after them, so it is sliced at n_numerical. The dictionary keys are
# presumably the keyword arguments of the module's forward() -- TODO confirm.
model = model.fit(
    X={
        "x_numerical": X_train[:, :n_numerical].astype(np.float32),
        "x_categorical": X_train[:, n_numerical:].astype(np.int32),
    },
    y=y_train.astype(np.int64),
)