In [1]:
import torch
import torch.nn as nn
import torch.functional as F

import skorch
from skorch import dataset
import numpy as np


from ndsl.architecture.attention import TabularTransformer

from sklearn import base, pipeline, preprocessing, compose, metrics, model_selection

import pandas as pd

In [2]:
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
numerical_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
label_col = "class"

In [3]:
data = pd.read_csv("adult/data/dataset.csv")
data[label_col] = data[label_col].replace({"<=50K": 0, ">50K": 1})

In [4]:
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

In [5]:
train_size=0.65
val_size=0.15
test_size=0.20
seed=11


train_features, test_features, train_labels, test_labels = model_selection.train_test_split(
    data[categorical_cols + numerical_cols], 
    data[label_col], 
    test_size=test_size,
    random_state=seed
)

val_size = data.shape[0] * val_size / train_features.shape[0]

train_features, val_features, train_labels, val_labels = model_selection.train_test_split(
    train_features, 
    train_labels, 
    test_size=val_size, 
    random_state=seed
)    

In [6]:
n_bins = 10
n_quantiles = 10

categorical_transformer = pipeline.Pipeline(steps=[
    ('label', preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ('shift', preprocessing.FunctionTransformer(lambda x: x + 1))
])

numerical_transformer = pipeline.FeatureUnion([
#    ('qtscaler', preprocessing.QuantileTransformer(n_quantiles=n_quantiles)),
    ('sscaler', preprocessing.StandardScaler()),
#    ('logscaler', preprocessing.FunctionTransformer(np.log1p)),
])

numerical_categorical_transformer = pipeline.Pipeline(steps=[
    ('dscaler', preprocessing.KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="uniform")), 
])


preprocessor = pipeline.Pipeline([
    ('columns_transformer', compose.ColumnTransformer(
        remainder='passthrough', #passthough features not listed
        transformers=[
            ('categorical_transformer', categorical_transformer , categorical_cols),
            #('numerical_categorical_transformer', numerical_categorical_transformer , numerical_cols),
            ('numerical_transformer', numerical_transformer , numerical_cols)
        ]),
    )
])

In [7]:
total_examples = train_features.shape[0] + val_features.shape[0] + test_features.shape[0]

print("Training examples {} ({})".format(train_features.shape[0], train_features.shape[0] / total_examples))
print("Validation examples {} ({})".format(val_features.shape[0], val_features.shape[0] / total_examples))
print("Test examples {} ({})".format(test_features.shape[0], test_features.shape[0] / total_examples))

Training examples 21163 (0.6499493258806548)
Validation examples 4885 (0.15002610484935966)
Test examples 6513 (0.20002456926998557)


In [8]:
preprocessor = preprocessor.fit(train_features, train_labels)

train_features = preprocessor.transform(train_features)
val_features = preprocessor.transform(val_features)
test_features = preprocessor.transform(test_features)

all_features = np.concatenate([train_features, val_features])
all_labels = np.concatenate([train_labels, val_labels])

n_labels = 1
criterion = torch.nn.BCEWithLogitsLoss

In [9]:
def build_model(*args, **kwargs):
    
    module = TabularTransformer(
        n_categories=(9, 17, 8, 15, 7, 6, 3, 42), # List of number of categories
        n_numerical=6, # Number of numerical features
        n_head=8, # Number of heads per layer
        n_hid=128, # Size of the MLP inside each transformer encoder layer
        n_layers=6, # Number of transformer encoder layers    
        n_output=1, # The number of output neurons
        embed_dim=32,
        aggregator="rnn", # The aggregator for output vectors before decoder
        rnn_aggregator_parameters={
            "output_size": 128,
            "cell": "GRU",
            "num_layers": 1,
            "dropout": 0
        },
        decoder_hidden_units=[128, 64],
        decoder_activation_fn=nn.ReLU(),
        need_weights=False,
        numerical_passthrough=True
    )

    model = skorch.NeuralNetClassifier(
            module=module,
            criterion=criterion,
            optimizer=torch.optim.AdamW,
            device="cuda" if torch.cuda.is_available() else "cpu",
            batch_size=128,
            max_epochs=12,
            train_split=dataset.CVSplit(cv=0.15),
            callbacks=[
                ("balanced_accuracy", skorch.callbacks.EpochScoring("balanced_accuracy", lower_is_better=False)),
                ("accuracy", skorch.callbacks.EpochScoring("accuracy", lower_is_better=False)),
                ("roc_auc", skorch.callbacks.EpochScoring("roc_auc", lower_is_better=False)),
                ("f1", skorch.callbacks.EpochScoring("f1", lower_is_better=False)),
                ("precision", skorch.callbacks.EpochScoring("precision", lower_is_better=False)),
                ("recall", skorch.callbacks.EpochScoring("recall", lower_is_better=False))
            ],
            **kwargs
        )

    return model

In [10]:
all_features[:3]

array([[ 3.00000000e+00,  1.60000000e+01,  4.00000000e+00,
         2.00000000e+00,  4.00000000e+00,  5.00000000e+00,
         1.00000000e+00,  1.40000000e+01, -1.15046758e+00,
         2.89216869e+00, -3.80074141e-02, -1.46859301e-01,
        -2.15354774e-01, -3.60679944e-02],
       [ 5.00000000e+00,  1.60000000e+01,  1.00000000e+00,
         2.00000000e+00,  5.00000000e+00,  5.00000000e+00,
         1.00000000e+00,  4.00000000e+01,  4.67736094e-01,
         1.41924226e+00, -3.80074141e-02, -1.46859301e-01,
        -2.15354774e-01, -3.60679944e-02],
       [ 6.00000000e+00,  1.20000000e+01,  7.00000000e+00,
         9.00000000e+00,  5.00000000e+00,  5.00000000e+00,
         1.00000000e+00,  4.00000000e+01,  2.15949448e+00,
        -5.07547252e-01, -4.26395104e-01, -1.46859301e-01,
        -2.15354774e-01, -2.07068513e+00]])

In [11]:
all_features.shape, all_labels.shape

((26048, 14), (26048,))

In [12]:
params = {
    "optimizer__lr": [10e-6, 10e-5, 10e-4, 10e-3],    
    "optimizer__weight_decay": [10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1],     
    "module__attn_dropout": [0, 0.1, 0.2, 0.3, 0.4, 0.5], # Used dropout
    "module__ff_dropout": [0, 0.1, 0.2, 0.3, 0.4, 0.5], # Used dropout  
}

for sel_params in model_selection.ParameterSampler(params, n_iter=10):
    print("Trying: ", sel_params)
    build_model(sel_params).fit(X={
        "x_categorical": all_features[:, :8].astype(np.int32), 
        "x_numerical": all_features[:, 8:].astype(np.float32)
        }, 
        y=all_labels.astype(np.double)
    )

Trying:  {'optimizer__weight_decay': 0.001, 'optimizer__lr': 0.001, 'module__ff_dropout': 0.5, 'module__attn_dropout': 0.3}
  epoch    accuracy    balanced_accuracy      f1    precision    recall    roc_auc    train_loss    valid_acc    valid_loss     dur
-------  ----------  -------------------  ------  -----------  --------  ---------  ------------  -----------  ------------  ------
      1      [36m0.8025[0m               [32m0.6366[0m  [35m0.4357[0m       [31m0.7286[0m    [94m0.3107[0m     [36m0.8422[0m        [32m0.4737[0m       [35m0.8025[0m        [31m0.4131[0m  5.1892
      2      [36m0.8211[0m               [32m0.7274[0m  [35m0.5985[0m       0.6662    [94m0.5433[0m     [36m0.8503[0m        [32m0.4131[0m       [35m0.8211[0m        [31m0.3965[0m  5.3143
      3      [36m0.8255[0m               [32m0.7355[0m  [35m0.6112[0m       0.6742    [94m0.5589[0m     [36m0.8520[0m        [32m0.4107[0m       [35m0.8255[0m        [31m0.3939[0m

In [None]:
preds = model.predict_proba({
        "x_categ": test_features[:, :8].astype(np.int32), 
        "x_cont": test_features[:, 8:].astype(np.float32)
        })

metrics.roc_auc_score(test_labels, preds[:, 1])
