In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

# Download census-income dataset

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset_name = 'census-income'
out = Path(os.getcwd()+'/data/'+dataset_name+'.csv')

In [3]:
out.parent.mkdir(parents=True, exist_ok=True)
if out.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    wget.download(url, out.as_posix())

File already exists.


# Load data and split

In [4]:
train = pd.read_csv(out)
target = ' <=50K'
if "Set" not in train.columns:
    train["Set"] = np.random.choice(["train", "valid", "test"], p =[.8, .1, .1], size=(train.shape[0],))

train_indices = train[train.Set=="train"].index
valid_indices = train[train.Set=="valid"].index
test_indices = train[train.Set=="test"].index

# Simple preprocessing

Label encode categorical features and fill empty cells.

In [5]:
categorical_columns = []
categorical_dims =  {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

for col in train.columns[train.dtypes == 'float64']:
    train.fillna(train.loc[train_indices, col].mean(), inplace=True)

 State-gov 9
 Bachelors 16
 Never-married 7
 Adm-clerical 15
 Not-in-family 6
 White 5
 Male 2
 United-States 42
 <=50K 2
Set 3


# Define categorical features for categorical embeddings

In [6]:
unused_feat = ['Set']

features = [ col for col in train.columns if col not in unused_feat+[target]] 

cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

# define your embedding sizes : here just a random choice
cat_emb_dim = [5, 4, 3, 6, 2, 2, 1, 10]

# Hyperparameter optimization

In this section, we will build a wrapper around the TabNetClassifier which supports early stopping.

In [7]:
X = train[features].values
y = train[target].values

In [8]:
num_workers = os.cpu_count() if torch.cuda.is_available() else 0

In [9]:
class TabNetTuner(TabNetClassifier):
    def fit(self, X, y, *args, **kwargs):
        # Dirty trick => would be better to add n_d in grid, or fix it in __init__ of tuner
        self.n_d = self.n_a
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y
        )
        
        return super().fit(
            X_train,
            y_train,
            patience=20,
            eval_set=[(X_valid, y_valid)],
            num_workers=num_workers,
            max_epochs=1000,
            batch_size=1024,
            virtual_batch_size=128
        )

In [10]:
clf = TabNetTuner(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs)

Device used : cpu


In [11]:
# Let's generate embedding size based on cat dims
cat_emb_dim_list = []
for max_dim in [1, 5, 10, 20, 50]:
    cat_emb_dim_list.append([min(nb // 2, max_dim) for nb in cat_dims])
cat_emb_dim_list

[[1, 1, 1, 1, 1, 1, 1, 1],
 [4, 5, 3, 5, 3, 2, 1, 5],
 [4, 8, 3, 7, 3, 2, 1, 10],
 [4, 8, 3, 7, 3, 2, 1, 20],
 [4, 8, 3, 7, 3, 2, 1, 21]]

In [12]:
grid = {
    "n_a": [3, 5, 8, 13, 21],
    "cat_emb_dim": cat_emb_dim_list,
    "n_independent": [0, 1, 2, 5],
    "n_shared": [0, 1, 2],
    "n_steps": [1, 3, 5, 8],
    "clip_value": [1],
    "gamma": [0.5, 1.3, 3],
    "momentum": [0.1, 0.05, 0.02, 0.005],
    "lambda_sparse": [0.1, 0.01, 0.001],
    "optimizer_params": [
        {'lr': 0.01}, 
        {'lr': 0.02}, 
        {'lr': 0.001}],
    "verbose": [0]
}

In [13]:
search = RandomizedSearchCV(
    clf,
    grid,
    n_iter=5,
    scoring="roc_auc",
    n_jobs=1,
    refit=False,
    cv=3,
    verbose=1,
    pre_dispatch=0,
    random_state=0,
    return_train_score=False,
)

In [14]:
search.fit(X, y)
search.best_params_

Device used : cpu
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Device used : cpu

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.89068
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 48 with best_epoch = 28 and best_val_0_auc = 0.89069
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_auc = 0.89179
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 123 with best_epoch = 103 and best_val_0_auc = 0.89602
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 132 with best_epoch = 112 and best_val_0_auc = 0.90382
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 73 with best_epoch = 53 and best_val_0_auc = 0.89494
Best weights from best 

{'verbose': 0,
 'optimizer_params': {'lr': 0.02},
 'n_steps': 1,
 'n_shared': 2,
 'n_independent': 5,
 'n_a': 3,
 'momentum': 0.005,
 'lambda_sparse': 0.001,
 'gamma': 1.3,
 'clip_value': 1,
 'cat_emb_dim': [4, 5, 3, 5, 3, 2, 1, 5]}

In [15]:
clf = TabNetClassifier(cat_idxs=cat_idxs, cat_dims=cat_dims, **search.best_params_)

In [16]:
clf.fit(
    X[train_indices],
    y[train_indices],
    patience=20,
    eval_set=[(X[valid_indices], y[valid_indices])]
)


Early stopping occurred at epoch 51 with best_epoch = 31 and best_val_0_auc = 0.91778
Best weights from best epoch are automatically used!


In [17]:
preds = clf.predict_proba(X[test_indices])[:, 1]

In [18]:
roc_auc_score(y_score=preds, y_true=y[test_indices])

0.9132824517203846