In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
from pathlib import Path

from requests import get
import pandas as pd
import numpy as np

np.random.seed(0)

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import logging

logging.basicConfig(level=logging.WARN)

<IPython.core.display.Javascript object>

In [3]:
from pytorch_tabnet.tuner.bohb_tuner import BOHBTuner
from pytorch_tabnet.tuner.tabnet_worker import TabNetWorker

<IPython.core.display.Javascript object>

# Utilities

In [4]:
def download(url, out, force=False, verify=True):
    out.parent.mkdir(parents=True, exist_ok=True)
    if force and out.exists():
        print(f"Removing file at {str(out)}")
        out.unlink()

    if out.exists():
        print("File already exists.")
        return
    print(f"Downloading {url} at {str(out)} ...")
    # open in binary mode
    with out.open(mode="wb") as file:
        # get request
        response = get(url, verify=verify)
        for chunk in response.iter_content(100000):
            # write to file
            file.write(chunk)

<IPython.core.display.Javascript object>

In [5]:
UNKNOWN_VALUE = ["Unkn0wnV@lue"]


class SafeLabelEncoder(LabelEncoder):
    """
    Safe label encoder, encoding every unknown value as Unkn0wnV@lue.
    """

    def fit(self, y):
        """
        Fit the label encoder, by casting the numpy array as a string, then adding the code for unknown.
        
        Parameters
        ----------
        y : numpy array
            the values to fit
        
        Returns
        -------
        SafeLabelEncoder
            itself, fitted
        """
        return super().fit(np.concatenate((y.astype("str"), UNKNOWN_VALUE)))

    def fit_transform(self, y):
        """
        Fit the encoder, then transform the input data and returns it.
        
        Parameters
        ----------
        y : numpy array
            the values to fit
        
        Returns
        -------
        numpy array
            the encoded data
        """
        self.fit(y)
        return super().transform(y)

    def transform(self, y):
        """
        Transform the input data and returns it.
        
        Parameters
        ----------
        y : numpy array
            the values to fit
        
        Returns
        -------
        numpy array
            the encoded data
        """
        return super().transform(
            np.where(
                np.isin(y.astype("str"), self.classes_), y.astype("str"), UNKNOWN_VALUE
            )
        )



<IPython.core.display.Javascript object>

# Download census-income dataset

In [6]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

dataset_name = "census-income"
out = Path(os.getcwd() + "/data/" + dataset_name + ".csv")
out_test = Path(os.getcwd() + "/data/" + dataset_name + "_test.csv")

download(url, out, force=False)
download(url_test, out_test, force=False)

File already exists.
File already exists.


<IPython.core.display.Javascript object>

# Load data and split

In [7]:
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "target",
]

<IPython.core.display.Javascript object>

In [8]:
train = pd.read_csv(out, names=cols)
test = pd.read_csv(out_test, names=cols, skiprows=2)
target = "target"

train[target] = train[target].str.strip()
# Test has . in label, let's clean it
test[target] = test[target].str.strip().str.strip(".")

<IPython.core.display.Javascript object>

In [9]:
used_columns = list(set(train.columns.tolist()) - set([target]) - set(["Set"]))
used_columns

['native-country',
 'education',
 'marital-status',
 'race',
 'fnlwgt',
 'age',
 'workclass',
 'relationship',
 'education-num',
 'capital-loss',
 'sex',
 'hours-per-week',
 'capital-gain',
 'occupation']

<IPython.core.display.Javascript object>

# Simple preprocessing

Label encode categorical features and fill empty cells.

In [10]:
nunique = train[used_columns].nunique()
types = train[used_columns].dtypes

cat_cols = train[used_columns].columns[(nunique < 200) | (types == "object")]
other_cols = train[used_columns].columns[~train[used_columns].columns.isin(cat_cols)]
print(cat_cols)
print(other_cols)

Index(['native-country', 'education', 'marital-status', 'race', 'age',
       'workclass', 'relationship', 'education-num', 'capital-loss', 'sex',
       'hours-per-week', 'capital-gain', 'occupation'],
      dtype='object')
Index(['fnlwgt'], dtype='object')


<IPython.core.display.Javascript object>

In [11]:
# Fillna
train[cat_cols] = train[cat_cols].astype("str")
train[other_cols] = train[other_cols].fillna(train[other_cols].mean())

test[cat_cols] = test[cat_cols].astype("str")
test[other_cols] = test[other_cols].fillna(train[other_cols].mean())

<IPython.core.display.Javascript object>

In [12]:
train.isnull().sum().sum()

0

<IPython.core.display.Javascript object>

In [13]:
enc = {}
for col in cat_cols:
    label_enc = SafeLabelEncoder()
    enc[col] = label_enc
    train[col] = label_enc.fit_transform(train[col])
    test[col] = label_enc.transform(test[col])
enc[target] = SafeLabelEncoder()
train[target] = enc[target].fit_transform(train[target])
test[target] = enc[target].transform(test[target])

enc

{'native-country': SafeLabelEncoder(),
 'education': SafeLabelEncoder(),
 'marital-status': SafeLabelEncoder(),
 'race': SafeLabelEncoder(),
 'age': SafeLabelEncoder(),
 'workclass': SafeLabelEncoder(),
 'relationship': SafeLabelEncoder(),
 'education-num': SafeLabelEncoder(),
 'capital-loss': SafeLabelEncoder(),
 'sex': SafeLabelEncoder(),
 'hours-per-week': SafeLabelEncoder(),
 'capital-gain': SafeLabelEncoder(),
 'occupation': SafeLabelEncoder(),
 'target': SafeLabelEncoder()}

<IPython.core.display.Javascript object>

# Define categorical features for categorical embeddings

In [14]:
unused_feat = ["Set"]

cat_idxs = [i for i, f in enumerate(used_columns) if f in cat_cols]
cat_dims = [len(enc[f].classes_) for f in used_columns if f in cat_cols]
print(cat_idxs)
print(cat_dims)

[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13]
[43, 17, 8, 6, 74, 10, 7, 17, 93, 3, 95, 120, 16]


<IPython.core.display.Javascript object>

In [15]:
def log_emb_generator(cat_dim_list, max_dim):
    return [min(np.log2(nb).astype("int"), max_dim) for nb in cat_dim_list]

<IPython.core.display.Javascript object>

In [16]:
def emb_generator(cat_dim_list, max_dim):
    return [min(nb // 2, max_dim) for nb in cat_dim_list]

<IPython.core.display.Javascript object>

In [17]:
cat_emb_dims = []

for max_dim in [1, 2, 5, 10, 20, 50]:
    cat_emb_dims.append(log_emb_generator(cat_dims, max_dim))
    cat_emb_dims.append(emb_generator(cat_dims, max_dim))

<IPython.core.display.Javascript object>

# Training one model

In [18]:
train["Set"] = np.random.choice(
    ["train", "valid"], p=[0.8, 0.2], size=(train.shape[0],)
)

train_indices = train[train.Set == "train"].index
valid_indices = train[train.Set == "valid"].index

X_train = train[used_columns].values[train_indices]
X_valid = train[used_columns].values[valid_indices]

y_train = train[target].values[train_indices]
y_valid = train[target].values[valid_indices]

# Test here should be ignored for training, only purpose is benching with paper values
X_test = test[used_columns].values
y_test = test[target].values

del train, test, train_indices, valid_indices

<IPython.core.display.Javascript object>

# BOHB

## TabNet Worker

In [19]:
tuner = BOHBTuner(
    TabNetWorker,
    # This is for overriding default grid
    {
        "cat_dims": [cat_dims],
        "cat_idxs": [cat_idxs],
        "cat_emb_dims": cat_emb_dims,
        "lr": [0.02],
    },
)

<IPython.core.display.Javascript object>

In [20]:
min_budget = 2
max_budget = 10
n_iter = 30

<IPython.core.display.Javascript object>

In [None]:
%%time
result = tuner.fit(
    X_train, y_train, X_valid, y_valid, X_test, y_test, n_iter, min_budget, max_budget
)
result

Device used : cuda
Device used : cuda
Will train until validation stopping metric hasn't improved in 5 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | 0.65526 |  0.50373 |   7.6       
| 2     | 0.77002 |  0.60426 |   15.1      
| 3     | 0.81314 |  0.64146 |   22.4      
Training done in 22.370 seconds.
---------------------------------------
Device used : cuda
Will train until validation stopping metric hasn't improved in 5 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | 0.50783 |  0.44612 |   1.9       
| 2     | 0.66699 |  0.43341 |   3.6       
| 3     | 0.75022 |  0.54452 |   5.4       
Training done in 5.429 seconds.
---------------------------------------
Device used : cuda
Will train until validation stopping metric hasn't improved in 5 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | 0.57686 |  0.50460 |   1.9  

| 1     | 0.61986 |  0.43036 |   3.1       
| 2     | 0.71401 |  0.57173 |   6.3       
| 3     | 0.73260 |  0.53650 |   9.4       
| 4     | 0.75294 |  0.40455 |   12.4      
| 5     | 0.74352 |  0.57375 |   15.3      
| 6     | 0.80416 |  0.56700 |   18.1      
| 7     | 0.80772 |  0.58253 |   20.8      
| 8     | 0.80031 |  0.58828 |   23.9      
| 9     | 0.79055 |  0.54962 |   26.7      
| 10    | 0.83528 |  0.50640 |   29.6      
Training done in 29.627 seconds.
---------------------------------------
Device used : cuda
Will train until validation stopping metric hasn't improved in 5 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | 0.61574 |  0.57976 |   7.3       
| 2     | 0.77744 |  0.52757 |   13.5      
| 3     | 0.81024 |  0.66992 |   20.6      
| 4     | 0.84152 |  0.77662 |   27.6      
| 5     | 0.85905 |  0.80608 |   34.6      
| 6     | 0.86350 |  0.83523 |   41.2      
| 7     | 0.86396 |  0.84063 |   48.1      
|

| 2     | 0.68588 |  0.51575 |   5.3       
| 3     | 0.72846 |  0.49432 |   7.8       
Training done in 7.804 seconds.
---------------------------------------
Device used : cuda
Will train until validation stopping metric hasn't improved in 5 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | 0.70173 |  0.62870 |   3.1       
| 2     | 0.78914 |  0.54646 |   6.0       
| 3     | 0.81399 |  0.59389 |   9.0       
Training done in 8.966 seconds.
---------------------------------------
Device used : cuda
Will train until validation stopping metric hasn't improved in 5 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | 0.70173 |  0.62870 |   2.9       
| 2     | 0.78914 |  0.54646 |   5.8       
| 3     | 0.81399 |  0.59389 |   8.8       
| 4     | 0.84063 |  0.56106 |   11.7      
| 5     | 0.84653 |  0.50055 |   14.6      
| 6     | 0.85923 |  0.47822 |   17.5      
Early stopping o

In [None]:
%matplotlib inline
tuner.describe_results()

In [None]:
result["best_params"]

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

fit_params = {}
model_params = {}
for key, value in result["best_params"].items():
    if key in ["batch_size", "num_workers", "patience", "virtual_batch_size"]:
        fit_params[key] = value
    else:
        model_params[key] = value
fit_params.pop("patience")
clf = TabNetClassifier(**model_params)
clf.fit(X_train, y_train, X_valid=X_valid, y_valid=y_valid, **fit_params, patience=20)

In [None]:
roc_auc_score(y_score=clf.predict_proba(X_train)[:, 1], y_true=y_train)

In [None]:
roc_auc_score(y_score=clf.predict_proba(X_valid)[:, 1], y_true=y_valid)

In [None]:
roc_auc_score(y_score=clf.predict_proba(X_test)[:, 1], y_true=y_test)