In [1]:
!pip install requests black nb_black hpbandster
%load_ext nb_black

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


<IPython.core.display.Javascript object>

In [2]:
import os
from pathlib import Path

from requests import get
import pandas as pd
import numpy as np

np.random.seed(0)

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import logging

logging.basicConfig(level=logging.WARN)

<IPython.core.display.Javascript object>

In [3]:
from pytorch_tabnet.tuner.bohb_tuner import BOHBTuner
from pytorch_tabnet.tuner.xgb_worker import XGBWorker

<IPython.core.display.Javascript object>

# Utilities

In [4]:
def download(url, out, force=False, verify=True):
    out.parent.mkdir(parents=True, exist_ok=True)
    if force and out.exists():
        print(f"Removing file at {str(out)}")
        out.unlink()

    if out.exists():
        print("File already exists.")
        return
    print(f"Downloading {url} at {str(out)} ...")
    # open in binary mode
    with out.open(mode="wb") as file:
        # get request
        response = get(url, verify=verify)
        for chunk in response.iter_content(100000):
            # write to file
            file.write(chunk)

<IPython.core.display.Javascript object>

In [5]:
UNKNOWN_VALUE = ["Unkn0wnV@lue"]


class SafeLabelEncoder(LabelEncoder):
    """
    Safe label encoder, encoding every unknown value as Unkn0wnV@lue.
    """

    def fit(self, y):
        """
        Fit the label encoder, by casting the numpy array as a string, then adding the code for unknown.
        
        Parameters
        ----------
        y : numpy array
            the values to fit
        
        Returns
        -------
        SafeLabelEncoder
            itself, fitted
        """
        return super().fit(np.concatenate((y.astype("str"), UNKNOWN_VALUE)))

    def fit_transform(self, y):
        """
        Fit the encoder, then transform the input data and returns it.
        
        Parameters
        ----------
        y : numpy array
            the values to fit
        
        Returns
        -------
        numpy array
            the encoded data
        """
        self.fit(y)
        return super().transform(y)

    def transform(self, y):
        """
        Transform the input data and returns it.
        
        Parameters
        ----------
        y : numpy array
            the values to fit
        
        Returns
        -------
        numpy array
            the encoded data
        """
        return super().transform(
            np.where(
                np.isin(y.astype("str"), self.classes_), y.astype("str"), UNKNOWN_VALUE
            )
        )



<IPython.core.display.Javascript object>

# Download census-income dataset

In [6]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

dataset_name = "census-income"
out = Path(os.getcwd() + "/data/" + dataset_name + ".csv")
out_test = Path(os.getcwd() + "/data/" + dataset_name + "_test.csv")

download(url, out, force=False)
download(url_test, out_test, force=False)

File already exists.
File already exists.


<IPython.core.display.Javascript object>

# Load data and split

In [7]:
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "target",
]

<IPython.core.display.Javascript object>

In [8]:
train = pd.read_csv(out, names=cols)
test = pd.read_csv(out_test, names=cols, skiprows=2)
target = "target"

train[target] = train[target].str.strip()
# Test has . in label, let's clean it
test[target] = test[target].str.strip().str.strip(".")

<IPython.core.display.Javascript object>

In [9]:
used_columns = list(set(train.columns.tolist()) - set([target]) - set(["Set"]))
used_columns

['age',
 'capital-loss',
 'marital-status',
 'sex',
 'capital-gain',
 'native-country',
 'race',
 'education',
 'relationship',
 'fnlwgt',
 'workclass',
 'occupation',
 'education-num',
 'hours-per-week']

<IPython.core.display.Javascript object>

# Simple preprocessing

Label encode categorical features and fill empty cells.

In [10]:
nunique = train[used_columns].nunique()
types = train[used_columns].dtypes

cat_cols = train[used_columns].columns[(nunique < 200) | (types == "object")]
other_cols = train[used_columns].columns[~train[used_columns].columns.isin(cat_cols)]
print(cat_cols)
print(other_cols)

Index(['age', 'capital-loss', 'marital-status', 'sex', 'capital-gain',
       'native-country', 'race', 'education', 'relationship', 'workclass',
       'occupation', 'education-num', 'hours-per-week'],
      dtype='object')
Index(['fnlwgt'], dtype='object')


<IPython.core.display.Javascript object>

In [11]:
# Fillna
train[cat_cols] = train[cat_cols].astype("str")
train[other_cols] = train[other_cols].fillna(train[other_cols].mean())

test[cat_cols] = test[cat_cols].astype("str")
test[other_cols] = test[other_cols].fillna(train[other_cols].mean())

<IPython.core.display.Javascript object>

In [12]:
train.isnull().sum().sum()

0

<IPython.core.display.Javascript object>

In [13]:
enc = {}
for col in cat_cols:
    label_enc = SafeLabelEncoder()
    enc[col] = label_enc
    train[col] = label_enc.fit_transform(train[col])
    test[col] = label_enc.transform(test[col])
enc[target] = SafeLabelEncoder()
train[target] = enc[target].fit_transform(train[target])
test[target] = enc[target].transform(test[target])

enc

{'age': SafeLabelEncoder(),
 'capital-loss': SafeLabelEncoder(),
 'marital-status': SafeLabelEncoder(),
 'sex': SafeLabelEncoder(),
 'capital-gain': SafeLabelEncoder(),
 'native-country': SafeLabelEncoder(),
 'race': SafeLabelEncoder(),
 'education': SafeLabelEncoder(),
 'relationship': SafeLabelEncoder(),
 'workclass': SafeLabelEncoder(),
 'occupation': SafeLabelEncoder(),
 'education-num': SafeLabelEncoder(),
 'hours-per-week': SafeLabelEncoder(),
 'target': SafeLabelEncoder()}

<IPython.core.display.Javascript object>

# Define categorical features for categorical embeddings

In [14]:
unused_feat = ["Set"]

cat_idxs = [i for i, f in enumerate(used_columns) if f in cat_cols]
cat_dims = [len(enc[f].classes_) for f in used_columns if f in cat_cols]
print(cat_idxs)
print(cat_dims)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13]
[74, 93, 8, 3, 120, 43, 6, 17, 7, 10, 16, 17, 95]


<IPython.core.display.Javascript object>

# Training one model

In [15]:
# X = train[used_columns].values
# y = train[target].values

train["Set"] = np.random.choice(
    ["train", "valid"], p=[0.8, 0.2], size=(train.shape[0],)
)

train_indices = train[train.Set == "train"].index
valid_indices = train[train.Set == "valid"].index

X_train = train[used_columns].values[train_indices]
X_valid = train[used_columns].values[valid_indices]

y_train = train[target].values[train_indices]
y_valid = train[target].values[valid_indices]

# Test here should be ignored for training, only purpose is benching with paper values
X_test = test[used_columns].values
y_test = test[target].values

del train, test, train_indices, valid_indices

<IPython.core.display.Javascript object>

# BOHB

## XGB Worker

In [16]:
tuner = BOHBTuner(XGBWorker)

<IPython.core.display.Javascript object>

In [17]:
min_budget = 20
max_budget = 1000
n_iter = 60

<IPython.core.display.Javascript object>

In [None]:
%%time
result = tuner.fit(
    X_train, y_train, X_valid, y_valid, X_test, y_test, n_iter, min_budget, max_budget
)
result

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.siz

  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)
  kernel_value = np.ones(Xi.size) * h / (num_levels - 1)


In [None]:
%matplotlib inline
tuner.describe_results()

In [None]:
result["best_params"]

In [None]:
from xgboost import XGBClassifier

clf = XGBClassifier(**result["best_params"], n_estimators=10000)
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=40)

In [None]:
roc_auc_score(y_score=clf.predict_proba(X_train)[:, 1], y_true=y_train)

In [None]:
roc_auc_score(y_score=clf.predict_proba(X_valid)[:, 1], y_true=y_valid)

In [None]:
roc_auc_score(y_score=clf.predict_proba(X_test)[:, 1], y_true=y_test)