In [None]:
from pytorch_tabnet import tab_network
from pytorch_tabnet.tab_model import Model

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

# Download census-income dataset

In [None]:
# Location of the raw Adult / census-income records on the UCI archive.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset_name = 'census-income'
# Local cache path: <parent of the current working directory>/data/<name>.csv.
# Built with pathlib instead of splitting the cwd string on "/", so the parent
# directory is resolved correctly whatever the platform's path separator is.
out = Path.cwd().parent / 'data' / f'{dataset_name}.csv'

In [None]:
# Make sure the destination folder exists before anything is written into it.
out.parent.mkdir(parents=True, exist_ok=True)

# Only hit the network when there is no local copy of the file yet.
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")

# Load data and split

In [None]:
# NOTE(review): read_csv runs with its default header handling, so the first
# line of the file supplies the column names, and `target` below is one of
# those names. If the downloaded file has no header row, its first record is
# consumed as the header -- confirm this is intended.
train = pd.read_csv(out)
target = ' <=50K'

# Randomly assign every row to a split (80% / 10% / 10%) unless the file
# already ships with a "Set" column.
if "Set" not in train.columns:
    n_rows = train.shape[0]
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        p=[.8, .1, .1],
        size=(n_rows,),
    )

# Index labels of the rows that belong to each split.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index
test_indices = train.loc[train["Set"] == "test"].index

# Simple preprocessing

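Label-encode the categorical (object-dtype) features and fill missing values: a placeholder category for categorical columns, the training-set mean for float columns.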

In [None]:
# Names of the label-encoded columns and, for each, its number of classes.
categorical_columns = []
categorical_dims = {}

# Label-encode every object-dtype column in place. Missing entries are first
# replaced by the placeholder category "VV_likely" so the encoder sees only
# strings. Any object-dtype column is processed, which includes "Set" and the
# target when they hold strings.
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

# Impute each float column with the mean of that column computed on the
# training rows only. The fill is applied per column: calling
# DataFrame.fillna with a scalar would fill the NaNs of *every* column with
# the mean of whichever float column happened to be processed first.
for col in train.columns[train.dtypes == 'float64']:
    train[col] = train[col].fillna(train.loc[train_indices, col].mean())

# Define categorical features for categorical embeddings

In [None]:
# Bookkeeping columns that must not be fed to the model.
unused_feat = ['Set']

# Model inputs: every column that is neither bookkeeping nor the target.
excluded_cols = unused_feat + [target]
features = [name for name in train.columns if name not in excluded_cols]

# For each categorical feature, record its position inside `features` and
# the number of classes found by the label encoder (same order in both lists).
cat_idxs = []
cat_dims = []
for position, name in enumerate(features):
    if name in categorical_columns:
        cat_idxs.append(position)
        cat_dims.append(categorical_dims[name])

# Cast the target column to a plain integer dtype.
train[target] = train[target].astype(int)

# Network parameters

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Batch sizes: `batch_size` goes into the training parameters and
# `mini_batch_size` into the network parameters as "virtual_batch_size".
batch_size = 1024  # 64
mini_batch_size = 128

# Learning rate handed to the training parameters.
LR = 2e-2

# NOTE(review): not referenced by any other cell visible in this notebook.
num_workers = 5

In [None]:
# Keyword arguments for the network constructor; the categorical positions and
# cardinalities come from the feature-definition cell.
network_params = dict(
    input_dim=len(features),
    n_d=8,
    n_a=8,
    n_independent=2,
    n_shared=2,
    n_steps=3,
    gamma=1.3,
    output_dim=2,
    momentum=0.1,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    virtual_batch_size=mini_batch_size,
)

# Run identifier that spells out the main hyper-parameters, assembled from
# its fragments in a single join.
description = "".join([
    f"test_TabNet_LR_{LR}_BS_{batch_size}_DS_{dataset_name}",
    f"_miniBS_{mini_batch_size}",
    f"_nd_{network_params['n_d']}",
    f"_na_{network_params['n_a']}",
    f"_nsteps_{network_params['n_steps']}",
    f"_gamma_{network_params['gamma']}",
    f"_momentum_{network_params['momentum']}",
])

In [None]:
# Learning-rate schedule: torch's StepLR class plus the keyword arguments it
# will be given (both are passed on through `training_params`).
scheduler_params = dict(gamma=0.9, step_size=20)
my_scheduler = torch.optim.lr_scheduler.StepLR

# Training configuration, unpacked into model.set_params(...) further down.
training_params = dict(
    model_name=description,
    lambda_sparse=1e-3,
    lr=LR,
    patience=200,
    optimizer_fn=torch.optim.Adam,
    scheduler_fn=my_scheduler,
    scheduler_params=scheduler_params,
    max_epochs=1000,
    batch_size=batch_size,
    clip_value=0.5,
    device=device,
)

# Training

In [None]:
# Materialise each split as NumPy arrays (feature matrix and target vector).
# The *_indices objects hold index labels (they were taken from `.index`), so
# rows are selected with label-based `.loc`; positional `.iloc` would only pick
# the same rows while the frame keeps a default 0..n-1 index.
X_train = train.loc[train_indices, features].values
y_train = train.loc[train_indices, target].values

X_valid = train.loc[valid_indices, features].values
y_valid = train.loc[valid_indices, target].values

X_test = train.loc[test_indices, features].values
y_test = train.loc[test_indices, target].values

In [None]:
# Network class to instantiate and the wrapper object that trains it.
network = tab_network.TabNet
model = Model()

# Hand the wrapper the network definition and the training configuration.
model.def_network(network, **network_params)
model.set_params(**training_params)

# Options left at their "off" values; the alternatives noted by the author
# are kept next to each entry.
fit_options = dict(
    balanced=False,  # True
    weights=None,  # {0: 1, 1:10}
)

model.fit(
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    **fit_options,
)

In [None]:
# Switch to the best model recorded by the wrapper before scoring.
model.load_best_model()

# Score the held-out test rows; column 1 of the predicted probabilities is
# used as the score for the ROC AUC.
y_true = y_test
preds = model.predict_proba(X_test)
test_auc = roc_auc_score(y_true=y_true, y_score=preds[:, 1])

print(f"BEST VALID SCORE FOR {dataset_name} : {model.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

# Local explainability and masks

In [None]:
# Call the fitted model's `explain` on the test rows; it returns two objects.
# `masks` is later indexed by step number (masks[i]) when plotting --
# presumably one feature mask per decision step; TODO confirm the exact
# structure and shapes against the pytorch_tabnet documentation.
explain_matrix, masks = model.explain(X_test)

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# One panel per step, side by side.
n_steps = network_params['n_steps']

# squeeze=False always returns a 2-D array of Axes; taking row 0 gives a 1-D
# sequence even for a single step. With the default squeeze, n_steps == 1
# returns a bare Axes object and `axs[i]` raises.
fig, axs = plt.subplots(1, n_steps, squeeze=False)
axs = axs[0]

for i in range(n_steps):
    # Show the first 50 rows of the i-th mask.
    axs[i].imshow(masks[i][:50])
    axs[i].set_title(f"mask {i}")


# XGB

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier trained on the same arrays as the network.
# The deprecated alias arguments `silent`, `nthread` and `seed` (all passed as
# None) are dropped: `verbosity`, `n_jobs` and `random_state` already carry
# those settings, and None was a no-op for each alias.
clf = XGBClassifier(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=1000,
    verbosity=0,
    objective='binary:logistic',
    booster='gbtree',
    n_jobs=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
)

# Fit with the validation split as evaluation set, an early-stopping window
# of 40 rounds and verbose=10.
# NOTE(review): recent XGBoost releases expect `early_stopping_rounds` on the
# estimator constructor rather than on fit(); confirm against the installed
# version before upgrading.
clf.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=40,
        verbose=10)

In [None]:
# ROC AUC of the XGBoost classifier on the validation split, scored with
# column 1 of the predicted probabilities.
valid_proba = np.array(clf.predict_proba(X_valid))
valid_auc = roc_auc_score(y_true=y_valid, y_score=valid_proba[:, 1])
print(valid_auc)

# Same metric on the test split; `preds` and `test_auc` end up holding the
# XGBoost test results.
preds = np.array(clf.predict_proba(X_test))
test_auc = roc_auc_score(y_true=y_test, y_score=preds[:, 1])
print(test_auc)