In [1]:
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

# Download census-income dataset

In [2]:
# Where the raw UCI "adult" file lives and what the local copy is called.
dataset_name = 'census-income'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Local cache: <current working directory>/data/<dataset_name>.csv
out = Path.cwd() / 'data' / f'{dataset_name}.csv'

In [3]:
# Make sure the target directory exists, then fetch the file only when there
# is no local copy yet.
out.parent.mkdir(parents=True, exist_ok=True)
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")

File already exists.


# Load data and split

In [4]:
# NOTE(review): the printed column names further down (' State-gov',
# ' Bachelors', ...) look like data values, i.e. the first record of the file
# is being consumed as the header row. That is why the target column is
# addressed as ' <=50K'.
train = pd.read_csv(out)
target = ' <=50K'

# Tag every row as train/valid/test (80/10/10) unless the file already
# carries a "Set" column.
if "Set" not in train.columns:
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        size=(len(train),),
        p=[.8, .1, .1],
    )

# Row labels belonging to each split.
train_indices = train.index[train["Set"] == "train"]
valid_indices = train.index[train["Set"] == "valid"]
test_indices = train.index[train["Set"] == "test"]

# Simple preprocessing

Label encode categorical features and fill empty cells.

In [5]:
# Label-encode every object-dtype column and record its cardinality.
# As the printed output shows, this also covers the target column and "Set".
categorical_columns = []
categorical_dims = {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    # Missing values become their own category before encoding.
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

# Impute each float column with the mean of *that* column, computed on the
# training rows only. The fill is restricted to `train[col]`: calling
# `train.fillna(...)` on the whole frame would write one column's mean into
# the NaNs of every other column as well.
for col in train.columns[train.dtypes == 'float64']:
    train[col] = train[col].fillna(train.loc[train_indices, col].mean())

 State-gov 9
 Bachelors 16
 Never-married 7
 Adm-clerical 15
 Not-in-family 6
 White 5
 Male 2
 United-States 42
 <=50K 2
Set 3


# Define categorical features for categorical embeddings

In [6]:
unused_feat = ['Set']

# Model inputs: every column except the unused ones and the target.
features = [col for col in train.columns if col != target and col not in unused_feat]

# Position (within `features`) and cardinality of each categorical feature,
# both listed in feature order.
cat_idxs = [idx for idx, name in enumerate(features) if name in categorical_columns]
cat_dims = [categorical_dims[name] for name in features if name in categorical_columns]

# define your embedding sizes : here just a random choice
cat_emb_dim = [5, 4, 3, 6, 2, 2, 1, 10]

# Network parameters

In [7]:
# Only the categorical-embedding description is supplied; every other
# constructor argument is left unspecified.
clf = TabNetRegressor(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=cat_emb_dim,
)



# Training

In [8]:
# Materialise the feature matrix and the target once, then slice per split.
# The target is shaped (n_rows, 1).
feature_matrix = train[features].values
target_column = train[target].values.reshape(-1, 1)

X_train, y_train = feature_matrix[train_indices], target_column[train_indices]
X_valid, y_valid = feature_matrix[valid_indices], target_column[valid_indices]
X_test, y_test = feature_matrix[test_indices], target_column[test_indices]

In [9]:
# Short run (2 epochs) when the CI environment variable is set to a non-empty
# value, full run (100 epochs) otherwise.
max_epochs = 2 if os.getenv("CI", False) else 100

In [10]:
from pytorch_tabnet.augmentations import RegressionSMOTE
# Augmentation object that is handed to clf.fit(augmentations=...) below.
# NOTE(review): p=0.2 is presumably the probability with which the
# augmentation is applied — confirm against the pytorch-tabnet docs.
aug = RegressionSMOTE(p=0.2)

In [11]:
# Train the regressor, logging every metric on both the training and the
# validation split after each epoch.
# NOTE(review): patience=50 is presumably the early-stopping patience in
# epochs — confirm against the pytorch-tabnet docs.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    # Evaluation sets, the names they are logged under, and the metrics.
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['rmsle', 'mae', 'rmse', 'mse'],
    # Training length.
    max_epochs=max_epochs,
    patience=50,
    # Batching and data loading.
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    # Training-time augmentation.
    augmentations=aug,
)

epoch 0  | loss: 0.16632 | train_rmsle: 0.08172 | train_mae: 0.32434 | train_rmse: 0.42037 | train_mse: 0.17671 | valid_rmsle: 0.08229 | valid_mae: 0.32509 | valid_rmse: 0.41638 | valid_mse: 0.17337 |  0:00:01s
epoch 1  | loss: 0.13043 | train_rmsle: 0.08851 | train_mae: 0.34694 | train_rmse: 0.43319 | train_mse: 0.18765 | valid_rmsle: 0.08877 | valid_mae: 0.34742 | valid_rmse: 0.43215 | valid_mse: 0.18676 |  0:00:02s
epoch 2  | loss: 0.12421 | train_rmsle: 0.07367 | train_mae: 0.31208 | train_rmse: 0.38054 | train_mse: 0.14481 | valid_rmsle: 0.07219 | valid_mae: 0.30876 | valid_rmse: 0.37698 | valid_mse: 0.14211 |  0:00:04s
epoch 3  | loss: 0.11857 | train_rmsle: 0.08045 | train_mae: 0.30774 | train_rmse: 0.38188 | train_mse: 0.14583 | valid_rmsle: 0.08033 | valid_mae: 0.30664 | valid_rmse: 0.38204 | valid_mse: 0.14595 |  0:00:05s
epoch 4  | loss: 0.11478 | train_rmsle: 0.0657  | train_mae: 0.2798  | train_rmse: 0.35083 | train_mse: 0.12308 | valid_rmsle: 0.06471 | valid_mae: 0.2762  

epoch 39 | loss: 0.10109 | train_rmsle: 0.05026 | train_mae: 0.21729 | train_rmse: 0.31905 | train_mse: 0.1018  | valid_rmsle: 0.04844 | valid_mae: 0.21238 | valid_rmse: 0.31259 | valid_mse: 0.09771 |  0:00:55s
epoch 40 | loss: 0.1015  | train_rmsle: 0.05009 | train_mae: 0.21863 | train_rmse: 0.31728 | train_mse: 0.10067 | valid_rmsle: 0.04864 | valid_mae: 0.21431 | valid_rmse: 0.31218 | valid_mse: 0.09746 |  0:00:57s
epoch 41 | loss: 0.10149 | train_rmsle: 0.04867 | train_mae: 0.21139 | train_rmse: 0.32118 | train_mse: 0.10316 | valid_rmsle: 0.04726 | valid_mae: 0.20743 | valid_rmse: 0.31627 | valid_mse: 0.10002 |  0:00:58s
epoch 42 | loss: 0.10082 | train_rmsle: 0.0507  | train_mae: 0.21215 | train_rmse: 0.31643 | train_mse: 0.10013 | valid_rmsle: 0.04898 | valid_mae: 0.20688 | valid_rmse: 0.31061 | valid_mse: 0.09648 |  0:01:00s
epoch 43 | loss: 0.1012  | train_rmsle: 0.05138 | train_mae: 0.21497 | train_rmse: 0.31801 | train_mse: 0.10113 | valid_rmsle: 0.04964 | valid_mae: 0.21004 

epoch 78 | loss: 0.09692 | train_rmsle: 0.04901 | train_mae: 0.20755 | train_rmse: 0.31433 | train_mse: 0.0988  | valid_rmsle: 0.04771 | valid_mae: 0.20417 | valid_rmse: 0.30959 | valid_mse: 0.09585 |  0:01:50s
epoch 79 | loss: 0.09665 | train_rmsle: 0.04741 | train_mae: 0.2011  | train_rmse: 0.31131 | train_mse: 0.09691 | valid_rmsle: 0.04623 | valid_mae: 0.19879 | valid_rmse: 0.30713 | valid_mse: 0.09433 |  0:01:51s
epoch 80 | loss: 0.09707 | train_rmsle: 0.04718 | train_mae: 0.20445 | train_rmse: 0.31355 | train_mse: 0.09831 | valid_rmsle: 0.04589 | valid_mae: 0.20147 | valid_rmse: 0.30907 | valid_mse: 0.09552 |  0:01:52s
epoch 81 | loss: 0.09612 | train_rmsle: 0.04716 | train_mae: 0.20586 | train_rmse: 0.31175 | train_mse: 0.09719 | valid_rmsle: 0.04576 | valid_mae: 0.20278 | valid_rmse: 0.30691 | valid_mse: 0.09419 |  0:01:54s
epoch 82 | loss: 0.09604 | train_rmsle: 0.04763 | train_mae: 0.20646 | train_rmse: 0.31163 | train_mse: 0.09711 | valid_rmsle: 0.04651 | valid_mae: 0.20362 



In [None]:
# No explicit clf.load_best_model() call: it is deprecated, the best model is
# automatically loaded at the end of fit.
y_true = y_test
preds = clf.predict(X_test)

# Mean squared error of the fitted model on the held-out test split.
test_score = mean_squared_error(y_true=y_true, y_pred=preds)

print(f"BEST VALID SCORE FOR {dataset_name} : {clf.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_score}")

# Save model and load

In [None]:
# save tabnet model
# The value returned by save_model is kept because it is what load_model is
# given when the model is restored below.
# NOTE(review): presumably this is the path of the archive actually written
# (it may differ from saving_path_name, e.g. by an added extension) — confirm.
saving_path_name = "./tabnet_model_test_1"
saved_filepath = clf.save_model(saving_path_name)

In [None]:
# define new model with basic parameters and load state dict weights
# No cat_idxs / cat_dims / cat_emb_dim are passed to the constructor here;
# load_model is relied on to restore the model from saved_filepath.
loaded_clf = TabNetRegressor()
loaded_clf.load_model(saved_filepath)

In [None]:
# Score the reloaded model on the same test split as the original model.
loaded_preds = loaded_clf.predict(X_test)
# Pass y_true / y_pred by keyword, as the scoring of the original model does:
# positionally, the first argument of mean_squared_error is y_true, so
# `mean_squared_error(loaded_preds, y_test)` had the two roles swapped
# (harmless for MSE, which is symmetric, but misleading).
loaded_test_mse = mean_squared_error(y_true=y_test, y_pred=loaded_preds)

print(f"FINAL TEST SCORE FOR {dataset_name} : {loaded_test_mse}")

In [None]:
# The reloaded model must reproduce the original model's test MSE exactly.
assert test_score == loaded_test_mse

# Global explainability : feature importances summing to 1

In [None]:
# Feature importances of the fitted TabNet model, shown as the cell output.
clf.feature_importances_

# Local explainability and masks

In [None]:
# explain() is called on the test split and returns two objects; `masks` is
# indexed with 0, 1 and 2 when it is plotted further down.
explain_matrix, masks = clf.explain(X_test)

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 20))

# One panel per mask, each showing the first 50 entries of that mask.
for i, ax in enumerate(axs):
    ax.imshow(masks[i][:50])
    ax.set_title(f"mask {i}")


# XGB

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted tree baseline trained on the same splits as TabNet.
clf_xgb = XGBRegressor(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=1000,
    verbosity=0,
    # 'reg:linear' is the deprecated alias of squared-error regression.
    objective='reg:squarederror',
    booster='gbtree',
    n_jobs=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    # The deprecated `silent`, `nthread` and `seed` arguments were only ever
    # passed as None here; `verbosity`, `n_jobs` and `random_state` above are
    # their replacements.
)

# Stop once the validation metric has not improved for 40 rounds; progress is
# printed every 10 rounds.
# NOTE(review): recent xgboost releases expect `early_stopping_rounds` on the
# constructor rather than on fit() — confirm against the installed version.
clf_xgb.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=40,
    verbose=10,
)

In [None]:
# NOTE: despite the *_auc names, both values are mean squared errors.

# Validation split.
preds = np.asarray(clf_xgb.predict(X_valid))
valid_auc = mean_squared_error(y_true=y_valid, y_pred=preds)
print(valid_auc)

# Test split.
preds = np.asarray(clf_xgb.predict(X_test))
test_auc = mean_squared_error(y_true=y_test, y_pred=preds)
print(test_auc)