In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

# Download the SARCOS inverse dynamics dataset

In [None]:
# Training set
url = "http://www.gaussianprocess.org/gpml/data/sarcos_inv.mat"
dataset_name = 'sarcos_train'
# Cache location: <parent of the working directory>/data/<dataset_name>.csv.
# Path.cwd().parent replaces splitting os.getcwd() on "/" so the parent
# directory is resolved correctly whatever the platform's path separator is.
# NOTE: the downloaded payload is a MATLAB .mat file; the ".csv" suffix is kept
# unchanged so copies cached by earlier runs are still found.
out_train = Path.cwd().parent / 'data' / (dataset_name + '.csv')

# Create the data directory on first use, then download only when the file is
# not already cached.
out_train.parent.mkdir(parents=True, exist_ok=True)
if out_train.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    wget.download(url, out_train.as_posix())

In [None]:
# Testing set
url = "http://www.gaussianprocess.org/gpml/data/sarcos_inv_test.mat"

dataset_name = 'sarcos_test'
# Cache location: <parent of the working directory>/data/<dataset_name>.csv.
# Path.cwd().parent replaces splitting os.getcwd() on "/" so the parent
# directory is resolved correctly whatever the platform's path separator is.
# NOTE: the downloaded payload is a MATLAB .mat file; the ".csv" suffix is kept
# unchanged so copies cached by earlier runs are still found.
out_test = Path.cwd().parent / 'data' / (dataset_name + '.csv')

# Create the data directory on first use, then download only when the file is
# not already cached.
out_test.parent.mkdir(parents=True, exist_ok=True)
if out_test.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    wget.download(url, out_test.as_posix())

# Load data and split

In [None]:
import scipy.io

# Read both downloaded MATLAB files, then pull out the data matrix that each
# one stores under its own key.
train_mat = scipy.io.loadmat(out_train)
train = train_mat['sarcos_inv']

test_mat = scipy.io.loadmat(out_test)
test = test_mat['sarcos_inv_test']

In [None]:
# Draw one label per row of `train`: "train" with probability 0.9 and "valid"
# with probability 0.1 (uses NumPy's global random state).
n_rows = train.shape[0]
train_valid_indices = np.random.choice(
    ["train", "valid"],
    size=(n_rows,),
    p=[.9, .1],
)

In [None]:
# Boolean row selectors derived from the random train/valid labels.
train_rows = train_valid_indices == 'train'
valid_rows = train_valid_indices == 'valid'

# Columns 0-20 are used as inputs and column 21 as the regression target.
X_train, y_train = train[train_rows, :21], train[train_rows, 21]
X_valid, y_valid = train[valid_rows, :21], train[valid_rows, 21]

# The separate test matrix is sliced with the same column layout.
X_test, y_test = test[:, :21], test[:, 21]

# Network parameters

In [None]:
# Keyword arguments forwarded to the StepLR scheduler selected below.
step_lr_params = {'step_size': 20, 'gamma': 0.95}

clf = TabNetRegressor(
    # TabNet hyper-parameters (values as chosen for this experiment)
    n_d=128,
    n_a=128,
    n_steps=5,
    gamma=1.5,
    n_independent=2,
    n_shared=2,
    lambda_sparse=1e-5,
    momentum=0.2,
    clip_value=1,
    epsilon=1e-15,
    # no categorical columns are declared for this dataset
    cat_idxs=[],
    cat_dims=[],
    cat_emb_dim=1,
    # learning rate and its schedule
    lr=0.01,
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=step_lr_params,
    # run settings: seed, logging, device and where/how the model is saved
    seed=0,
    verbose=1,
    device_name="auto",
    model_name="Sarcos_model",
    saving_path="./"
)

# Training

In [None]:
# Training length / early-stopping settings and batch sizes, grouped by purpose.
training_budget = dict(max_epochs=1000, patience=50)
batching = dict(batch_size=4096, virtual_batch_size=128)

# Fit on the training split; the validation split is passed alongside it.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    **training_budget,
    **batching
)

In [None]:
# The explicit clf.load_best_model() call is deprecated: the best model is
# loaded automatically at the end of fit.

y_true = y_test
preds = clf.predict(X_test)

# NOTE: despite its name, test_auc holds the mean squared error on the test set.
test_auc = mean_squared_error(y_true, preds)

# dataset_name holds whatever the download cells assigned last.
print(f"BEST VALID SCORE FOR {dataset_name} : {clf.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

Recorded final test scores (MSE) on sarcos_test for the configurations labelled S, M and L:

- S : 5.197828951757569
- M : 3.6911551695466316
- L : 6.5352724 (with scheduler: 4.631672157657003)

# Global explainability: feature importances (summing to 1)

In [None]:
# Show the fitted model's feature importances; as the cell's last expression,
# the value is rendered as the cell output.
clf.feature_importances_

# Local explainability and masks

In [None]:
# Ask the fitted model to explain its predictions on the test features.
# `masks` is indexed by an integer and row-sliced when plotted further down;
# presumably one mask per TabNet step -- TODO confirm against the library docs.
explain_matrix, masks = clf.explain(X_test)

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# One row of three panels, one for each of the mask indices 0, 1 and 2.
fig, axs = plt.subplots(1, 3, figsize=(20,20))

for i, panel in enumerate(axs):
    # Only the first 50 rows of each mask are drawn.
    panel.imshow(masks[i][:50])
    panel.set_title(f"mask {i}")


# XGB

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted tree baseline trained on the same train/valid split as TabNet.
clf_xgb = XGBRegressor(max_depth=8,
    learning_rate=0.1,
    n_estimators=1000,
    verbosity=0,
    # 'reg:squarederror' is the current name of the squared-error objective;
    # the previous spelling 'reg:linear' is deprecated.
    objective='reg:squarederror',
    booster='gbtree',
    n_jobs=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    # The deprecated aliases silent / nthread / seed were previously passed as
    # None (i.e. unset); verbosity, n_jobs and random_state carry the settings.
    random_state=0)

# Both splits are monitored; training stops once the metric on the last
# eval_set entry (the validation split) has not improved for 40 rounds.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
# estimator constructor instead of fit() -- confirm against the installed version.
clf_xgb.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=40,
        verbose=10)

In [None]:
def _xgb_mse(features, targets):
    """Predict with the fitted XGBoost model and return (predictions, MSE)."""
    predicted = np.array(clf_xgb.predict(features))
    return predicted, mean_squared_error(y_true=targets, y_pred=predicted)


# Validation split first, then the held-out test split; `preds` ends up
# holding the test predictions.
preds, valid_score = _xgb_mse(X_valid, y_valid)
print(valid_score)

preds, test_score = _xgb_mse(X_test, y_test)
print(test_score)