In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the `tabnet` imports in the next cell resolve to the local checkout
# -- TODO confirm.
sys.path.append('../')

## Import packages

In [2]:
import torch
from sklearn.datasets import load_breast_cancer

from tabnet.estimator import TabNetClassifier
from tabnet.utils.logger import init_logger

## Init logger

In [3]:
# Settings forwarded to `init_logger`; kept as named variables so they stay
# available to the rest of the notebook.
logger_dir, logger_name, level = 'logs', 'TestRegression', 'INFO'

logger = init_logger(
    logger_dir=logger_dir,
    logger_name=logger_name,
    level=level,
)

## Load data

In [4]:
# Load the dataset as a (features, labels) pair instead of a Bunch object.
X, y = load_breast_cancer(return_X_y=True)

# Report the dimensions of both arrays, features first, one per line.
print(X.shape, y.shape, sep='\n')

(569, 30)
(569,)


## Init estimator

In [5]:
# Build the TabNet estimator.
# - `input_dims` is taken from the loaded feature matrix instead of repeating
#   the literal 30, so the cell keeps working if the dataset changes.
# - `is_cuda` is only enabled when torch reports a usable GPU; hard-coding True
#   presumably fails on CPU-only machines -- confirm against TabNetClassifier.
tabnet_classifier = TabNetClassifier(
    input_dims=X.shape[1], output_dims=[1], logger=logger,
    is_cuda=torch.cuda.is_available(),
    reprs_dims=4, atten_dims=4, num_steps=4, num_indep=1, num_shared=1,
    virtual_batch_size=256, batch_size=512
)

# Kept as the last expression on purpose: its return value is what the cell
# displays (the estimator repr).
tabnet_classifier.build(path=None)



TabNetClassifier(atten_dims=4, batch_size=512, input_dims=30, is_cuda=True,
                 logger=<RootLogger root (INFO)>, num_indep=1, num_shared=1,
                 num_steps=4, output_dims=[1], reprs_dims=4,
                 virtual_batch_size=256)

## Setup training parameters

In [6]:
from torch.optim import Adam, lr_scheduler

# Keyword arguments that are later unpacked into `tabnet_classifier.pretrain`.
# The optimizer and scheduler are passed as classes together with their
# constructor arguments, not as ready-made instances.
training_params = dict(
    batch_size=512,
    max_epochs=200,
    optimizer=Adam,
    optimizer_params=dict(lr=0.2),
    schedulers=[lr_scheduler.ExponentialLR],
    scheduler_params=dict(gamma=0.99),
)

## Start pretraining

In [None]:
# Run pretraining on the feature matrix only (the labels `y` are not passed),
# forwarding the settings collected in `training_params` as keyword arguments.
tabnet_classifier.pretrain(X, **training_params)

[2021-02-20 10:16:36,310][INFO][TabNet] Convert to pretrain model.
[2021-02-20 10:16:36,312][INFO][TabNet] start training.
[2021-02-20 10:16:36,312][INFO][TabNet] ******************** epoch : 0 ********************
[2021-02-20 10:16:41,337][INFO][TabNet] -------------------- train info --------------------
[2021-02-20 10:16:41,338][INFO][TabNet] total_loss : 6160.04833984375
[2021-02-20 10:16:41,339][INFO][TabNet] task_loss : 6160.046875
[2021-02-20 10:16:41,340][INFO][TabNet] mask_loss : -1.2503468990325928
[2021-02-20 10:16:41,341][INFO][TabNet] time_cost : 0.933852
[2021-02-20 10:16:41,341][INFO][TabNet] ******************** epoch : 1 ********************
[2021-02-20 10:16:45,446][INFO][TabNet] -------------------- train info --------------------
[2021-02-20 10:16:45,447][INFO][TabNet] total_loss : 1830.2628173828125
[2021-02-20 10:16:45,447][INFO][TabNet] task_loss : 1830.2618408203125
[2021-02-20 10:16:45,448][INFO][TabNet] mask_loss : -0.9323570728302002
[2021-02-20 10:16:45,449]

In [None]:
# clustering dataset
# determine k using elbow method

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): `emb` is not defined in any visible cell of this notebook, so
# this cell raises NameError on a fresh kernel. It presumably holds the
# embeddings produced by the pretrained TabNet model, one row per sample --
# TODO: add the cell that computes it before this one.

# k means determine k
distortions = []
K = range(1, 10)
for k in K:
    # Fixed seed so the elbow curve is reproducible across runs.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(emb)
    # Distance from every sample to its closest cluster centre.
    nearest_centroid_dist = np.min(
        cdist(emb, kmeanModel.cluster_centers_, 'euclidean'), axis=1
    )
    # Distortion is the mean of those distances. Averaging over the rows of
    # `emb` itself (rather than dividing by X.shape[0]) keeps the value
    # correct even if `emb` and `X` do not have the same number of rows.
    distortions.append(np.mean(nearest_centroid_dist))

# Plot the elbow
fig, ax = plt.subplots()
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()