In [1]:
import sys
import pandas as pd
import numpy as np
pd.options.display.max_columns = None
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

sys.path.append('../')

### Binary

In [3]:
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/home-credit-default-risk/train.parquet')
target_col = 'target'
index_col = 'sk_id_curr'
train, test = train_test_split(train, test_size=0.2, random_state=42, stratify=train[target_col])
X_train = train.drop(columns=[target_col, index_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col, index_col])
y_test = test[target_col]

In [None]:
from models.estimators.tabnet_estimator import TabNetBinary

model = TabNetBinary(
    verbose=True,
    # scale_method='standard', # standard, minmax, quantile, binning
    hidden_dim=64,
    decision_dim=32,
    n_steps=5,
    n_glu_layers=3,
    dropout=0.1,
    gamma=1.5,
    lambda_sparse=0.0001,
    batch_size=16384,
    virtual_batch_size=512,
    momentum=0.7,
    learning_rate=0.03,
    epochs=200,
    cat_emb_dim=6,
    early_stopping_patience=10,
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), eval_metric='roc_auc', mode='max')

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [None]:
roc_auc_score(y_test, y_pred_proba[:,1])

In [None]:
# standard  0.7737342479104056
# minmax:   0.7629154604000974
# quantile: 0.7729699391727092
# binning:  0.7630777980797842

### Multiclass

In [2]:
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/forest-cover-type/train.parquet')
target_col = 'cover_type'
train, test = train_test_split(train, test_size=0.2, random_state=42, stratify=train[target_col])
X_train = train.drop(columns=[target_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col])
y_test = test[target_col]

In [None]:
from models.estimators.tabnet_estimator import TabNetMulticlass

# cat_emb_dim=6,  # Размерность эмбеддингов для категориальных признаков
# n_steps=4,  # Количество шагов в TabNet
# hidden_dim=16,  # Размерность скрытого слоя
# decision_dim=8,  # Размерность решающего слоя
# n_glu_layers=3,  # Количество GLU слоев
# dropout=0.6,  # Вероятность дропаута
# gamma=1.5,  # Коэффициент затухания для масок внимания
# lambda_sparse=0.0001,  # Коэффициент регуляризации разреженности
# virtual_batch_size=128,  # Размер виртуального батча для Ghost BatchNorm
# momentum=0.9,  # Параметр momentum для BatchNorm
# batch_size=1024,  # Размер батча для обучения
# epochs=50,  # Количество эпох обучения
# learning_rate=0.005,  # Скорость обучения
# early_stopping_patience=5,  # Количество эпох без улучшения до остановки
# weight_decay=1e-5,  # Весовая регуляризация для оптимизатора
# scale_numerical=True,  # Масштабировать ли числовые признаки
# scale_method="standard",  # Метод масштабирования ("standard", "minmax", "quantile", "binning")
# n_bins=10,  # Количество бинов для binning

model = TabNetMulticlass(
    verbose=True,
    n_classes=train[target_col].nunique(),
    hidden_dim=64,
    decision_dim=32,
    n_steps=5,
    n_glu_layers=3,
    dropout=0.1,
    gamma=1.5,
    lambda_sparse=0.0001,
    batch_size=16384,
    virtual_batch_size=512,
    momentum=0.7,
    learning_rate=0.05,
    epochs=100,
    cat_emb_dim=6,
    early_stopping_patience=10,
    # scale_method='standard', # standard, minmax, quantile, binning
)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
# 0.8479471270104902
# 0.9272996394241112

### Regression

In [2]:
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/allstate-claims-severity/train.parquet')
target_col = 'loss'
index_col = 'id'
train, test = train_test_split(train, test_size=0.2, random_state=42)
X_train = train.drop(columns=[target_col, index_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col, index_col])
y_test = test[target_col]

In [3]:
from models.estimators.tabnet_estimator import TabNetRegressor
model = TabNetRegressor(
    verbose=True,
    # scale_method='standard', # standard, minmax, quantile, binning
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), eval_metric='mae', mode='min')

y_pred = model.predict(X_test)

Начинаем обучение на cuda...


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 1/50
Train loss: 17595754.2230, Train mae: 3027.1934
Val loss: 16854354.0811, Val mae: 2990.2883
Сохраняем лучшую модель с метрикой mae: 2990.2883


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 2/50
Train loss: 16083892.2500, Train mae: 2882.9526
Val loss: 14069073.2162, Val mae: 2725.3162
Сохраняем лучшую модель с метрикой mae: 2725.3162


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 3/50
Train loss: 12108162.1824, Train mae: 2431.2056
Val loss: 8923443.8649, Val mae: 2074.6516
Сохраняем лучшую модель с метрикой mae: 2074.6516


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 4/50
Train loss: 6788024.3699, Train mae: 1661.6820
Val loss: 4244063.2095, Val mae: 1312.6130
Сохраняем лучшую модель с метрикой mae: 1312.6130


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 5/50
Train loss: 4663687.7990, Train mae: 1361.3583
Val loss: 3825361.8108, Val mae: 1209.1071
Сохраняем лучшую модель с метрикой mae: 1209.1071


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 6/50
Train loss: 4593373.6926, Train mae: 1349.4701
Val loss: 3833777.4662, Val mae: 1226.4618
Нет улучшения в течение 1 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 7/50
Train loss: 4567623.3530, Train mae: 1346.9381
Val loss: 3872435.9797, Val mae: 1266.8577
Нет улучшения в течение 2 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 8/50
Train loss: 4565507.8497, Train mae: 1344.8649
Val loss: 3746526.3108, Val mae: 1199.1720
Сохраняем лучшую модель с метрикой mae: 1199.1720


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 9/50
Train loss: 4534343.2703, Train mae: 1341.8129
Val loss: 3723217.0608, Val mae: 1211.6886
Нет улучшения в течение 1 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 10/50
Train loss: 4500299.5118, Train mae: 1337.3116
Val loss: 4127223.2635, Val mae: 1206.2593
Нет улучшения в течение 2 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 11/50
Train loss: 4504811.3784, Train mae: 1336.4084
Val loss: 3736360.9392, Val mae: 1196.3994
Сохраняем лучшую модель с метрикой mae: 1196.3994


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 12/50
Train loss: 4491788.7821, Train mae: 1336.9951
Val loss: 3855456.6351, Val mae: 1253.0751
Нет улучшения в течение 1 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 13/50
Train loss: 4484649.1672, Train mae: 1330.3732
Val loss: 3702270.3851, Val mae: 1209.0760
Нет улучшения в течение 2 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 14/50
Train loss: 4472410.8970, Train mae: 1331.2920
Val loss: 3743178.6014, Val mae: 1208.3342
Нет улучшения в течение 3 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 15/50
Train loss: 4457964.2770, Train mae: 1328.9586
Val loss: 3696697.0135, Val mae: 1226.9131
Нет улучшения в течение 4 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 16/50
Train loss: 4434305.7990, Train mae: 1326.6647
Val loss: 3860251.6419, Val mae: 1184.2305
Сохраняем лучшую модель с метрикой mae: 1184.2305


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 17/50
Train loss: 4428448.0574, Train mae: 1329.2167
Val loss: 3754855.8378, Val mae: 1189.4880
Нет улучшения в течение 1 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 18/50
Train loss: 4400451.4966, Train mae: 1325.6748
Val loss: 3857028.3446, Val mae: 1186.1631
Нет улучшения в течение 2 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 19/50
Train loss: 4438057.4611, Train mae: 1326.9114
Val loss: 3774082.4392, Val mae: 1197.9896
Нет улучшения в течение 3 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 20/50
Train loss: 4400160.1993, Train mae: 1323.2311
Val loss: 3677805.3986, Val mae: 1213.9348
Нет улучшения в течение 4 эпох


Training:   0%|          | 0/148 [00:00<?, ?it/s]

Validation:   0%|          | 0/37 [00:00<?, ?it/s]

Epoch 21/50
Train loss: 4432715.3412, Train mae: 1324.3528
Val loss: 3741551.0608, Val mae: 1190.4227
Нет улучшения в течение 5 эпох
Останавливаем обучение из-за отсутствия улучшений
Загружена лучшая модель


Predicting:   0%|          | 0/37 [00:00<?, ?it/s]

In [4]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred)

1190.4226980803755

In [None]:
# 1190.4226980803755