In [1]:
import sys
import pandas as pd
import numpy as np
pd.options.display.max_columns = None
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import time

# Put the parent directory on the module search path (presumably so the
# project-local `models.*` imports in later cells resolve -- confirm).
# The path is relative to the kernel's working directory.
# Guarded so that re-running this cell does not append duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

### Binary

In [2]:
# Binary task data (home-credit-default-risk); the label lives in `target`.
target_col = 'target'
index_col = 'sk_id_curr'  # excluded from the features below; presumably a row id
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/home-credit-default-risk/train.parquet')

# Stratified 80/20 hold-out split with a fixed seed; `train` is rebound to the
# 80% part.
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train[target_col],
)

# Features are every column except the label and the id column.
X_train, y_train = train.drop(columns=[target_col, index_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col, index_col]), test[target_col]

In [None]:
from models.estimators.tabnet_estimator import TabNetBinary
from models.estimators.cemlp_estimator import CatEmbMLPBinary

# Alternative model kept for reference (currently disabled):
# model = TabNetBinary(
#     epochs=1000,
#     dropout=0.6,
#     learning_rate=0.005,
#     early_stopping_patience=10,
#     reducelronplateau_patience=3,
#     reducelronplateau_factor=0.7,
#     verbose=True,
# )

# Parameter notes (translated from the original comments):
# drop_path_rate : float, default=0.0
#     DropPath (Stochastic Depth) probability for the residual branch
# drop_path_mode : str, default='uniform'
#     How the probability is distributed across layers: 'uniform' or 'linear'

# Grid sweep over weight initialization x activation for CatEmbMLPBinary.
# One record per finished configuration is appended to `sweep_results`, so the
# metrics gathered so far stay available if the cell is interrupted mid-sweep.
# NOTE(review): the hold-out set is passed as eval_set and is also the set the
# ROC AUC is computed on. If the estimator uses eval_set for early stopping or
# LR scheduling, the reported scores are optimistically biased -- confirm.
sweep_results = []
for initialization in ['he_normal', 'he_uniform', 'xavier_normal', 'xavier_uniform']:
    for activation in ['relu', 'leaky_relu', 'gelu', 'swish', 'prelu']:
        start_time = time.time()
        model = CatEmbMLPBinary(
            cat_emb_dim=8,
            hidden_dims=[64, 32],
            activation=activation,
            leaky_relu_negative_slope=0.1,
            dropout=0.6,
            batch_norm=False,
            layer_norm=True,
            initialization=initialization,
            batch_size=1024,
            epochs=100,
            learning_rate=0.01,
            momentum=0.9,
            weight_decay=1e-5,
            early_stopping_patience=10,
            scale_numerical=True,
            scale_method='standard',
            n_bins=10,
            verbose=False,
            random_state=42,
            lr_scheduler_patience=3,
            lr_scheduler_factor=0.7,
            dynamic_emb_size=True,
            feature_dropout=0.3,
        )

        model.fit(X_train, y_train, eval_set=(X_test, y_test), eval_metric='roc_auc', mode='max')
        y_pred_proba = model.predict_proba(X_test)
        # Column index 1 of the predicted probabilities is scored
        # (presumably the positive class -- confirm against the estimator).
        roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
        # Wall-clock time covers model construction, fit, predict and scoring.
        elapsed = time.time() - start_time
        sweep_results.append({
            'initialization': initialization,
            'activation': activation,
            'roc_auc': roc_auc,
            'time_s': elapsed,
        })
        print(f"Initialization: {initialization:>12}, Activation: {activation:>10}, "
              f"ROC AUC: {roc_auc:.6f}, "
              f"Time: {elapsed:.2f}s")

# Ranked summary of all completed configurations (highest ROC AUC first).
pd.DataFrame(sweep_results).sort_values('roc_auc', ascending=False)

KeyboardInterrupt: 

### Multiclass

In [2]:
# Multiclass task data (forest-cover-type); the label lives in `cover_type`.
target_col = 'cover_type'
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/forest-cover-type/train.parquet')

# Stratified 80/20 hold-out split with a fixed seed; `train` is rebound to the
# 80% part.
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train[target_col],
)

# Only the label column is removed to form the feature matrices.
X_train, y_train = train.drop(columns=[target_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col]), test[target_col]

In [None]:
from models.nn.tabnet import TabNetMulticlassClassifier

# Parameter notes (translated; the values shown here are the ones from the
# notes, not necessarily the ones passed below):
# cat_emb_dim=6,  # Embedding dimension for categorical features
# n_steps=4,  # Number of steps in TabNet
# hidden_dim=16,  # Hidden layer dimension
# decision_dim=8,  # Decision layer dimension
# n_glu_layers=3,  # Number of GLU layers
# dropout=0.6,  # Dropout probability
# gamma=1.5,  # Decay coefficient for the attention masks
# lambda_sparse=0.0001,  # Sparsity regularization coefficient
# virtual_batch_size=128,  # Virtual batch size for Ghost BatchNorm
# momentum=0.9,  # Momentum parameter for BatchNorm
# batch_size=1024,  # Training batch size
# epochs=50,  # Number of training epochs
# learning_rate=0.005,  # Learning rate
# early_stopping_patience=5,  # Epochs without improvement before stopping
# weight_decay=1e-5,  # Weight regularization for the optimizer
# scale_numerical=True,  # Whether to scale numerical features
# scale_method="standard",  # Scaling method ("standard", "minmax", "quantile", "binning")
# n_bins=10,  # Number of bins for binning

model = TabNetMulticlassClassifier(
    # Number of classes is taken from the distinct labels in the training split.
    n_classes=train[target_col].nunique(),
    # Network shape
    cat_emb_dim=6,
    n_steps=5,
    hidden_dim=64,
    decision_dim=32,
    n_glu_layers=3,
    # Regularization / masking
    dropout=0.1,
    gamma=1.5,
    lambda_sparse=0.0001,
    # Batching and normalization
    batch_size=16384,
    virtual_batch_size=512,
    momentum=0.7,
    # Training schedule
    learning_rate=0.05,
    epochs=1000,
    early_stopping_patience=30,
    verbose=True,
    # scale_method='standard', # standard, minmax, quantile, binning
)

# The hold-out split is supplied as the evaluation set during fitting.
model.fit(X_train, y_train, eval_set=(X_test, y_test))

# Predictions on the hold-out split, both from predict and from predict_proba.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of hold-out rows whose predicted label matches the true label;
# left as the last expression so the notebook displays it.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
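# Values noted from earlier runs (presumably accuracy_score outputs; the
# configurations that produced them are not recorded here):
# 0.8479471270104902
# 0.9272996394241112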

### Regression

In [2]:
# Regression task data (allstate-claims-severity); the target column is `loss`.
target_col = 'loss'
index_col = 'id'  # excluded from the features below; presumably a row id
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/allstate-claims-severity/train.parquet')

# Plain (non-stratified) 80/20 hold-out split with a fixed seed; `train` is
# rebound to the 80% part.
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
)

# Features are every column except the target and the id column.
X_train, y_train = train.drop(columns=[target_col, index_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col, index_col]), test[target_col]

In [None]:
from models.nn.tabnet import TabNetRegressor

# Regressor configuration; arguments not listed here are left at whatever
# values the class itself supplies.
model = TabNetRegressor(
    n_glu_layers=2,
    dropout=0.3,
    lambda_sparse=0.001,
    learning_rate=0.0015,
    epochs=200,
    early_stopping_patience=20,
    verbose=True,
)

# The hold-out split is supplied as the evaluation set, tracked with MAE
# (mode='min').
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    eval_metric='mae',
    mode='min',
)

# Predictions on the hold-out split.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the hold-out predictions; left as the last
# expression so the notebook displays it.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Value noted from an earlier run (presumably the mean_absolute_error output):
# 1173.9303179279098