In [2]:
from sklearn.model_selection import KFold
from pytorch_tabnet.metrics import Metric
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from tabnet_model import MyTabnet
import numpy as np

In [4]:
class RMSPE(Metric):
    """自定义评估指标"""

    def __init__(self):
        self._name = "rmspe"
        self._maximize = False

    def __call__(self, y_true, y_score):
        return np.sqrt(np.mean(np.square((y_true - y_score) / y_true)))


def RMSPELoss(y_pred, y_true):
    """自定义损失函数"""
    return torch.sqrt(torch.mean(((y_true - y_pred) / y_true) ** 2)).clone()


def process_tabnet_data(train, test):
    """Function to process features as input to TabNet model"""
    # 对缺失值和无穷值的处理
    train.replace([np.inf, -np.inf], np.nan, inplace=True)
    test.replace([np.inf, -np.inf], np.nan, inplace=True)
    for col in train.columns.to_list()[4:]:
        train[col] = train[col].fillna(train[col].mean())
        train = train.fillna(0)
    for col in test.columns.to_list()[3:]:
        test[col] = test[col].fillna(test[col].mean())
        test = test.fillna(0)

    X_train = train.drop(['row_id', 'target', 'time_id'], axis=1)  # 训练数据集特征
    y_train = train['target']  # 训练数据集标签

    X_test = test.drop(['time_id', 'row_id'], axis=1)

    categorical_columns = []
    categorical_dims = {}

    # 数据预处理:标签编码与数据缩放
    for col in X_train.columns:
        if col == 'stock_id':
            l_enc = LabelEncoder()
            X_train[col] = l_enc.fit_transform(X_train[col].values)
            X_test[col] = l_enc.transform(X_test[col].values)
            categorical_columns.append(col)
            categorical_dims[col] = len(l_enc.classes_)
        else:
            scaler = StandardScaler()
            X_train[col] = scaler.fit_transform(X_train[col].values.reshape(-1, 1))
            X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

    cat_idxs = [i for i, f in enumerate(X_train.columns.tolist()) if f in categorical_columns]

    cat_dims = [categorical_dims[f] for i, f in enumerate(X_train.columns.tolist()) if f in categorical_columns]

    return X_train, y_train, X_test, cat_idxs, cat_dims

In [5]:
# 数据来源:kaggle Optiver_Realized_Volatility_Prediction比赛特征加工数据(从中随机选取10000条)
train_data = pd.read_csv("train_dataset.csv")
test_data = pd.read_csv("test_dataset.csv")

X_train, y_train, X_test, cat_idxs, cat_dims = process_tabnet_data(train_data, test_data)
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values

In [6]:
tabnet_params = dict(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=8,
    n_d=16,
    n_a=16,
    n_steps=2,
    gamma=1.3,
    n_independent=2,
    n_shared=2,
    lambda_sparse=0,
    optimizer_fn=Adam,
    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
    mask_type="entmax",
    scheduler_params=dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
    scheduler_fn=CosineAnnealingWarmRestarts,
    seed=23,
    verbose=10)
tabnet_fit_params = dict(max_epochs=200,
                         patience=50,
                         batch_size=1024 * 10,
                         virtual_batch_size=128 * 10,
                         num_workers=2,
                         drop_last=False,
                         eval_metric=[RMSPE],
                         loss_fn=RMSPELoss)
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

In [7]:
test_predictions, oof_predictions, model_list = MyTabnet(X_train, y_train, X_test, kfold, tabnet_params,
                                                         tabnet_fit_params)
print(test_predictions)

Training fold 1
Device used : cuda
epoch 0  | loss: 320.59845| val_0_rmspe: 238.38251|  0:00:01s
epoch 10 | loss: 42.26109| val_0_rmspe: 692.1362|  0:00:04s
epoch 20 | loss: 16.61687| val_0_rmspe: 255.57201|  0:00:07s
epoch 30 | loss: 11.67211| val_0_rmspe: 202.42849|  0:00:10s
epoch 40 | loss: 5.51693 | val_0_rmspe: 8.06346 |  0:00:13s
epoch 50 | loss: 3.73243 | val_0_rmspe: 4.3262  |  0:00:16s
epoch 60 | loss: 2.62438 | val_0_rmspe: 2.53799 |  0:00:20s
epoch 70 | loss: 1.6475  | val_0_rmspe: 3.3993  |  0:00:23s
epoch 80 | loss: 1.52509 | val_0_rmspe: 1.72744 |  0:00:26s
epoch 90 | loss: 0.96968 | val_0_rmspe: 1.2624  |  0:00:29s
epoch 100| loss: 0.82148 | val_0_rmspe: 1.03171 |  0:00:32s
epoch 110| loss: 0.59156 | val_0_rmspe: 0.91138 |  0:00:35s
epoch 120| loss: 0.53538 | val_0_rmspe: 0.78263 |  0:00:38s
epoch 130| loss: 0.50592 | val_0_rmspe: 0.81045 |  0:00:42s
epoch 140| loss: 0.75363 | val_0_rmspe: 0.68072 |  0:00:45s
epoch 150| loss: 0.4546  | val_0_rmspe: 0.74015 |  0:00:48s
e

In [8]:
print(model_list)

[TabNetRegressor(n_d=16, n_a=16, n_steps=2, gamma=1.3, cat_idxs=[0], cat_dims=[112], cat_emb_dim=8, n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0, seed=23, clip_value=1, verbose=10, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02, 'weight_decay': 1e-05}, scheduler_fn=<class 'torch.optim.lr_scheduler.CosineAnnealingWarmRestarts'>, scheduler_params={'T_0': 200, 'T_mult': 1, 'eta_min': 0.0001, 'last_epoch': -1, 'verbose': False}, mask_type='entmax', input_dim=157, output_dim=1, device_name='auto'), TabNetRegressor(n_d=16, n_a=16, n_steps=2, gamma=1.3, cat_idxs=[0], cat_dims=[112], cat_emb_dim=8, n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0, seed=23, clip_value=1, verbose=10, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02, 'weight_decay': 1e-05}, scheduler_fn=<class 'torch.optim.lr_scheduler.CosineAnnealingWarmRestarts'>, scheduler_params={'T_0': 200, 'T_mult': 1, 'eta_min': 0.