In [1]:
# Standard library, data stack, and the sklearn helpers used throughout the notebook.
import sys
import pandas as pd
import numpy as np
# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import time

# Add the parent directory to the import path; presumably this is what makes
# the project-local `models` package importable in later cells — TODO confirm.
sys.path.append('../')

### Binary

In [2]:
# Binary task: column roles for the home-credit-default-risk dataset.
target_col = 'target'
index_col = 'sk_id_curr'

# Read the full table, then carve out a stratified 80/20 hold-out split.
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/home-credit-default-risk/train.parquet')
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train[target_col],
)

# Features are everything except the target and the row identifier.
X_train, y_train = train.drop(columns=[target_col, index_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col, index_col]), test[target_col]

# One categorical feature name per line.
with open('/www/dslib/spark_sota_modeling/dataset/home-credit-default-risk/categorical_features.txt', 'r') as f:
    categorical_features = [raw_line.strip() for raw_line in f]
len(categorical_features)

16

In [None]:
from models.estimators.cemlp_estimator import CatEmbMLPBinary

# Single reference run of the categorical-embedding MLP on the binary task.
# `random_state` is pinned (as in the other estimator cells of this notebook)
# so the reported ROC AUC is reproducible across kernel restarts.
# NOTE(review): the hold-out test split is also passed as `eval_set`; if fit()
# uses it for early stopping / checkpoint selection, the AUC below is
# optimistic — TODO confirm and consider a separate validation split.
model = CatEmbMLPBinary(
    cat_emb_dim=4,
    hidden_dims=[512, 256],
    activation='swish',
    dropout=0.7,
    initialization='xavier_uniform',
    normalization='ghost_batch',
    batch_size=8192,
    virtual_batch_size=256,
    learning_rate=0.001,
    verbose=True,
    feature_dropout=0.3,
    random_state=42,
    # Optional self-attention settings, currently disabled:
    # use_self_attention=False,
    # num_attention_heads=2,
    # attn_dropout=0.3,
    # d_model=80,
)
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    eval_metric='roc_auc',
    mode='max',
    cat_features=categorical_features,
)

# Column 1 of predict_proba is used as the positive-class score.
y_pred_proba = model.predict_proba(X_test, cat_features=categorical_features)
roc_auc_score(y_test, y_pred_proba[:, 1])

In [None]:
# NOTE(review): presumably an earlier hyperparameter configuration kept for
# reference, with the bare number at the end being the score it achieved
# (likely ROC AUC on the hold-out split) — TODO confirm or move to markdown.
# cat_emb_dim=8,
# hidden_dims=[256, 128],
# activation='swish',
# dropout=0.6,
# initialization='xavier_uniform',
# learning_rate=0.001,
# verbose=True,
# dynamic_emb_size=False,
# feature_dropout=0.2,
# use_self_attention=False,
# num_attention_heads=2,
0.7791351516222171



In [None]:
from itertools import product

from models.estimators.cemlp_estimator import CatEmbMLPBinary

# Grid search over hidden-layer dropout and input feature dropout.
# Every run is printed as it finishes (long-running cell) AND recorded in
# `grid_results`, so the scores can be sorted/compared afterwards instead of
# being re-read from the text log.
# NOTE(review): the output saved with this cell lists dropout=0.6, which is
# not in the grid below; re-run the cell to refresh it.
# NOTE(review): the hold-out test split is also passed as `eval_set`; if fit()
# uses it for early stopping / checkpoint selection, the reported AUCs are
# optimistic — TODO confirm and consider a separate validation split.
dropout_grid = [0.7, 0.8, 0.9]
feature_dropout_grid = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

grid_results = []
for dropout, feature_dropout in product(dropout_grid, feature_dropout_grid):
    # perf_counter is monotonic, so the elapsed time is immune to clock changes.
    start_time = time.perf_counter()
    model = CatEmbMLPBinary(
        cat_emb_dim=4,
        hidden_dims=[512, 256],
        activation='swish',
        dropout=dropout,
        initialization='xavier_uniform',
        normalization='ghost_batch',
        batch_size=1024,
        virtual_batch_size=128,
        learning_rate=0.001,
        verbose=False,
        feature_dropout=feature_dropout,
        random_state=42,
    )

    model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        eval_metric='roc_auc',
        mode='max',
        cat_features=categorical_features,
    )
    y_pred_proba = model.predict_proba(X_test, cat_features=categorical_features)
    elapsed_seconds = time.perf_counter() - start_time

    # Column 1 of predict_proba is used as the positive-class score.
    auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    formatted = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
    grid_results.append(
        {
            'dropout': dropout,
            'feature_dropout': feature_dropout,
            'roc_auc': auc,
            'seconds': elapsed_seconds,
        }
    )
    print(
        f"dropout: {dropout}, "
        f"feature_dropout: {feature_dropout}, "
        f"ROC AUC: {auc:.4f}, "
        f"Time: {formatted}"
    )

# Best configurations first.
pd.DataFrame(grid_results).sort_values('roc_auc', ascending=False)

dropout: 0.6, feature_dropout: 0.0, ROC AUC: 0.7762, Time: 00:09:11
dropout: 0.6, feature_dropout: 0.1, ROC AUC: 0.7767, Time: 00:10:07
dropout: 0.6, feature_dropout: 0.2, ROC AUC: 0.7763, Time: 00:10:09
dropout: 0.6, feature_dropout: 0.3, ROC AUC: 0.7774, Time: 00:13:40
dropout: 0.6, feature_dropout: 0.4, ROC AUC: 0.7774, Time: 00:16:18
dropout: 0.6, feature_dropout: 0.5, ROC AUC: 0.7774, Time: 00:13:40
dropout: 0.6, feature_dropout: 0.6, ROC AUC: 0.7771, Time: 00:18:41
dropout: 0.6, feature_dropout: 0.7, ROC AUC: 0.7758, Time: 00:24:15


### Multiclass

In [None]:
# Multiclass task: the forest-cover-type dataset, label column `cover_type`.
target_col = 'cover_type'

# Read the full table, then carve out a stratified 80/20 hold-out split.
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/forest-cover-type/train.parquet')
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train[target_col],
)

# Features are every column except the label.
X_train, y_train = train.drop(columns=[target_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col]), test[target_col]

# One categorical feature name per line.
with open('/www/dslib/spark_sota_modeling/dataset/forest-cover-type/categorical_features.txt', 'r') as f:
    categorical_features = [raw_line.strip() for raw_line in f]
len(categorical_features)

In [None]:
from models.estimators.cemlp_estimator import CatEmbMLPMulticlass

# Categorical-embedding MLP on the multiclass task.
# Each keyword is passed exactly once: the previous version supplied
# `verbose=True` twice, which Python rejects with
# "SyntaxError: keyword argument repeated", so the cell could not run.
model = CatEmbMLPMulticlass(
    # Number of classes is taken from the distinct labels in the training split.
    n_classes=train[target_col].nunique(),
    cat_emb_dim=2,
    hidden_dims=[512, 256],
    activation='swish',
    dropout=0.2,
    initialization='xavier_uniform',
    normalization='ghost_batch',
    batch_size=4096,
    virtual_batch_size=256,
    learning_rate=0.01,
    verbose=True,
    random_state=42,
    dynamic_emb_size=False,
    feature_dropout=0.1,
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), cat_features=categorical_features)

# Hard labels feed the accuracy score; probabilities are kept for inspection.
y_pred = model.predict(X_test, cat_features=categorical_features)
y_pred_proba = model.predict_proba(X_test, cat_features=categorical_features)
accuracy_score(y_test, y_pred)

### Regression

In [None]:
# Regression task: column roles for the allstate-claims-severity dataset.
target_col = 'loss'
index_col = 'id'

# Read the full table, then take a plain (non-stratified) 80/20 hold-out split.
train = pd.read_parquet('/www/dslib/spark_sota_modeling/dataset/allstate-claims-severity/train.parquet')
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
)

# Features are everything except the target and the row identifier.
X_train, y_train = train.drop(columns=[target_col, index_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col, index_col]), test[target_col]

# One categorical feature name per line.
with open('/www/dslib/spark_sota_modeling/dataset/allstate-claims-severity/categorical_features.txt', 'r') as f:
    categorical_features = [raw_line.strip() for raw_line in f]
len(categorical_features)

In [None]:
from models.estimators.cemlp_estimator import CatEmbMLPRegressor

# Regressor with only dropout and logging overridden; every other
# hyperparameter is left at the estimator's default.
model = CatEmbMLPRegressor(dropout=0.3, verbose=True)

# Fit against MAE on the evaluation set (lower is better, hence mode='min').
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features,
    eval_set=(X_test, y_test),
    eval_metric='mae',
    mode='min',
)

y_pred = model.predict(X_test, cat_features=categorical_features)

In [None]:
from sklearn.metrics import mean_absolute_error

# Hold-out mean absolute error of the regressor's predictions.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# TODO: remove the `train` section from the config
# TODO: add the full list of hyperparameters for all models to the config