# Evaluating Model Performance for Predicting 30-Day Hospital Readmissions

Eric Jia, Scott Yamamoto

# Overview

This is the second of two notebooks and contains the code used to compare different models' performances, factoring into account the differences in performance using standard/weighted BCE and also using only the structured data vs. the combined structured data + embeddings.

In [1]:
!pip install scikit-learn pandas matplotlib seaborn numpy torch tqdm nbformat pytorch-tabnet --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m106.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd, numpy as np, time, json, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve)
import torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
sns.set(style="whitegrid")


This notebook reproduces all models and metrics:

* Logistic Regression  
* Multilayer Perceptron (MLP)
* FT-Transformer
* TabNet Classifier  
* XGBoost

We use the data was was generated from the previous notebook (543t_nb1).

In [3]:
import os, time, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, torch, torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, precision_recall_curve, roc_curve)
from sklearn.linear_model import LogisticRegression
from pytorch_tabnet.tab_model import TabNetClassifier

sns.set(style="whitegrid")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

def metric_dict(y_true, y_pred, prob, rt):
    return dict(
        Accuracy  = accuracy_score(y_true, y_pred),
        Precision = precision_score(y_true, y_pred, zero_division=0),
        Recall    = recall_score(y_true, y_pred),
        F1        = f1_score(y_true, y_pred),
        ROC_AUC   = roc_auc_score(y_true, prob),
        Runtime_s = rt)

Using device: cuda


In [4]:
# import the data
DATA_PATH = 'data/full_final_df.csv'
df_full = pd.read_csv(DATA_PATH, index_col=0)

# Final Processing Steps

Standarize the numerical data, apply one-hot encoding for the categorical data.

In [5]:
# Define categorical and continuous variables
categorical_cols = ["INSURANCE", "GENDER", "MARITAL_STATUS"]
continuous_cols = [
    "UREA_N_MIN", "UREA_N_MAX", "UREA_N_MEAN", "PLATELETS_MIN",
    "PLATELETS_MAX", "PLATELETS_MEAN", "MAGNESIUM_MIN", "MAGNESIUM_MAX",
    "MAGNESIUM_MEAN", "ALBUMIN_MIN", "ALBUMIN_MAX", "ALBUMIN_MEAN",
    "CALCIUM_MIN", "CALCIUM_MAX", "CALCIUM_MEAN", "RESP_RATE_MIN",
    "RESP_RATE_MAX", "RESP_RATE_MEAN", "HR_MIN", "HR_MAX", "HR_MEAN",
    "SYSBP_MIN", "SYSBP_MAX", "SYSBP_MEAN", "DIASBP_MIN", "DIASBP_MAX",
    "DIASBP_MEAN", "GLUCOSE_MIN", "GLUCOSE_MAX", "GLUCOSE_MEAN"
]

target_col = ["FUTURE_READMIT", "TEXT"]
selected_cols = categorical_cols + continuous_cols + target_col
sampled_df = df_full[selected_cols]

# apply one-hot encoding
sampled_df_encoded = pd.get_dummies(sampled_df, columns=categorical_cols, drop_first=True)

# convert cols to nuermical values
sampled_df_encoded["FUTURE_READMIT"] = sampled_df_encoded["FUTURE_READMIT"].map({"Yes": 1, "No": 0})
bool_cols = sampled_df_encoded.select_dtypes(include=['bool']).columns
sampled_df_encoded[bool_cols] = sampled_df_encoded[bool_cols].astype(int)

# standardize data
scaler = StandardScaler()
sampled_df_encoded[continuous_cols] = scaler.fit_transform(sampled_df_encoded[continuous_cols])

X = sampled_df_encoded.drop(columns=['FUTURE_READMIT']).values
y = sampled_df_encoded['FUTURE_READMIT'].values.astype(int)

In [6]:
TEXT_COL = "TEXT"
numeric_block = sampled_df_encoded.drop(columns=[TEXT_COL, 'FUTURE_READMIT']).values.astype(np.float32)
texts         = sampled_df_encoded[TEXT_COL].fillna(" ").tolist()

print("Numeric block shape:", numeric_block.shape)
print("Number of texts    :", len(texts))

Numeric block shape: (20948, 41)
Number of texts    : 20948


# Embed discharge summaries using ClinicalBERT

Used the github repository found here: https://github.com/EmilyAlsentzer/clinicalBERT

In [7]:
!pip install -q transformers sentencepiece tqdm

from transformers import AutoTokenizer, AutoModel
import torch, tqdm, numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model     = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT").to(device).eval()

MAX_LEN   = 512
BATCH_TXT = 16

def embed_batch(txt_list):
    enc = tokenizer(txt_list, padding=True, truncation=True,
                    max_length=MAX_LEN, return_tensors='pt')
    enc = {k: v.to(device) for k,v in enc.items()}
    with torch.no_grad():
        out = model(**enc).last_hidden_state
    mask = enc['attention_mask'].unsqueeze(-1)
    # mean-pool
    pooled = (out * mask).sum(1) / mask.sum(1)
    return pooled.cpu().numpy().astype(np.float32)

emb_list = []
for i in tqdm.trange(0, len(texts), BATCH_TXT, desc="Embedding"):
    emb_list.append(embed_batch(texts[i:i+BATCH_TXT]))
embeddings = np.vstack(emb_list)
print("Embeddings shape:", embeddings.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]


Embedding:   0%|          | 0/1310 [00:00<?, ?it/s][A
Embedding:   0%|          | 1/1310 [00:00<18:33,  1.18it/s][A
Embedding:   0%|          | 2/1310 [00:01<10:42,  2.04it/s][A
Embedding:   0%|          | 3/1310 [00:01<08:04,  2.70it/s][A
Embedding:   0%|          | 4/1310 [00:01<06:56,  3.13it/s][A
Embedding:   0%|          | 5/1310 [00:01<06:13,  3.49it/s][A
Embedding:   0%|          | 6/1310 [00:02<06:03,  3.59it/s][A
Embedding:   1%|          | 7/1310 [00:02<05:45,  3.77it/s][A
Embedding:   1%|          | 8/1310 [00:02<05:27,  3.97it/s][A
Embedding:   1%|          | 9/1310 [00:02<05:20,  4.06it/s][A
Embedding:   1%|          | 10/1310 [00:02<05:11,  4.17it/s][A
Embedding:   1%|          | 11/1310 [00:03<05:07,  4.23it/s][A
Embedding:   1%|          | 12/1310 [00:03<05:04,  4.26it/s][A
Embedding:   1%|          | 13/1310 [00:03<05:00,  4.32it/s][A
Embedding:   1%|          | 14/1310 [00:03<04:57,  4.36it/s][A
Embedding:   1%|          | 15/1310 [00:04<04:56,  4.37it

Embeddings shape: (20948, 768)





# Obtain final data / split into train and test sets

In [8]:
X_full = np.concatenate([numeric_block, embeddings], axis=1)
print("Final feature matrix:", X_full.shape)
# embeddings have dim 768

Final feature matrix: (20948, 809)


In [9]:
# We use the same identical split of the data to train/test all models for fair comparison

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.20, random_state=42, stratify=y)

# Modeling Comparisons

Using the same train and test sets, we will compare the performance and optimization solution quality of various models on the same data splits. We use a manual grid search to try and identify the optimal hyperparamters for each model architecture to compare how the best models measure up to each other.

In [10]:
# helper func for getting metrics
def metric_dict(y_true, y_pred, prob, runtime):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred),
        "ROC_AUC": roc_auc_score(y_true, prob),
        "Runtime_s": runtime
    }

In [11]:
def manual_grid_search(model_fn, param_grid_list, train_fn, X_train, y_train, X_test, y_test, score_key='F1'):
    best_score = -float('inf')
    best_params = None
    best_metrics = None

    for param_dict in param_grid_list:
        print(f"Testing: {param_dict}")
        model = model_fn(**param_dict)
        model, metrics = train_fn(model, X_train, y_train, X_test, y_test)
        score = metrics[score_key]
        print(f"{score_key}: {score:.4f}\\n")

        if score > best_score:
            best_score = score
            best_params = param_dict
            best_metrics = metrics

    print("Best Parameters:", best_params)
    print(f"Best {score_key}: {best_score:.4f}")
    return best_params, best_metrics


# Logistic Regression Models

In [None]:
from sklearn.linear_model import LogisticRegression

def train_sklearn_model(model, X_train, y_train, X_test, y_test):
    start = time.time()
    model.fit(X_train, y_train)
    prob = model.predict_proba(X_test)[:, 1]
    pred = (prob >= 0.5).astype(int)
    return model, metric_dict(y_test, pred, prob, time.time() - start)



These first two cells include the embeddings in model training / hyperparameter tuning and look at the effects of using weights / no weights for BCE.

Standard BCE with embeddings

In [32]:
# Grid of hyperparameters to test
logreg_grid = [
    {'C': 0.01, 'class_weight': None},
    {'C': 0.1,  'class_weight': None},
    {'C': 0.2, 'class_weight': None},
    {'C': 0.3, 'class_weight': None},
    {'C': 0.4, 'class_weight': None},
    {'C': 0.5, 'class_weight': None},
    {'C': 0.6, 'class_weight': None},
    {'C': 0.7, 'class_weight': None},
    {'C': 0.8, 'class_weight': None},
    {'C': 0.9, 'class_weight': None},
    {'C': 1.0,  'class_weight': None},
    {'C': 10.0, 'class_weight': None}
]

logreg_fn = lambda **kwargs: LogisticRegression(max_iter=1000, solver='lbfgs', **kwargs)

manual_grid_search(logreg_fn, logreg_grid, train_sklearn_model, X_train, y_train, X_test, y_test)

Testing: {'C': 0.01, 'class_weight': None}
F1: 0.0483\n
Testing: {'C': 0.1, 'class_weight': None}
F1: 0.0795\n
Testing: {'C': 0.2, 'class_weight': None}
F1: 0.0820\n
Testing: {'C': 0.3, 'class_weight': None}
F1: 0.0816\n
Testing: {'C': 0.4, 'class_weight': None}
F1: 0.0878\n
Testing: {'C': 0.5, 'class_weight': None}
F1: 0.0909\n
Testing: {'C': 0.6, 'class_weight': None}
F1: 0.0911\n
Testing: {'C': 0.7, 'class_weight': None}
F1: 0.0906\n
Testing: {'C': 0.8, 'class_weight': None}
F1: 0.0902\n
Testing: {'C': 0.9, 'class_weight': None}
F1: 0.0931\n
Testing: {'C': 1.0, 'class_weight': None}
F1: 0.0900\n
Testing: {'C': 10.0, 'class_weight': None}
F1: 0.1193\n
✅ Best Parameters: {'C': 10.0, 'class_weight': None}
✅ Best F1: 0.1193


({'C': 10.0, 'class_weight': None},
 {'Accuracy': 0.8625298329355608,
  'Precision': 0.38613861386138615,
  'Recall': 0.0705244122965642,
  'F1': 0.11926605504587157,
  'ROC_AUC': np.float64(0.7034154194806144),
  'Runtime_s': 66.59792733192444})

Weighted BCE with embeddings

In [30]:
# Model 2: weighted BCE and embeddings

pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# Grid of hyperparameters to test
logreg_grid = [
    {'C': 0.01, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.1,  'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.2, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.3, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.4, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.5, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.6, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.7, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.8, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.9, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 1.0,  'class_weight': {0:1, 1:pos_weight}},
    {'C': 10.0, 'class_weight': {0:1, 1:pos_weight}}
]

logreg_fn = lambda **kwargs: LogisticRegression(max_iter=1000, solver='lbfgs', **kwargs)

manual_grid_search(logreg_fn, logreg_grid, train_sklearn_model, X_train, y_train, X_test, y_test)


Testing: {'C': 0.01, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3369\n
Testing: {'C': 0.1, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3400\n
Testing: {'C': 0.2, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3404\n
Testing: {'C': 0.3, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3395\n
Testing: {'C': 0.4, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3438\n
Testing: {'C': 0.5, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3391\n
Testing: {'C': 0.6, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3379\n
Testing: {'C': 0.7, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3386\n
Testing: {'C': 0.8, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3379\n
Testing: {'C': 0.9, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3374\n
Testing: {'C': 1.0, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3400\

({'C': 0.4, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}},
 {'Accuracy': 0.6947494033412888,
  'Precision': 0.23997134670487105,
  'Recall': 0.6057866184448463,
  'F1': 0.3437660338635197,
  'ROC_AUC': np.float64(0.7137288497116983),
  'Runtime_s': 31.037291765213013})

These next two sets of models do not include the word embeddings and simply use only the structured clinical data for model training. Similar to the two cells above, these two cells look at the differences in model performance with/wihout weights in BCE.

Standard BCE without embeddings

In [40]:
# Grid of hyperparameters to test
logreg_grid = [
    {'C': 0.01, 'class_weight': None},
    {'C': 0.1,  'class_weight': None},
    {'C': 0.2, 'class_weight': None},
    {'C': 0.3, 'class_weight': None},
    {'C': 0.4, 'class_weight': None},
    {'C': 0.5, 'class_weight': None},
    {'C': 0.6, 'class_weight': None},
    {'C': 0.7, 'class_weight': None},
    {'C': 0.8, 'class_weight': None},
    {'C': 0.9, 'class_weight': None},
    {'C': 1.0,  'class_weight': None},
    {'C': 10.0, 'class_weight': None}
]

logreg_fn = lambda **kwargs: LogisticRegression(max_iter=1000, solver='lbfgs', **kwargs)

X_train_structured = np.array([arr[:41] for arr in X_train])
X_test_structured = np.array([arr[:41] for arr in X_test])

manual_grid_search(logreg_fn, logreg_grid, train_sklearn_model, X_train_structured, y_train, X_test_structured, y_test)

Testing: {'C': 0.01, 'class_weight': None}
F1: 0.0280\n
Testing: {'C': 0.1, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.2, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.3, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.4, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.5, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.6, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.7, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.8, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 0.9, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 1.0, 'class_weight': None}
F1: 0.0314\n
Testing: {'C': 10.0, 'class_weight': None}
F1: 0.0314\n
✅ Best Parameters: {'C': 0.1, 'class_weight': None}
✅ Best F1: 0.0314


({'C': 0.1, 'class_weight': None},
 {'Accuracy': 0.8673031026252983,
  'Precision': 0.42857142857142855,
  'Recall': 0.0162748643761302,
  'F1': 0.0313588850174216,
  'ROC_AUC': np.float64(0.6882284298258655),
  'Runtime_s': 3.6940088272094727})

Weighted BCE without embeddings

In [41]:
logreg_grid = [
    {'C': 0.01, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.1,  'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.2, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.3, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.4, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.5, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.6, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.7, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.8, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 0.9, 'class_weight': {0:1, 1:pos_weight}},
    {'C': 1.0,  'class_weight': {0:1, 1:pos_weight}},
    {'C': 10.0, 'class_weight': {0:1, 1:pos_weight}}
]

logreg_fn = lambda **kwargs: LogisticRegression(max_iter=1000, solver='lbfgs', **kwargs)

manual_grid_search(logreg_fn, logreg_grid, train_sklearn_model, X_train_structured, y_train, X_test_structured, y_test)

Testing: {'C': 0.01, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3242\n
Testing: {'C': 0.1, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3252\n
Testing: {'C': 0.2, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3257\n
Testing: {'C': 0.3, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3247\n
Testing: {'C': 0.4, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3255\n
Testing: {'C': 0.5, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3255\n
Testing: {'C': 0.6, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3257\n
Testing: {'C': 0.7, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3257\n
Testing: {'C': 0.8, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3255\n
Testing: {'C': 0.9, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3255\n
Testing: {'C': 1.0, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}}
F1: 0.3255\

({'C': 0.2, 'class_weight': {0: 1, 1: np.float64(6.579375848032565)}},
 {'Accuracy': 0.6778042959427207,
  'Precision': 0.22498274672187715,
  'Recall': 0.5895117540687161,
  'F1': 0.3256743256743257,
  'ROC_AUC': np.float64(0.6899880224396535),
  'Runtime_s': 2.336470127105713})

# XGBoost

In [51]:
from xgboost import XGBClassifier

# More granular grid
xgb_grid = [
    {
        'learning_rate': lr,
        'max_depth': md,
        'n_estimators': n,
        'subsample': ss,
        'colsample_bytree': cb,
        'scale_pos_weight': pos_weight,
        'seed': 42
    }
    for lr in [0.005, 0.05, 0.1]
    for md in [5, 7, 9]
    for n in [100, 250, 400]
    for ss in [0.6, 0.8, 1.0]
    for cb in [0.6, 0.8, 1.0]
]

xgb_fn = lambda **kwargs: XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    nthread=4,
    verbosity=0,
    **kwargs
)

def train_sklearn_model(model, X_train, y_train, X_test, y_test):
    import time
    start = time.time()
    model.fit(X_train, y_train)
    prob = model.predict_proba(X_test)[:, 1]
    pred = (prob >= 0.5).astype(int)
    return model, metric_dict(y_test, pred, prob, time.time() - start)


Weighted BCE with embeddings

In [52]:
best_params, best_metrics = manual_grid_search(
    model_fn=xgb_fn,
    param_grid_list=xgb_grid,
    train_fn=train_sklearn_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    score_key='F1'
)

Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.6, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3497\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.8, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3475\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 1.0, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3406\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.6, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3401\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.8, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3445\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 

This cell shows the metrics used to generate the results for the paper. We accidentally forgot to have the metrics display at the end of each hyperparamter sweep, so we need to find them here. You need to change the X_train_eval and X_test_eval and the best_params based on which model's results you want.

In [14]:
X_train_structured = np.array([arr[:41] for arr in X_train])
X_test_structured = np.array([arr[:41] for arr in X_test])

In [16]:
from xgboost import XGBClassifier
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Use this depending on the dataset variant you evaluated:
X_train_eval = X_train  # or X_train
X_test_eval = X_test    # or X_test

# INSERT the best params found here
best_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.6, 'colsample_bytree': 1.0, 'seed': 42}

# Retrain best model
model = XGBClassifier(eval_metric='logloss', use_label_encoder=False, nthread=4, verbosity=0, **best_params)

start = time.time()
model.fit(X_train_eval, y_train)
runtime = time.time() - start

# Predict and evaluate
prob = model.predict_proba(X_test_eval)[:, 1]
pred = (prob >= 0.5).astype(int)

# Compute and print metrics
xgb_eval_metrics = {
    "Accuracy": accuracy_score(y_test, pred),
    "Precision": precision_score(y_test, pred),
    "Recall": recall_score(y_test, pred),
    "F1-score": f1_score(y_test, pred),
    "ROC-AUC": roc_auc_score(y_test, prob),
    "Training Time (s)": runtime
}

print("Evaluation of Best XGBoost Model:")
for k, v in xgb_eval_metrics.items():
    print(f"{k}: {v:.4f}")


Evaluation of Best XGBoost Model:
Accuracy: 0.8635
Precision: 0.3956
Recall: 0.0651
F1-score: 0.1118
ROC-AUC: 0.6897
Training Time (s): 26.8761


Weighted BCE without embeddings

In [53]:
best_params, best_metrics = manual_grid_search(
    model_fn=xgb_fn,
    param_grid_list=xgb_grid,
    train_fn=train_sklearn_model,
    X_train=X_train_structured,
    y_train=y_train,
    X_test=X_test_structured,
    y_test=y_test,
    score_key='F1'
)

Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.6, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3197\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.8, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3155\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 1.0, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3189\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.6, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3161\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.8, 'scale_pos_weight': np.float64(6.579375848032565), 'seed': 42}
F1: 0.3180\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 

In [54]:
# remove "scale_pos_weight" param from grid to use standard BCE
xgb_grid = [
    {
        'learning_rate': lr,
        'max_depth': md,
        'n_estimators': n,
        'subsample': ss,
        'colsample_bytree': cb,
        'seed': 42
    }
    for lr in [0.005, 0.05, 0.1]
    for md in [5, 7, 9]
    for n in [100, 250, 400]
    for ss in [0.6, 0.8, 1.0]
    for cb in [0.6, 0.8, 1.0]
]

Standard BCE with embeddings

In [55]:
best_params, best_metrics = manual_grid_search(
    model_fn=xgb_fn,
    param_grid_list=xgb_grid,
    train_fn=train_sklearn_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    score_key='F1'
)

Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.6, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.8, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 1.0, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.6, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 1.0, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0, 'colsample_bytree': 0.6, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 

Standard BCE without embeddings

In [56]:
best_params, best_metrics = manual_grid_search(
    model_fn=xgb_fn,
    param_grid_list=xgb_grid,
    train_fn=train_sklearn_model,
    X_train=X_train_structured,
    y_train=y_train,
    X_test=X_test_structured,
    y_test=y_test,
    score_key='F1'
)

Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.6, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 0.8, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6, 'colsample_bytree': 1.0, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.6, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 1.0, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 0.005, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0, 'colsample_bytree': 0.6, 'seed': 42}
F1: 0.0000\n
Testing: {'learning_rate': 

# TabNet model

In [15]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split

pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

tabnet_class_weights = [1.0, pos_weight]

def train_tabnet(model, X_train, y_train, X_test, y_test):
    start = time.time()

    from sklearn.model_selection import train_test_split

    y_train_1d = y_train.astype(np.int64)
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train_1d, test_size=0.2, stratify=y_train_1d, random_state=42
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric=['auc'],
        patience=20,
        batch_size=1024,
        virtual_batch_size=1024,
        # set weights to be 1 to ensure that BCE is weighted: https://pypi.org/project/pytorch-tabnet/
        weights = 1
    )

    prob = model.predict_proba(X_test)[:, 1]
    pred = (prob >= 0.5).astype(int)
    return model, metric_dict(y_test, pred, prob, time.time()-start)



Weighted BCE and embeddings

In [48]:
tabnet_grid = [
    {'n_d': d, 'n_a': a, 'n_steps': s, 'gamma': g}
    for d in [16, 32, 64]
    for a in [16, 32, 64]
    for s in [3, 5, 7]
    for g in [1.0, 1.5, 2.0]
]

tabnet_fn = lambda **kwargs: TabNetClassifier(
    n_independent=2, n_shared=2, seed=42, verbose=0, **kwargs
)

manual_grid_search(tabnet_fn, tabnet_grid, train_tabnet,
                   X_train, y_train, X_test, y_test,
                   score_key='F1')

Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 56 with best_epoch = 36 and best_val_0_auc = 0.66103
F1: 0.2818\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 51 with best_epoch = 31 and best_val_0_auc = 0.65418
F1: 0.2774\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 53 with best_epoch = 33 and best_val_0_auc = 0.66457
F1: 0.2959\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 64 with best_epoch = 44 and best_val_0_auc = 0.67524
F1: 0.3060\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 30 with best_epoch = 10 and best_val_0_auc = 0.63031
F1: 0.2639\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 55 with best_epoch = 35 and best_val_0_auc = 0.60747
F1: 0.2527\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps

({'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0},
 {'Accuracy': 0.6761336515513127,
  'Precision': 0.22044506258692628,
  'Recall': 0.5732368896925859,
  'F1': 0.3184329482672024,
  'ROC_AUC': np.float64(0.6800977098447192),
  'Runtime_s': 0})

Weighted BCE, no embeddings

In [16]:
tabnet_grid = [
    {'n_d': d, 'n_a': a, 'n_steps': s, 'gamma': g}
    for d in [16, 32, 64]
    for a in [16, 32, 64]
    for s in [3, 5, 7]
    for g in [1.0, 1.5, 2.0]
]

tabnet_fn = lambda **kwargs: TabNetClassifier(
    n_independent=2, n_shared=2, seed=42, verbose=0, **kwargs
)

X_train_structured = np.array([arr[:41] for arr in X_train])
X_test_structured = np.array([arr[:41] for arr in X_test])

manual_grid_search(tabnet_fn, tabnet_grid, train_tabnet,
                   X_train_structured, y_train, X_test_structured, y_test,
                   score_key='F1')

Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 23 with best_epoch = 3 and best_val_0_auc = 0.66837




F1: 0.2876\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 55 with best_epoch = 35 and best_val_0_auc = 0.66998




F1: 0.3150\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 30 with best_epoch = 10 and best_val_0_auc = 0.66163




F1: 0.2884\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.67923




F1: 0.3019\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 25 with best_epoch = 5 and best_val_0_auc = 0.64566




F1: 0.2837\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 71 with best_epoch = 51 and best_val_0_auc = 0.68462




F1: 0.3050\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_auc = 0.68541




F1: 0.3023\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 41 with best_epoch = 21 and best_val_0_auc = 0.6622




F1: 0.2968\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.67342




F1: 0.2740\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_auc = 0.65625




F1: 0.2975\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.6679




F1: 0.2740\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.66148




F1: 0.2975\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 65 with best_epoch = 45 and best_val_0_auc = 0.67663




F1: 0.3149\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.66547




F1: 0.3057\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 66 with best_epoch = 46 and best_val_0_auc = 0.66934




F1: 0.2953\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 50 with best_epoch = 30 and best_val_0_auc = 0.67862




F1: 0.3051\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 27 with best_epoch = 7 and best_val_0_auc = 0.65478




F1: 0.2846\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 29 with best_epoch = 9 and best_val_0_auc = 0.65988




F1: 0.2853\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 22 with best_epoch = 2 and best_val_0_auc = 0.64815




F1: 0.2763\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 25 with best_epoch = 5 and best_val_0_auc = 0.66246




F1: 0.2994\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.67061




F1: 0.2884\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 28 with best_epoch = 8 and best_val_0_auc = 0.66886




F1: 0.3076\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 69 with best_epoch = 49 and best_val_0_auc = 0.67587




F1: 0.3079\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.65755




F1: 0.2798\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 43 with best_epoch = 23 and best_val_0_auc = 0.67429




F1: 0.3029\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 44 with best_epoch = 24 and best_val_0_auc = 0.68315




F1: 0.2898\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 88 with best_epoch = 68 and best_val_0_auc = 0.6778




F1: 0.2957\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.67236




F1: 0.2945\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_auc = 0.66264




F1: 0.3131\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.66252




F1: 0.3009\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 26 with best_epoch = 6 and best_val_0_auc = 0.6576




F1: 0.2995\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.66553




F1: 0.2892\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 43 with best_epoch = 23 and best_val_0_auc = 0.6691




F1: 0.2980\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.68331




F1: 0.3157\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 53 with best_epoch = 33 and best_val_0_auc = 0.66076




F1: 0.2882\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 94 with best_epoch = 74 and best_val_0_auc = 0.68313




F1: 0.3002\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 26 with best_epoch = 6 and best_val_0_auc = 0.66078




F1: 0.2936\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.66571




F1: 0.2941\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.66773




F1: 0.2887\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.67305




F1: 0.3101\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}
Stop training because you reached max_epochs = 100 with best_epoch = 92 and best_val_0_auc = 0.6885




F1: 0.2963\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.67604




F1: 0.3057\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 49 with best_epoch = 29 and best_val_0_auc = 0.67691




F1: 0.3120\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 38 with best_epoch = 18 and best_val_0_auc = 0.66282




F1: 0.2981\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 28 with best_epoch = 8 and best_val_0_auc = 0.66722




F1: 0.2793\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.6701




F1: 0.3016\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_auc = 0.66652




F1: 0.2731\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 27 with best_epoch = 7 and best_val_0_auc = 0.66389




F1: 0.2910\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 91 with best_epoch = 71 and best_val_0_auc = 0.67666




F1: 0.3010\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.6677




F1: 0.2745\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.68346




F1: 0.2990\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 31 with best_epoch = 11 and best_val_0_auc = 0.66728




F1: 0.2987\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}
Stop training because you reached max_epochs = 100 with best_epoch = 98 and best_val_0_auc = 0.67862




F1: 0.3026\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 48 with best_epoch = 28 and best_val_0_auc = 0.67695




F1: 0.2937\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.65799




F1: 0.2837\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 40 with best_epoch = 20 and best_val_0_auc = 0.64844




F1: 0.2936\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 40 with best_epoch = 20 and best_val_0_auc = 0.65927




F1: 0.3009\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 40 with best_epoch = 20 and best_val_0_auc = 0.66293




F1: 0.3058\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 51 with best_epoch = 31 and best_val_0_auc = 0.68709




F1: 0.3046\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 86 with best_epoch = 66 and best_val_0_auc = 0.6743




F1: 0.2915\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.66529




F1: 0.2969\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.66849




F1: 0.2910\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_auc = 0.66137




F1: 0.2832\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 31 with best_epoch = 11 and best_val_0_auc = 0.66506




F1: 0.3020\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 35 with best_epoch = 15 and best_val_0_auc = 0.64167




F1: 0.2914\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.6656




F1: 0.3086\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 53 with best_epoch = 33 and best_val_0_auc = 0.67638




F1: 0.3159\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 52 with best_epoch = 32 and best_val_0_auc = 0.66806




F1: 0.3212\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 27 with best_epoch = 7 and best_val_0_auc = 0.64923




F1: 0.2803\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 27 with best_epoch = 7 and best_val_0_auc = 0.66115




F1: 0.2947\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 53 with best_epoch = 33 and best_val_0_auc = 0.66523




F1: 0.3025\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.66648




F1: 0.2801\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.66491




F1: 0.2738\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.66845




F1: 0.2918\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_auc = 0.66627




F1: 0.2960\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 58 with best_epoch = 38 and best_val_0_auc = 0.66983




F1: 0.3058\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.65108




F1: 0.2856\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 89 with best_epoch = 69 and best_val_0_auc = 0.67627




F1: 0.2992\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 40 with best_epoch = 20 and best_val_0_auc = 0.67913




F1: 0.2903\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.66485




F1: 0.2942\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 40 with best_epoch = 20 and best_val_0_auc = 0.66531




F1: 0.2997\n
Best Parameters: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}
Best F1: 0.3212


({'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5},
 {'Accuracy': 0.6439140811455847,
  'Precision': 0.21458966565349544,
  'Recall': 0.6383363471971067,
  'F1': 0.32120109190172885,
  'ROC_AUC': np.float64(0.6745504437265973),
  'Runtime_s': 33.384838581085205})

In [17]:
def train_tabnet(model, X_train, y_train, X_test, y_test):
    start = time.time()

    from sklearn.model_selection import train_test_split

    y_train_1d = y_train.astype(np.int64)
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train_1d, test_size=0.2, stratify=y_train_1d, random_state=42
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric=['auc'],
        patience=20,
        batch_size=1024,
        virtual_batch_size=1024,
    )

    prob = model.predict_proba(X_test)[:, 1]
    pred = (prob >= 0.5).astype(int)
    return model, metric_dict(y_test, pred, prob, time.time()-start)

Standard BCE, embeddings

In [18]:
manual_grid_search(tabnet_fn, tabnet_grid, train_tabnet,
                   X_train, y_train, X_test, y_test,
                   score_key='F1')

Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 64 with best_epoch = 44 and best_val_0_auc = 0.67357




F1: 0.0436\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 63 with best_epoch = 43 and best_val_0_auc = 0.66189




F1: 0.0451\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 49 with best_epoch = 29 and best_val_0_auc = 0.64159




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 73 with best_epoch = 53 and best_val_0_auc = 0.64503




F1: 0.0106\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 61 with best_epoch = 41 and best_val_0_auc = 0.61694




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_auc = 0.63679




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.65104




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 25 with best_epoch = 5 and best_val_0_auc = 0.61076




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.5952




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 52 with best_epoch = 32 and best_val_0_auc = 0.63671




F1: 0.0211\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 66 with best_epoch = 46 and best_val_0_auc = 0.66413




F1: 0.0670\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_0_auc = 0.67095




F1: 0.0404\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 99 with best_epoch = 79 and best_val_0_auc = 0.66882




F1: 0.0344\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 44 with best_epoch = 24 and best_val_0_auc = 0.60209




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 41 with best_epoch = 21 and best_val_0_auc = 0.59918




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 27 with best_epoch = 7 and best_val_0_auc = 0.6292




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.61237




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_auc = 0.58591




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 51 with best_epoch = 31 and best_val_0_auc = 0.66433




F1: 0.0106\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 77 with best_epoch = 57 and best_val_0_auc = 0.66281




F1: 0.1184\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 64 with best_epoch = 44 and best_val_0_auc = 0.66917




F1: 0.0661\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 54 with best_epoch = 34 and best_val_0_auc = 0.66528




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 64 with best_epoch = 44 and best_val_0_auc = 0.63905




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_auc = 0.58208




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_auc = 0.62015




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 63 with best_epoch = 43 and best_val_0_auc = 0.61998




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 30 with best_epoch = 10 and best_val_0_auc = 0.61788




F1: 0.0108\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 70 with best_epoch = 50 and best_val_0_auc = 0.67395




F1: 0.0409\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 55 with best_epoch = 35 and best_val_0_auc = 0.67829




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 64 with best_epoch = 44 and best_val_0_auc = 0.66191




F1: 0.1392\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 48 with best_epoch = 28 and best_val_0_auc = 0.62943




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.65117




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 50 with best_epoch = 30 and best_val_0_auc = 0.62231




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 71 with best_epoch = 51 and best_val_0_auc = 0.67563




F1: 0.0143\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 78 with best_epoch = 58 and best_val_0_auc = 0.6334




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 48 with best_epoch = 28 and best_val_0_auc = 0.64194




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 44 with best_epoch = 24 and best_val_0_auc = 0.64711




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_auc = 0.66053




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 88 with best_epoch = 68 and best_val_0_auc = 0.65815




F1: 0.1023\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 52 with best_epoch = 32 and best_val_0_auc = 0.66045




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 40 with best_epoch = 20 and best_val_0_auc = 0.6204




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.63462




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 77 with best_epoch = 57 and best_val_0_auc = 0.66417




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 58 with best_epoch = 38 and best_val_0_auc = 0.60335




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 23 with best_epoch = 3 and best_val_0_auc = 0.60663




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.65758




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 51 with best_epoch = 31 and best_val_0_auc = 0.65978




F1: 0.0071\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.67832




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.66909




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 68 with best_epoch = 48 and best_val_0_auc = 0.63108




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.62145




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 67 with best_epoch = 47 and best_val_0_auc = 0.66342




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.62979




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.65345




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.68084




F1: 0.0036\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_0_auc = 0.67154




F1: 0.0374\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 53 with best_epoch = 33 and best_val_0_auc = 0.65765




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 53 with best_epoch = 33 and best_val_0_auc = 0.67116




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.63206




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.63447




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 58 with best_epoch = 38 and best_val_0_auc = 0.64271




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 68 with best_epoch = 48 and best_val_0_auc = 0.61904




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.60681




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 51 with best_epoch = 31 and best_val_0_auc = 0.67222




F1: 0.0498\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_0_auc = 0.65701




F1: 0.0393\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 52 with best_epoch = 32 and best_val_0_auc = 0.65092




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 49 with best_epoch = 29 and best_val_0_auc = 0.66725




F1: 0.0036\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 63 with best_epoch = 43 and best_val_0_auc = 0.64169




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 52 with best_epoch = 32 and best_val_0_auc = 0.63395




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.6367




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 62 with best_epoch = 42 and best_val_0_auc = 0.6213




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_auc = 0.60822




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 55 with best_epoch = 35 and best_val_0_auc = 0.62523




F1: 0.0426\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 55 with best_epoch = 35 and best_val_0_auc = 0.67145




F1: 0.0107\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_0_auc = 0.66099




F1: 0.0381\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 61 with best_epoch = 41 and best_val_0_auc = 0.6711




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 40 with best_epoch = 20 and best_val_0_auc = 0.63569




F1: 0.0072\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 62 with best_epoch = 42 and best_val_0_auc = 0.61786




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 21 with best_epoch = 1 and best_val_0_auc = 0.60385




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 60 with best_epoch = 40 and best_val_0_auc = 0.64198




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 31 with best_epoch = 11 and best_val_0_auc = 0.58342




F1: 0.0000\n
Best Parameters: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}
Best F1: 0.1392


({'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0},
 {'Accuracy': 0.8553699284009546,
  'Precision': 0.32450331125827814,
  'Recall': 0.08860759493670886,
  'F1': 0.13920454545454544,
  'ROC_AUC': np.float64(0.6259247308032125),
  'Runtime_s': 32.45985412597656})

Standard BCE, no embeddings

In [19]:
manual_grid_search(tabnet_fn, tabnet_grid, train_tabnet,
                   X_train_structured, y_train, X_test_structured, y_test,
                   score_key='F1')

Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.67089




F1: 0.0071\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 25 with best_epoch = 5 and best_val_0_auc = 0.6524




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.66431




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 64 with best_epoch = 44 and best_val_0_auc = 0.68321




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 55 with best_epoch = 35 and best_val_0_auc = 0.67238




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_auc = 0.65866




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 67 with best_epoch = 47 and best_val_0_auc = 0.68437




F1: 0.0072\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 38 with best_epoch = 18 and best_val_0_auc = 0.66064




F1: 0.0106\n
Testing: {'n_d': 16, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.66478




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 62 with best_epoch = 42 and best_val_0_auc = 0.66798




F1: 0.0999\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.66255




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.66867




F1: 0.0072\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 43 with best_epoch = 23 and best_val_0_auc = 0.6694




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}
Stop training because you reached max_epochs = 100 with best_epoch = 81 and best_val_0_auc = 0.67876




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 61 with best_epoch = 41 and best_val_0_auc = 0.65792




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 44 with best_epoch = 24 and best_val_0_auc = 0.67561




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 43 with best_epoch = 23 and best_val_0_auc = 0.66875




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 48 with best_epoch = 28 and best_val_0_auc = 0.66577




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.65851




F1: 0.0280\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 44 with best_epoch = 24 and best_val_0_auc = 0.65454




F1: 0.0276\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 35 with best_epoch = 15 and best_val_0_auc = 0.66334




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.66505




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 35 with best_epoch = 15 and best_val_0_auc = 0.66724




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 49 with best_epoch = 29 and best_val_0_auc = 0.6538




F1: 0.0000\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 43 with best_epoch = 23 and best_val_0_auc = 0.66499




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_auc = 0.65898




F1: 0.0036\n
Testing: {'n_d': 16, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 77 with best_epoch = 57 and best_val_0_auc = 0.66923




F1: 0.0108\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 49 with best_epoch = 29 and best_val_0_auc = 0.68143




F1: 0.0475\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.65994




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_auc = 0.67628




F1: 0.0606\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.66656




F1: 0.0072\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 87 with best_epoch = 67 and best_val_0_auc = 0.66919




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.64306




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.67423




F1: 0.0105\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 62 with best_epoch = 42 and best_val_0_auc = 0.68745




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 35 with best_epoch = 15 and best_val_0_auc = 0.66231




F1: 0.0072\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 31 with best_epoch = 11 and best_val_0_auc = 0.67033




F1: 0.0107\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.67329




F1: 0.0401\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 63 with best_epoch = 43 and best_val_0_auc = 0.67413




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 29 with best_epoch = 9 and best_val_0_auc = 0.66106




F1: 0.0108\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 46 with best_epoch = 26 and best_val_0_auc = 0.66399




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 35 with best_epoch = 15 and best_val_0_auc = 0.65347




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 66 with best_epoch = 46 and best_val_0_auc = 0.68249




F1: 0.0211\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.66545




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 87 with best_epoch = 67 and best_val_0_auc = 0.678




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_auc = 0.6703




F1: 0.0176\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 32 with best_epoch = 12 and best_val_0_auc = 0.67447




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 38 with best_epoch = 18 and best_val_0_auc = 0.65984




F1: 0.0072\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 26 with best_epoch = 6 and best_val_0_auc = 0.67017




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.66897




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.66864




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 68 with best_epoch = 48 and best_val_0_auc = 0.68994




F1: 0.0036\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 44 with best_epoch = 24 and best_val_0_auc = 0.66783




F1: 0.0000\n
Testing: {'n_d': 32, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_auc = 0.67222




F1: 0.0141\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.64951




F1: 0.0594\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.66153




F1: 0.0247\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.65899




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 69 with best_epoch = 49 and best_val_0_auc = 0.68111




F1: 0.0142\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.68115




F1: 0.0108\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 59 with best_epoch = 39 and best_val_0_auc = 0.65863




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.68441




F1: 0.0143\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 42 with best_epoch = 22 and best_val_0_auc = 0.66652




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 16, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 35 with best_epoch = 15 and best_val_0_auc = 0.66257




F1: 0.0036\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.65963




F1: 0.0106\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.66494




F1: 0.0469\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.68794




F1: 0.0311\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 43 with best_epoch = 23 and best_val_0_auc = 0.6789




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 27 with best_epoch = 7 and best_val_0_auc = 0.65443




F1: 0.0141\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 70 with best_epoch = 50 and best_val_0_auc = 0.67435




F1: 0.0106\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_0_auc = 0.67182




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_auc = 0.65925




F1: 0.0036\n
Testing: {'n_d': 64, 'n_a': 32, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 38 with best_epoch = 18 and best_val_0_auc = 0.65544




F1: 0.0345\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 1.0}

Early stopping occurred at epoch 28 with best_epoch = 8 and best_val_0_auc = 0.65753




F1: 0.0243\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 1.5}

Early stopping occurred at epoch 41 with best_epoch = 21 and best_val_0_auc = 0.66003




F1: 0.0175\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 3, 'gamma': 2.0}

Early stopping occurred at epoch 49 with best_epoch = 29 and best_val_0_auc = 0.66585




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 1.0}

Early stopping occurred at epoch 76 with best_epoch = 56 and best_val_0_auc = 0.68049




F1: 0.0036\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 1.5}

Early stopping occurred at epoch 63 with best_epoch = 43 and best_val_0_auc = 0.6606




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 5, 'gamma': 2.0}

Early stopping occurred at epoch 74 with best_epoch = 54 and best_val_0_auc = 0.66601




F1: 0.0036\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 1.0}

Early stopping occurred at epoch 64 with best_epoch = 44 and best_val_0_auc = 0.68492




F1: 0.0143\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 1.5}

Early stopping occurred at epoch 93 with best_epoch = 73 and best_val_0_auc = 0.69421




F1: 0.0000\n
Testing: {'n_d': 64, 'n_a': 64, 'n_steps': 7, 'gamma': 2.0}

Early stopping occurred at epoch 76 with best_epoch = 56 and best_val_0_auc = 0.66675




F1: 0.0000\n
Best Parameters: {'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0}
Best F1: 0.0999


({'n_d': 16, 'n_a': 32, 'n_steps': 3, 'gamma': 1.0},
 {'Accuracy': 0.8494033412887828,
  'Precision': 0.23648648648648649,
  'Recall': 0.06329113924050633,
  'F1': 0.09985734664764621,
  'ROC_AUC': np.float64(0.6458905631839925),
  'Runtime_s': 27.727808713912964})

# FT Transformer

In [24]:
def train_and_evaluate_torch_model(model, train_dl, X_test, y_test, criterion=None, optimizer=None, n_epochs=30, lr=3e-4, verbose=True):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    if criterion is None:
        criterion = nn.BCEWithLogitsLoss()
    if optimizer is None:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)


    start_time = time.time()
    for ep in range(n_epochs):
        model.train()
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device).float().unsqueeze(1)
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
        if verbose:
            print(f"Epoch {ep+1}/{n_epochs}, Loss: {loss.item():.4f}")

    model.eval()
    with torch.no_grad():
        X_te_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
        logits = model(X_te_tensor)
        prob = torch.sigmoid(logits).cpu().numpy().ravel()

    runtime = time.time() - start_time
    preds = (prob >= 0.5).astype(int)
    metrics = metric_dict(y_test, preds, prob, runtime)
    return model, metrics

def train_ft(model, X_train, y_train, X_test, y_test):
    import torch
    from torch.utils.data import TensorDataset, DataLoader

    pos_weight = torch.tensor([(y_train == 0).sum() / (y_train == 1).sum()], dtype=torch.float32)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to('cuda' if torch.cuda.is_available() else 'cpu'))

    X_tr = torch.tensor(X_train, dtype=torch.float32)
    y_tr = torch.tensor(y_train, dtype=torch.float32)
    train_dl = DataLoader(TensorDataset(X_tr, y_tr), batch_size=256, shuffle=True)

    return train_and_evaluate_torch_model(model, train_dl, X_test, y_test, criterion=criterion, n_epochs=30)

import torch
import torch.nn as nn

class FeatureTokenizer(nn.Module):
    def __init__(self, n_feat, d):
        super().__init__()
        self.w = nn.Parameter(torch.randn(n_feat, d) * 0.02)
        self.b = nn.Parameter(torch.zeros(n_feat, d))
        self.cls = nn.Parameter(torch.zeros(1, 1, d))

    def forward(self, x):
        B = x.size(0)
        tok = x.unsqueeze(-1) * self.w + self.b
        cls = self.cls.expand(B, -1, -1)
        return torch.cat([cls, tok], 1)

def make_encoder(d, heads, ff, layers):
    enc = nn.TransformerEncoderLayer(d_model=d, nhead=heads, dim_feedforward=ff,
                                     dropout=0.1, batch_first=True, activation='gelu')
    return nn.TransformerEncoder(enc, num_layers=layers)

class FTTransformer(nn.Module):
    def __init__(self, n_feat, d=32, heads=4, layers=2, ff=64):
        super().__init__()
        self.tok = FeatureTokenizer(n_feat, d)
        self.enc = make_encoder(d, heads, ff, layers)
        self.head = nn.Sequential(nn.LayerNorm(d), nn.Linear(d, 1))

    def forward(self, x):
        h = self.enc(self.tok(x))
        return self.head(h[:, 0, :])



Weighted BCE with embeddings

In [26]:

ft_grid = [
    {'d': d, 'heads': h, 'layers': l, 'ff': 64}
    for d in [32, 64]
    for h in [2, 4]
    for l in [1, 2]
]


ft_fn = lambda d, heads, layers, ff: FTTransformer(
    n_feat=X_train.shape[1], d=d, heads=heads, layers=layers, ff=ff
)

manual_grid_search(ft_fn, ft_grid, train_ft,
                   X_train, y_train, X_test, y_test,
                   score_key='F1')


Testing: {'d': 32, 'heads': 2, 'layers': 1, 'ff': 64}
Epoch 1/30, Loss: 1.3392
Epoch 2/30, Loss: 1.1897
Epoch 3/30, Loss: 1.1296
Epoch 4/30, Loss: 1.3980
Epoch 5/30, Loss: 1.2399
Epoch 6/30, Loss: 1.1571
Epoch 7/30, Loss: 1.0145
Epoch 8/30, Loss: 1.2721
Epoch 9/30, Loss: 0.9868
Epoch 10/30, Loss: 0.9622
Epoch 11/30, Loss: 1.0901
Epoch 12/30, Loss: 0.8465
Epoch 13/30, Loss: 1.0054
Epoch 14/30, Loss: 1.1165
Epoch 15/30, Loss: 1.1860
Epoch 16/30, Loss: 1.1128
Epoch 17/30, Loss: 1.1432
Epoch 18/30, Loss: 0.9380
Epoch 19/30, Loss: 1.1214
Epoch 20/30, Loss: 1.3903
Epoch 21/30, Loss: 1.2046
Epoch 22/30, Loss: 0.8796
Epoch 23/30, Loss: 1.0171
Epoch 24/30, Loss: 0.9299
Epoch 25/30, Loss: 0.8286
Epoch 26/30, Loss: 0.9199
Epoch 27/30, Loss: 1.0266
Epoch 28/30, Loss: 0.9659
Epoch 29/30, Loss: 1.0466
Epoch 30/30, Loss: 1.2263
F1: 0.3183\n
Testing: {'d': 32, 'heads': 2, 'layers': 2, 'ff': 64}
Epoch 1/30, Loss: 1.4527
Epoch 2/30, Loss: 1.1950
Epoch 3/30, Loss: 1.1985
Epoch 4/30, Loss: 1.0629
Epoch 5/

({'d': 64, 'heads': 2, 'layers': 2, 'ff': 64},
 {'Accuracy': 0.6381861575178998,
  'Precision': 0.2198952879581152,
  'Recall': 0.6835443037974683,
  'F1': 0.33274647887323944,
  'ROC_AUC': np.float64(0.7084873619087727),
  'Runtime_s': 321.2794826030731})

Weighted BCE with no embeddings

In [29]:
ft_fn = lambda d, heads, layers, ff: FTTransformer(
    n_feat=X_train_structured.shape[1], d=d, heads=heads, layers=layers, ff=ff
)

manual_grid_search(ft_fn, ft_grid, train_ft,
                   X_train_structured, y_train, X_test_structured, y_test,
                   score_key='F1')

Testing: {'d': 32, 'heads': 2, 'layers': 1, 'ff': 64}
Epoch 1/30, Loss: 1.1991
Epoch 2/30, Loss: 0.8340
Epoch 3/30, Loss: 1.3992
Epoch 4/30, Loss: 0.9532
Epoch 5/30, Loss: 1.1528
Epoch 6/30, Loss: 0.9582
Epoch 7/30, Loss: 1.6137
Epoch 8/30, Loss: 0.9077
Epoch 9/30, Loss: 1.1395
Epoch 10/30, Loss: 1.2135
Epoch 11/30, Loss: 0.8906
Epoch 12/30, Loss: 1.3362
Epoch 13/30, Loss: 1.1875
Epoch 14/30, Loss: 1.1487
Epoch 15/30, Loss: 1.1922
Epoch 16/30, Loss: 1.2288
Epoch 17/30, Loss: 0.9584
Epoch 18/30, Loss: 1.0934
Epoch 19/30, Loss: 0.9449
Epoch 20/30, Loss: 1.0557
Epoch 21/30, Loss: 1.1899
Epoch 22/30, Loss: 1.0070
Epoch 23/30, Loss: 1.0888
Epoch 24/30, Loss: 0.9368
Epoch 25/30, Loss: 1.0466
Epoch 26/30, Loss: 1.1823
Epoch 27/30, Loss: 1.1365
Epoch 28/30, Loss: 1.0321
Epoch 29/30, Loss: 0.9935
Epoch 30/30, Loss: 1.0128
F1: 0.3181\n
Testing: {'d': 32, 'heads': 2, 'layers': 2, 'ff': 64}
Epoch 1/30, Loss: 1.1831
Epoch 2/30, Loss: 1.3080
Epoch 3/30, Loss: 0.9642
Epoch 4/30, Loss: 1.3410
Epoch 5/

({'d': 64, 'heads': 4, 'layers': 2, 'ff': 64},
 {'Accuracy': 0.6417661097852029,
  'Precision': 0.21785714285714286,
  'Recall': 0.6618444846292948,
  'F1': 0.3278101209135692,
  'ROC_AUC': np.float64(0.6977637412548645),
  'Runtime_s': 17.673937797546387})

In [32]:

ft_fn = lambda d, heads, layers, ff: FTTransformer(
    n_feat=X_train.shape[1], d=d, heads=heads, layers=layers, ff=ff
)

def train_ft(model, X_train, y_train, X_test, y_test):
    import torch
    from torch.utils.data import TensorDataset, DataLoader
    criterion = nn.BCEWithLogitsLoss()

    X_tr = torch.tensor(X_train, dtype=torch.float32)
    y_tr = torch.tensor(y_train, dtype=torch.float32)
    train_dl = DataLoader(TensorDataset(X_tr, y_tr), batch_size=256, shuffle=True)

    return train_and_evaluate_torch_model(model, train_dl, X_test, y_test, criterion=criterion, n_epochs=30)


Standard BCE with embeddings

In [33]:
manual_grid_search(ft_fn, ft_grid, train_ft,
                   X_train, y_train, X_test, y_test,
                   score_key='F1')

Testing: {'d': 32, 'heads': 2, 'layers': 1, 'ff': 64}
Epoch 1/30, Loss: 0.2505
Epoch 2/30, Loss: 0.4437
Epoch 3/30, Loss: 0.3182
Epoch 4/30, Loss: 0.4402
Epoch 5/30, Loss: 0.2820
Epoch 6/30, Loss: 0.5107
Epoch 7/30, Loss: 0.3629
Epoch 8/30, Loss: 0.2635
Epoch 9/30, Loss: 0.4359
Epoch 10/30, Loss: 0.2801
Epoch 11/30, Loss: 0.4232
Epoch 12/30, Loss: 0.3813
Epoch 13/30, Loss: 0.3886
Epoch 14/30, Loss: 0.2815
Epoch 15/30, Loss: 0.4969
Epoch 16/30, Loss: 0.3769
Epoch 17/30, Loss: 0.4268
Epoch 18/30, Loss: 0.3936
Epoch 19/30, Loss: 0.5024
Epoch 20/30, Loss: 0.3773
Epoch 21/30, Loss: 0.4361
Epoch 22/30, Loss: 0.3984
Epoch 23/30, Loss: 0.3459
Epoch 24/30, Loss: 0.4358
Epoch 25/30, Loss: 0.3728
Epoch 26/30, Loss: 0.3392
Epoch 27/30, Loss: 0.3108
Epoch 28/30, Loss: 0.3890
Epoch 29/30, Loss: 0.3041
Epoch 30/30, Loss: 0.3159
F1: 0.0000\n
Testing: {'d': 32, 'heads': 2, 'layers': 2, 'ff': 64}
Epoch 1/30, Loss: 0.3987
Epoch 2/30, Loss: 0.4532
Epoch 3/30, Loss: 0.3642
Epoch 4/30, Loss: 0.2930
Epoch 5/

({'d': 64, 'heads': 2, 'layers': 2, 'ff': 64},
 {'Accuracy': 0.8682577565632458,
  'Precision': 0.5238095238095238,
  'Recall': 0.019891500904159132,
  'F1': 0.03832752613240418,
  'ROC_AUC': np.float64(0.7126638462138927),
  'Runtime_s': 320.8132083415985})

Standard BCE without embeddings

In [34]:
ft_fn = lambda d, heads, layers, ff: FTTransformer(
    n_feat=X_train_structured.shape[1], d=d, heads=heads, layers=layers, ff=ff
)
manual_grid_search(ft_fn, ft_grid, train_ft,
                   X_train_structured, y_train, X_test_structured, y_test,
                   score_key='F1')

Testing: {'d': 32, 'heads': 2, 'layers': 1, 'ff': 64}
Epoch 1/30, Loss: 0.4617
Epoch 2/30, Loss: 0.4016
Epoch 3/30, Loss: 0.4108
Epoch 4/30, Loss: 0.3859
Epoch 5/30, Loss: 0.4300
Epoch 6/30, Loss: 0.3646
Epoch 7/30, Loss: 0.4258
Epoch 8/30, Loss: 0.4068
Epoch 9/30, Loss: 0.4197
Epoch 10/30, Loss: 0.3299
Epoch 11/30, Loss: 0.3811
Epoch 12/30, Loss: 0.2757
Epoch 13/30, Loss: 0.4052
Epoch 14/30, Loss: 0.3530
Epoch 15/30, Loss: 0.3823
Epoch 16/30, Loss: 0.4156
Epoch 17/30, Loss: 0.4342
Epoch 18/30, Loss: 0.3895
Epoch 19/30, Loss: 0.3768
Epoch 20/30, Loss: 0.3817
Epoch 21/30, Loss: 0.3236
Epoch 22/30, Loss: 0.2994
Epoch 23/30, Loss: 0.3456
Epoch 24/30, Loss: 0.3095
Epoch 25/30, Loss: 0.3767
Epoch 26/30, Loss: 0.3536
Epoch 27/30, Loss: 0.3182
Epoch 28/30, Loss: 0.3765
Epoch 29/30, Loss: 0.3512
Epoch 30/30, Loss: 0.3458
F1: 0.0108\n
Testing: {'d': 32, 'heads': 2, 'layers': 2, 'ff': 64}
Epoch 1/30, Loss: 0.3707
Epoch 2/30, Loss: 0.3697
Epoch 3/30, Loss: 0.4123
Epoch 4/30, Loss: 0.3798
Epoch 5/

({'d': 64, 'heads': 4, 'layers': 2, 'ff': 64},
 {'Accuracy': 0.8673031026252983,
  'Precision': 0.4716981132075472,
  'Recall': 0.045207956600361664,
  'F1': 0.08250825082508251,
  'ROC_AUC': np.float64(0.6882771554760918),
  'Runtime_s': 17.822336435317993})

# Multi-layer perceptron

In [41]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
import numpy as np
import time

class SimpleMLP(nn.Module):
    def __init__(self, input_dim, hidden_layers=(64, 32)):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h in hidden_layers:
            layers.extend([nn.Linear(prev_dim, h), nn.ReLU()])
            prev_dim = h
        layers.append(nn.Linear(prev_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


In [42]:
def train_torch_mlp(model, X_train, y_train, X_test, y_test, sample_weights, n_epochs=30, lr=1e-3):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    weights_tensor = torch.tensor(sample_weights, dtype=torch.float32)

    train_ds = TensorDataset(X_train_tensor, y_train_tensor, weights_tensor)
    train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    def weighted_bce(logits, targets, weights):
        loss = nn.BCEWithLogitsLoss(reduction='none')(logits, targets)
        return (loss * weights).mean()

    start = time.time()
    for epoch in range(n_epochs):
        model.train()
        for xb, yb, wb in train_dl:
            xb, yb, wb = xb.to(device), yb.to(device), wb.to(device).unsqueeze(1)
            optimizer.zero_grad()
            logits = model(xb)
            loss = weighted_bce(logits, yb, wb)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        prob = torch.sigmoid(model(torch.tensor(X_test, dtype=torch.float32).to(device))).cpu().numpy().ravel()
    runtime = time.time() - start
    preds = (prob >= 0.5).astype(int)
    return model, metric_dict(y_test, preds, prob, runtime)


In [43]:
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
class_weight_dict = {0: 1.0, 1: pos_weight}
sample_weights = np.array([class_weight_dict[y] for y in y_train])

model = SimpleMLP(input_dim=X_train.shape[1])
model, metrics = train_torch_mlp(model, X_train, y_train, X_test, y_test, sample_weights)
print(metrics)


{'Accuracy': 0.681145584725537, 'Precision': 0.23742454728370221, 'Recall': 0.6401446654611211, 'F1': 0.34637964774951074, 'ROC_AUC': np.float64(0.7173253496189703), 'Runtime_s': 10.50873351097107}


In [44]:
mlp_grid = [
    {
        'hidden_layer_sizes': h,
        'lr': lr,
        'alpha': alpha  # L2 regularization (weight decay)
    }
    for h in [(64, 32), (128, 64), (128, 64, 32)]
    for alpha in [0.0001, 0.001, 0.01]
    for lr in [0.0005, 0.001, 0.005]
]


In [45]:
mlp_fn = lambda hidden_layer_sizes, lr, alpha: (
    SimpleMLP(X_train.shape[1], hidden_layer_sizes),
    lr,
    alpha
)


In [46]:
def train_pytorch_mlp_grid(model_tuple, X_train, y_train, X_test, y_test):
    model, lr, alpha = model_tuple

    pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    class_weight_dict = {0: 1.0, 1: pos_weight}
    sample_weights = np.array([class_weight_dict[y] for y in y_train])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    weights_tensor = torch.tensor(sample_weights, dtype=torch.float32)

    train_ds = TensorDataset(X_train_tensor, y_train_tensor, weights_tensor)
    train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=alpha)

    def weighted_bce(logits, targets, weights):
        loss = nn.BCEWithLogitsLoss(reduction='none')(logits, targets)
        return (loss * weights).mean()

    start = time.time()
    for epoch in range(30):
        model.train()
        for xb, yb, wb in train_dl:
            xb, yb, wb = xb.to(device), yb.to(device), wb.to(device).unsqueeze(1)
            optimizer.zero_grad()
            logits = model(xb)
            loss = weighted_bce(logits, yb, wb)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()

    runtime = time.time() - start
    preds = (prob >= 0.5).astype(int)
    return model, metric_dict(y_test, preds, prob, runtime)


Weighted BCE with embeddings

In [47]:
mlp_fn = lambda hidden_layer_sizes, lr, alpha: (
    SimpleMLP(X_train.shape[1], hidden_layer_sizes),
    lr,
    alpha
)

best_params, best_metrics = manual_grid_search(
    model_fn=mlp_fn,
    param_grid_list=mlp_grid,
    train_fn=train_pytorch_mlp_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    score_key='F1'
)


Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3258\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3359\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3211\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3293\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3225\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3251\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3441\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3305\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3152\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3350\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3251\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3295\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3301\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3202\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3193\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3309\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3224\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3206\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3300\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3281\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3163\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3278\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3303\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3240\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3322\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3373\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.01}
F1: 0.3322\n
Best Parameters: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.01}
Best F1: 0.3441


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


Weighted BCE without embeddings

In [50]:
mlp_fn = lambda hidden_layer_sizes, lr, alpha: (
    SimpleMLP(X_train_structured.shape[1], hidden_layer_sizes),
    lr,
    alpha
)


best_params, best_metrics = manual_grid_search(
    model_fn=mlp_fn,
    param_grid_list=mlp_grid,
    train_fn=train_pytorch_mlp_grid,
    X_train=X_train_structured,
    y_train=y_train,
    X_test=X_test_structured,
    y_test=y_test,
    score_key='F1'
)

Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3202\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3104\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2847\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3256\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3059\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2889\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3284\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3174\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2805\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3089\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3105\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2736\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3338\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3002\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2573\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3146\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2974\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2759\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2979\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3034\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2697\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3036\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2974\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2826\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.3136\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2818\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.01}
F1: 0.2669\n
Best Parameters: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.001}
Best F1: 0.3338


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


In [51]:
def train_pytorch_mlp_grid(model_tuple, X_train, y_train, X_test, y_test):
    model, lr, alpha = model_tuple

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

    train_ds = TensorDataset(X_train_tensor, y_train_tensor)
    train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=alpha)
    criterion = nn.BCEWithLogitsLoss()

    start = time.time()
    for epoch in range(30):
        model.train()
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()

    runtime = time.time() - start
    preds = (prob >= 0.5).astype(int)
    return model, metric_dict(y_test, preds, prob, runtime)


Standard BCE with embeddings

In [52]:
mlp_fn = lambda hidden_layer_sizes, lr, alpha: (
    SimpleMLP(X_train.shape[1], hidden_layer_sizes),
    lr,
    alpha
)

best_params, best_metrics = manual_grid_search(
    model_fn=mlp_fn,
    param_grid_list=mlp_grid,
    train_fn=train_pytorch_mlp_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    score_key='F1'
)


Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1175\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1088\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0983\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0348\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1167\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2208\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0960\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0862\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1005\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1213\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2320\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2182\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1277\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2527\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1674\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1254\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2155\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1536\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1852\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1024\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1424\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1300\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1667\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1453\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0886\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1300\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.01}
F1: 0.1624\n
Best Parameters: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.001}
Best F1: 0.2527


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


Standard BCE w/o embeddings

In [53]:
mlp_fn = lambda hidden_layer_sizes, lr, alpha: (
    SimpleMLP(X_train_structured.shape[1], hidden_layer_sizes),
    lr,
    alpha
)


best_params, best_metrics = manual_grid_search(
    model_fn=mlp_fn,
    param_grid_list=mlp_grid,
    train_fn=train_pytorch_mlp_grid,
    X_train=X_train_structured,
    y_train=y_train,
    X_test=X_test_structured,
    y_test=y_test,
    score_key='F1'
)

Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0833\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0960\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1377\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0797\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1265\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1728\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1016\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0909\n
Testing: {'hidden_layer_sizes': (64, 32), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1632\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0575\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1673\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1865\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0912\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1131\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2081\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.0956\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1022\n
Testing: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.2077\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1223\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1721\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.0001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1803\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1315\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1528\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.001}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1659\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.0005, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1673\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.001, 'alpha': 0.01}


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()


F1: 0.1917\n
Testing: {'hidden_layer_sizes': (128, 64, 32), 'lr': 0.005, 'alpha': 0.01}
F1: 0.1922\n
Best Parameters: {'hidden_layer_sizes': (128, 64), 'lr': 0.005, 'alpha': 0.001}
Best F1: 0.2081


  prob = torch.sigmoid(torch.tensor(model(torch.tensor(X_test, dtype=torch.float32).to(device)))).cpu().numpy().ravel()
