<a target="_blank" href="https://colab.research.google.com/github/yandex-research/rtdl-revisiting-models/blob/main/package/example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

---

**See also** [RTDL](https://github.com/yandex-research/rtdl)
-- **other projects on tabular deep learning**.

---

- This notebook provides a usage example of the
  [rtdl_revisiting_models](https://github.com/yandex-research/rtdl-revisiting-models)
  package.
- Hyperparameters are not tuned and may be suboptimal.

In [14]:
%pip install delu==0.0.23
%pip install rtdl_revisiting_models



In [15]:
# ruff: noqa: E402
import math
import warnings
from typing import Dict, Literal

warnings.simplefilter("ignore")
import delu  # Deep Learning Utilities: https://github.com/Yura52/delu
import numpy as np
import scipy.special
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import torch
import torch.nn.functional as F
import torch.optim
from torch import Tensor
from tqdm.std import tqdm

warnings.resetwarnings()

from rtdl_revisiting_models import MLP, ResNet, FTTransformer

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Set random seeds in all libraries.
delu.random.seed(0)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Dataset

In [4]:

dataset_id = 1590   #45068
dataset = openml.datasets.get_dataset(dataset_id)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
full_data = X.copy()
full_data[dataset.default_target_attribute] = y

csv_file = 'heloc.csv'
full_data.to_csv(csv_file, index=False)

print(f"Dataset saved as {csv_file}")

NameError: name 'openml' is not defined

In [5]:
from sklearn.datasets import fetch_openml

# Fetch dataset from OpenML
TaskType = Literal["regression", "binclass", "multiclass"]

task_type: TaskType = "binclass"
n_classes = None
#dataset = sklearn.datasets.fetch_california_housing()
dataset = fetch_openml(data_id=1590, as_frame=False)
X_cont: np.ndarray = dataset["data"]
Y: np.ndarray = dataset["target"]

# Check the dtype of the target
print(f"Original dtype of Y: {Y.dtype}")

# Map class '1' to '0' and class '2' to '1' while preserving object dtype
mapping = {'<=50K': '0', '2': '1'}
Y = np.array([mapping[str(y)] if str(y) in mapping else y for y in Y], dtype=object)





Original dtype of Y: object


In [7]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OrdinalEncoder

# Fetch dataset from OpenML
TaskType = Literal["regression", "binclass", "multiclass"]

task_type: TaskType = "binclass"
n_classes = 2
#dataset = sklearn.datasets.fetch_california_housing()
dataset = fetch_openml(data_id=1590, as_frame=False)
X: np.ndarray = dataset["data"]
Y: np.ndarray = dataset["target"]

# Check the dtype of the target
print(f"Original dtype of Y: {Y.dtype}")

# Map class '1' to '0' and class '2' to '1' while preserving object dtype
mapping = {'<=50K': '0', '>50K': '1'}
Y = np.array([mapping[str(y)] if str(y) in mapping else y for y in Y], dtype=object)
# NOTE: uncomment to solve a classification task.
# n_classes = 2
#assert n_classes >= 2
# task_type: TaskType = 'binclass' if n_classes == 2 else 'multiclass'
# X_cont, Y = sklearn.datasets.make_classification(
#     n_samples=20000,
#     n_features=8,
#     n_classes=n_classes,
#     n_informative=3,
#     n_redundant=2,
# )

numerical_indices = [0, 2, 4, 10, 11,12]  # Replace with actual indices of numerical features
categorical_indices = [1, 3, 5, 6,7,8,9,13]
X_cont: np.ndarray = X[:, numerical_indices]
X_cat: np.ndarray = X[:, categorical_indices] if categorical_indices else None
X_cont: np.ndarray = X_cont.astype(np.float32)
n_cont_features = X_cont.shape[1]

if X_cat is not None:
    print(f"X_cat dtype: {X_cat.dtype}")
    print("Example categorical data:", X_cat[:5])
else:
    print("No categorical features.")

if X_cat is not None:
    # Handle NaN values (replace NaN with a placeholder, e.g., "missing")
    X_cat = np.where(X_cat == np.array(None), "missing", X_cat)
    X_cat = np.where(X_cat == np.nan, "missing", X_cat)

    # Use OrdinalEncoder to convert categories to integers
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_cat = encoder.fit_transform(X_cat)

    # Convert to integers for compatibility with embedding layers
    X_cat = X_cat.astype(np.int64)  # <- Change here

cat_cardinalities = [len(np.unique(X_cat[:, col])) for col in range(X_cat.shape[1])]
print(f"Cardinalities of categorical features: {cat_cardinalities}")


cat_cardinalities = [len(np.unique(X_cat[:, col])) for col in range(X_cat.shape[1])]
print(f"Cardinalities of categorical features: {cat_cardinalities}")
# >>> Categorical features.
# NOTE: the above datasets do not have categorical features, but,
# for the demonstration purposes, it is possible to generate them.
#cat_cardinalities = [
    # NOTE: uncomment the two lines below to add two categorical features.
    # 4,  # Allowed values: [0, 1, 2, 3].
    # 7,  # Allowed values: [0, 1, 2, 3, 4, 5, 6].
#]
# X_cat = (
#     np.column_stack(
#         [np.random.randint(0, c, (len(X_cont),)) for c in cat_cardinalities]
#     )
#     if cat_cardinalities
#     else None
# )

# >>> Labels.
# Regression labels must be represented by float32.
if task_type == "regression":
    Y = Y.astype(np.float32)
else:
    assert n_classes is not None
    Y = Y.astype(np.int64)
    assert set(Y.tolist()) == set(
        range(n_classes)
    ), "Classification labels must form the range [0, 1, ..., n_classes - 1]"

# >>> Split the dataset.
all_idx = np.arange(len(Y))
trainval_idx, test_idx = sklearn.model_selection.train_test_split(
    all_idx, train_size=0.8
)
train_idx, val_idx = sklearn.model_selection.train_test_split(
    trainval_idx, train_size=0.8125
)
data_numpy = {
    "train": {"x_cont": X_cont[train_idx], "y": Y[train_idx]},
    "val": {"x_cont": X_cont[val_idx], "y": Y[val_idx]},
    "test": {"x_cont": X_cont[test_idx], "y": Y[test_idx]},
}
if X_cat is not None:
    data_numpy["train"]["x_cat"] = X_cat[train_idx]
    data_numpy["val"]["x_cat"] = X_cat[val_idx]
    data_numpy["test"]["x_cat"] = X_cat[test_idx]

Original dtype of Y: object
X_cat dtype: object
Example categorical data: [['Private' '11th' 'Never-married' 'Machine-op-inspct' 'Own-child'
  'Black' 'Male' 'United-States']
 ['Private' 'HS-grad' 'Married-civ-spouse' 'Farming-fishing' 'Husband'
  'White' 'Male' 'United-States']
 ['Local-gov' 'Assoc-acdm' 'Married-civ-spouse' 'Protective-serv'
  'Husband' 'White' 'Male' 'United-States']
 ['Private' 'Some-college' 'Married-civ-spouse' 'Machine-op-inspct'
  'Husband' 'Black' 'Male' 'United-States']
 [nan 'Some-college' 'Never-married' nan 'Own-child' 'White' 'Female'
  'United-States']]
Cardinalities of categorical features: [9, 16, 7, 15, 6, 5, 2, 42]
Cardinalities of categorical features: [9, 16, 7, 15, 6, 5, 2, 42]


  X_cat = X_cat.astype(np.int64)  # <- Change here


In [8]:
# Check the shape of features (X_cont) and target (Y)
print(dataset["data"].shape)  # Shape of the features (e.g., (20640, 8))
print(dataset["target"].shape)  # Shape of the target (e.g., (20640,))


(48842, 14)
(48842,)


In [9]:
# Check the shape of features (X_cont) and target (Y)
print(dataset["data"].shape)  # Shape of the features (e.g., (20640, 8))
print(dataset["target"].shape)  # Shape of the target (e.g., (20640,))

(48842, 14)
(48842,)


## Preprocessing

In [10]:
# >>> Feature preprocessing.
# NOTE
# The choice between preprocessing strategies depends on a task and a model.

# (A) Simple preprocessing strategy.
# preprocessing = sklearn.preprocessing.StandardScaler().fit(
#     data_numpy['train']['x_cont']
# )

# (B) Fancy preprocessing strategy.
# The noise is added to improve the output of QuantileTransformer in some cases.
X_cont_train_numpy = data_numpy["train"]["x_cont"]
noise = (
    np.random.default_rng(0)
    .normal(0.0, 1e-5, X_cont_train_numpy.shape)
    .astype(X_cont_train_numpy.dtype)
)
preprocessing = sklearn.preprocessing.QuantileTransformer(
    n_quantiles=max(min(len(train_idx) // 30, 1000), 10),
    output_distribution="normal",
    subsample=10**9,
).fit(X_cont_train_numpy + noise)
del X_cont_train_numpy

for part in data_numpy:
    data_numpy[part]["x_cont"] = preprocessing.transform(data_numpy[part]["x_cont"])

# >>> Label preprocessing.
if task_type == "regression":
    Y_mean = data_numpy["train"]["y"].mean().item()
    Y_std = data_numpy["train"]["y"].std().item()
    for part in data_numpy:
        data_numpy[part]["y"] = (data_numpy[part]["y"] - Y_mean) / Y_std

# >>> Convert data to tensors.
data = {
    part: {k: torch.as_tensor(v, device=device) for k, v in data_numpy[part].items()}
    for part in data_numpy
}

if task_type != "multiclass":
    # Required by F.binary_cross_entropy_with_logits
    for part in data:
        data[part]["y"] = data[part]["y"].float()

## Model

In [13]:
# The output size.
d_out = n_classes if task_type == "multiclass" else 1

# # NOTE: uncomment to train MLP
# model = MLP(
#     d_in=n_cont_features + sum(cat_cardinalities),
#     d_out=d_out,
#     n_blocks=2,
#     d_block=384,
#     dropout=0.1,
# ).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-5)

# # NOTE: uncomment to train ResNet
# model = ResNet(
#     d_in=n_cont_features + sum(cat_cardinalities),
#     d_out=d_out,
#     n_blocks=2,
#     d_block=192,
#     d_hidden=None,
#     d_hidden_multiplier=2.0,
#     dropout1=0.3,
#     dropout2=0.0,
# ).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-5)

model = FTTransformer(
    n_cont_features=n_cont_features,
    cat_cardinalities=cat_cardinalities,
    d_out=d_out,
    **FTTransformer.get_default_kwargs(),
).to(device)
optimizer = model.make_default_optimizer()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Training

In [12]:
def apply_model(batch: Dict[str, Tensor]) -> Tensor:
    if isinstance(model, (MLP, ResNet)):
        x_cat_ohe = (
            [
                F.one_hot(column, cardinality)
                for column, cardinality in zip(batch["x_cat"].T, cat_cardinalities)
            ]
            if "x_cat" in batch
            else []
        )
        return model(torch.column_stack([batch["x_cont"]] + x_cat_ohe)).squeeze(-1)

    elif isinstance(model, FTTransformer):
        return model(batch["x_cont"], batch.get("x_cat")).squeeze(-1)

    else:
        raise RuntimeError(f"Unknown model type: {type(model)}")


loss_fn = (
    F.binary_cross_entropy_with_logits
    if task_type == "binclass"
    else F.cross_entropy
    if task_type == "multiclass"
    else F.mse_loss
)


@torch.no_grad()
def evaluate(part: str) -> float:
    model.eval()

    eval_batch_size = 8096
    y_pred = (
        torch.cat(
            [
                apply_model(batch)
                for batch in delu.iter_batches(data[part], eval_batch_size)
            ]
        )
        .cpu()
        .numpy()
    )
    y_true = data[part]["y"].cpu().numpy()

    if task_type == "binclass":
        y_pred = np.round(scipy.special.expit(y_pred))
        score = sklearn.metrics.accuracy_score(y_true, y_pred)
    elif task_type == "multiclass":
        y_pred = y_pred.argmax(1)
        score = sklearn.metrics.accuracy_score(y_true, y_pred)
    else:
        assert task_type == "regression"
        score = -(sklearn.metrics.mean_squared_error(y_true, y_pred) ** 0.5 * Y_std)
    return score  # The higher -- the better.


print(f'Test score before training: {evaluate("test"):.4f}')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [22]:
n_epochs = 1_000_000_000
patience = 30

batch_size = 256
epoch_size = math.ceil(len(train_idx) / batch_size)
timer = delu.tools.Timer()
early_stopping = delu.tools.EarlyStopping(patience, mode="max")

best = {
    "val": -math.inf,
    "test": None,  # Store test score only for the best validation score
    "epoch": -1,
}

test_runs = 5  # Number of test evaluations to average

print(f"Device: {device.type.upper()}")
print("-" * 88 + "\n")
timer.run()

for epoch in range(n_epochs):
    for batch in tqdm(
        delu.iter_batches(data["train"], batch_size, shuffle=True),
        desc=f"Epoch {epoch}",
        total=epoch_size,
    ):
        model.train()
        optimizer.zero_grad()
        loss = loss_fn(apply_model(batch), batch["y"])
        loss.backward()
        optimizer.step()

    val_score = evaluate("val")

    # Update early stopping
    early_stopping.update(val_score)

    # Check if early stopping indicates to stop
    if early_stopping.should_stop():
        print("\nEarly stopping triggered. Evaluating test score for the best validation score...\n")

        # Evaluate test score multiple times for the best validation score
        test_scores = [evaluate("test") for _ in range(test_runs)]
        print(f" the test score for epoch {epoch} is {test_scores}")
        average_test_score = sum(test_scores) / test_runs
        best["test"] = average_test_score

        print(f"Best validation score: {best['val']:.4f}")
        print(f"Averaged Test score over {test_runs} runs: {best['test']:.4f}")
        break

    # Check if current epoch has the best validation score
    if val_score > best["val"]:
        print("🌸 New best epoch! 🌸")
        best = {"val": val_score, "test": None, "epoch": epoch}

    print(f"(val) {val_score:.4f} [time] {timer}")
    print()

print("\n\nResult:")
print(f"Best Epoch: {best['epoch']}")
print(f"Validation Score: {best['val']:.4f}")
if best["test"] is not None:
    print(f"Averaged Test Score: {best['test']:.4f}")
else:
    print("Test score was not evaluated.")


Device: CUDA
----------------------------------------------------------------------------------------



Epoch 0: 100%|██████████| 26/26 [00:00<00:00, 309.31it/s]


🌸 New best epoch! 🌸
(val) 0.6467 [time] 0:00:00.092546



Epoch 1: 100%|██████████| 26/26 [00:00<00:00, 318.38it/s]


(val) 0.6260 [time] 0:00:00.182463



Epoch 2: 100%|██████████| 26/26 [00:00<00:00, 303.80it/s]


(val) 0.6300 [time] 0:00:00.275029



Epoch 3: 100%|██████████| 26/26 [00:00<00:00, 300.49it/s]


(val) 0.6387 [time] 0:00:00.370079



Epoch 4: 100%|██████████| 26/26 [00:00<00:00, 283.83it/s]


(val) 0.6260 [time] 0:00:00.469796



Epoch 5: 100%|██████████| 26/26 [00:00<00:00, 310.15it/s]


(val) 0.6347 [time] 0:00:00.561598



Epoch 6: 100%|██████████| 26/26 [00:00<00:00, 306.25it/s]


(val) 0.6333 [time] 0:00:00.653941



Epoch 7: 100%|██████████| 26/26 [00:00<00:00, 303.18it/s]


(val) 0.6340 [time] 0:00:00.746652



Epoch 8: 100%|██████████| 26/26 [00:00<00:00, 312.94it/s]


(val) 0.6293 [time] 0:00:00.837251



Epoch 9: 100%|██████████| 26/26 [00:00<00:00, 277.26it/s]


(val) 0.6313 [time] 0:00:00.938682



Epoch 10: 100%|██████████| 26/26 [00:00<00:00, 309.13it/s]


(val) 0.6400 [time] 0:00:01.029833



Epoch 11: 100%|██████████| 26/26 [00:00<00:00, 305.39it/s]


(val) 0.6247 [time] 0:00:01.123154



Epoch 12: 100%|██████████| 26/26 [00:00<00:00, 293.86it/s]


(val) 0.6253 [time] 0:00:01.219594



Epoch 13: 100%|██████████| 26/26 [00:00<00:00, 309.35it/s]


(val) 0.6273 [time] 0:00:01.311247



Epoch 14: 100%|██████████| 26/26 [00:00<00:00, 293.76it/s]


(val) 0.6300 [time] 0:00:01.407736



Epoch 15: 100%|██████████| 26/26 [00:00<00:00, 294.32it/s]


(val) 0.6180 [time] 0:00:01.504283



Epoch 16: 100%|██████████| 26/26 [00:00<00:00, 315.40it/s]


(val) 0.6293 [time] 0:00:01.594580



Epoch 17: 100%|██████████| 26/26 [00:00<00:00, 308.17it/s]


(val) 0.6300 [time] 0:00:01.687571



Epoch 18: 100%|██████████| 26/26 [00:00<00:00, 311.58it/s]


(val) 0.6273 [time] 0:00:01.779710



Epoch 19: 100%|██████████| 26/26 [00:00<00:00, 271.28it/s]


(val) 0.6227 [time] 0:00:01.884808



Epoch 20: 100%|██████████| 26/26 [00:00<00:00, 278.77it/s]


(val) 0.6200 [time] 0:00:01.986599



Epoch 21: 100%|██████████| 26/26 [00:00<00:00, 283.90it/s]


(val) 0.6273 [time] 0:00:02.086197



Epoch 22: 100%|██████████| 26/26 [00:00<00:00, 325.47it/s]


(val) 0.6247 [time] 0:00:02.174031



Epoch 23: 100%|██████████| 26/26 [00:00<00:00, 304.40it/s]


(val) 0.6380 [time] 0:00:02.266648



Epoch 24: 100%|██████████| 26/26 [00:00<00:00, 293.79it/s]


(val) 0.6233 [time] 0:00:02.361806



Epoch 25: 100%|██████████| 26/26 [00:00<00:00, 321.60it/s]


(val) 0.6133 [time] 0:00:02.449261



Epoch 26: 100%|██████████| 26/26 [00:00<00:00, 316.68it/s]


(val) 0.6113 [time] 0:00:02.538756



Epoch 27: 100%|██████████| 26/26 [00:00<00:00, 312.50it/s]


(val) 0.6247 [time] 0:00:02.628570



Epoch 28: 100%|██████████| 26/26 [00:00<00:00, 316.58it/s]


(val) 0.6293 [time] 0:00:02.717503



Epoch 29: 100%|██████████| 26/26 [00:00<00:00, 329.98it/s]


(val) 0.6233 [time] 0:00:02.803523



Epoch 30: 100%|██████████| 26/26 [00:00<00:00, 318.48it/s]


Early stopping triggered. Evaluating test score for the best validation score...

 the test score for epoch 30 is [0.657, 0.657, 0.657, 0.657, 0.657]
Best validation score: 0.6467
Averaged Test score over 5 runs: 0.6570


Result:
Best Epoch: 0
Validation Score: 0.6467
Averaged Test Score: 0.6570





In [42]:
# For demonstration purposes (fast training and bad performance),
# one can set smaller values:
# n_epochs = 20
# patience = 2
n_epochs = 1_000_000_000
patience = 16

batch_size = 256
epoch_size = math.ceil(len(train_idx) / batch_size)
timer = delu.tools.Timer()
early_stopping = delu.tools.EarlyStopping(patience, mode="max")
best = {
    "val": -math.inf,
    "test": None,
    "epoch": -1,
}

print(f"Device: {device.type.upper()}")
print("-" * 88 + "\n")
timer.run()
for epoch in range(n_epochs):
    for batch in tqdm(
        delu.iter_batches(data["train"], batch_size, shuffle=True),
        desc=f"Epoch {epoch}",
        total=epoch_size,
    ):
        model.train()
        optimizer.zero_grad()
        loss = loss_fn(apply_model(batch), batch["y"])
        loss.backward()
        optimizer.step()

    val_score = evaluate("val")
    #test_score = evaluate("test")
    #print(f"(val) {val_score:.4f} (test) {test_score:.4f} [time] {timer}")

    early_stopping.update(val_score)
    if early_stopping.should_stop():
        break

    if val_score > best["val"]:
        print("🌸 New best epoch! 🌸")
        #best = {"val": val_score, "test": test_score, "epoch": epoch}
        best = {"val": val_score}

    print()

print("\n\nResult:")
print(best)

Device: CUDA
----------------------------------------------------------------------------------------



Epoch 0: 100%|██████████| 125/125 [00:01<00:00, 73.35it/s]


(val) 0.8466 (test) 0.8458 [time] 0:00:02.021044
🌸 New best epoch! 🌸



Epoch 1: 100%|██████████| 125/125 [00:01<00:00, 75.67it/s]


(val) 0.8555 (test) 0.8549 [time] 0:00:04.002714
🌸 New best epoch! 🌸



Epoch 2: 100%|██████████| 125/125 [00:01<00:00, 75.68it/s]


(val) 0.8522 (test) 0.8498 [time] 0:00:05.988895



Epoch 3: 100%|██████████| 125/125 [00:01<00:00, 75.67it/s]


(val) 0.8582 (test) 0.8565 [time] 0:00:07.974658
🌸 New best epoch! 🌸



Epoch 4: 100%|██████████| 125/125 [00:01<00:00, 75.11it/s]


(val) 0.8624 (test) 0.8622 [time] 0:00:09.974212
🌸 New best epoch! 🌸



Epoch 5: 100%|██████████| 125/125 [00:01<00:00, 74.66it/s]


(val) 0.8575 (test) 0.8592 [time] 0:00:11.993332



Epoch 6: 100%|██████████| 125/125 [00:01<00:00, 74.88it/s]


(val) 0.8592 (test) 0.8596 [time] 0:00:14.012918



Epoch 7: 100%|██████████| 125/125 [00:01<00:00, 74.85it/s]


(val) 0.8585 (test) 0.8567 [time] 0:00:16.030653



Epoch 8: 100%|██████████| 125/125 [00:01<00:00, 74.94it/s]


(val) 0.8615 (test) 0.8599 [time] 0:00:18.048731



Epoch 9: 100%|██████████| 125/125 [00:01<00:00, 74.98it/s]


(val) 0.8557 (test) 0.8545 [time] 0:00:20.062073



Epoch 10: 100%|██████████| 125/125 [00:01<00:00, 75.52it/s]


(val) 0.8600 (test) 0.8585 [time] 0:00:22.060763



Epoch 11: 100%|██████████| 125/125 [00:01<00:00, 75.61it/s]


(val) 0.8578 (test) 0.8594 [time] 0:00:24.055356



Epoch 12: 100%|██████████| 125/125 [00:01<00:00, 76.09it/s]


(val) 0.8500 (test) 0.8501 [time] 0:00:26.031260



Epoch 13: 100%|██████████| 125/125 [00:01<00:00, 76.08it/s]


(val) 0.8395 (test) 0.8387 [time] 0:00:27.998452



Epoch 14: 100%|██████████| 125/125 [00:01<00:00, 76.18it/s]


(val) 0.8527 (test) 0.8538 [time] 0:00:29.960950



Epoch 15: 100%|██████████| 125/125 [00:01<00:00, 76.39it/s]


(val) 0.8519 (test) 0.8561 [time] 0:00:31.925696



Epoch 16: 100%|██████████| 125/125 [00:01<00:00, 76.61it/s]


(val) 0.8545 (test) 0.8556 [time] 0:00:33.881496



Epoch 17: 100%|██████████| 125/125 [00:01<00:00, 76.93it/s]


(val) 0.8536 (test) 0.8555 [time] 0:00:35.830254



Epoch 18: 100%|██████████| 125/125 [00:01<00:00, 77.07it/s]


(val) 0.8575 (test) 0.8595 [time] 0:00:37.773218



Epoch 19: 100%|██████████| 125/125 [00:01<00:00, 76.82it/s]


(val) 0.8596 (test) 0.8570 [time] 0:00:39.711747



Epoch 20: 100%|██████████| 125/125 [00:01<00:00, 77.41it/s]


(val) 0.8600 (test) 0.8576 [time] 0:00:41.638404


Result:
{'val': 0.8624266411901187, 'test': 0.8622172177295526, 'epoch': 4}
