In [50]:
from pathlib import Path
import numpy as np
import torch
from typing import List
from torch.nn.utils.rnn import pad_sequence
from mltrainer import rnn_models, Trainer
from torch import optim

from mads_datasets import datatools
from mlflow.models.signature import infer_signature
import mlflow.pytorch
import mlflow
import mltrainer
mltrainer.__version__

'0.2.5'

# 1 Iterators
We will be using an interesting dataset. [link](https://tev.fbk.eu/resources/smartwatch)

From the site:
> The SmartWatch Gestures Dataset has been collected to evaluate several gesture recognition algorithms for interacting with mobile applications using arm gestures. Eight different users performed twenty repetitions of twenty different gestures, for a total of 3200 sequences. Each sequence contains acceleration data from the 3-axis accelerometer of a first generation Sony SmartWatch™, as well as timestamps from the different clock sources available on an Android device. The smartwatch was worn on the user's right wrist. 


In [51]:
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import PaddedPreprocessor
preprocessor = PaddedPreprocessor()

gesturesdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.GESTURES)
streamers = gesturesdatasetfactory.create_datastreamer(batchsize=32, preprocessor=preprocessor)
train = streamers["train"]
valid = streamers["valid"]

[32m2025-09-25 22:16:23.142[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/christelvanharen/.cache/mads_datasets/gestures[0m
100%|[38;2;30;71;6m██████████[0m| 2600/2600 [00:10<00:00, 259.82it/s]
100%|[38;2;30;71;6m██████████[0m| 651/651 [00:02<00:00, 260.51it/s]


In [52]:
len(train), len(valid)

(81, 20)

In [53]:
trainstreamer = train.stream()
validstreamer = valid.stream()
x, y = next(iter(trainstreamer))
x.shape, y

(torch.Size([32, 30, 3]),
 tensor([10, 16,  8, 11,  5,  3,  9,  6, 15, 11, 12, 18, 15, 13,  8, 18,  1,  8,
         11, 14,  0,  6,  7, 10, 18, 14,  7,  3,  7,  6, 12, 18]))

Can you make sense of the shape?
What does it mean that the shapes are sometimes (32, 27, 3), but a second time might look like (32, 30, 3)? In other words, the second (or first, if you insist on starting at 0) dimension changes. Why is that? How does the model handle this? Do you think this is already padded, or still has to be padded?


# 2 Excercises
Lets test a basemodel, and try to improve upon that.

Fill the gestures.gin file with relevant settings for `input_size`, `hidden_size`, `num_layers` and `horizon` (which, in our case, will be the number of classes...)

As a rule of thumbs: start lower than you expect to need!

In [54]:
from mltrainer import TrainerSettings, ReportTypes
from mltrainer.metrics import Accuracy

accuracy = Accuracy()


In [55]:
model = rnn_models.BaseRNN(
    input_size=3,
    hidden_size=64,
    num_layers=1,
    horizon=20,
)

Test the model. What is the output shape you need? Remember, we are doing classification!

In [56]:
yhat = model(x)
yhat.shape

torch.Size([32, 20])

Test the accuracy

In [57]:
accuracy(y, yhat)

0.03125

What do you think of the accuracy? What would you expect from blind guessing?

Check shape of `y` and `yhat`

In [58]:
yhat.shape, y.shape

(torch.Size([32, 20]), torch.Size([32]))

And look at the output of yhat

In [59]:
yhat[0]

tensor([ 0.0710,  0.1273,  0.0120, -0.1085,  0.0478,  0.0258,  0.0286,  0.1333,
         0.1436,  0.1596,  0.1096,  0.1469, -0.0045, -0.0848,  0.0081,  0.1249,
        -0.1494, -0.0020, -0.1531,  0.0009], grad_fn=<SelectBackward0>)

Does this make sense to you? If you are unclear, go back to the classification problem with the MNIST, where we had 10 classes.

We have a classification problem, so we need Cross Entropy Loss.
Remember, [this has a softmax built in](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) 

In [60]:
loss_fn = torch.nn.CrossEntropyLoss()
loss = loss_fn(yhat, y)
loss

tensor(2.9946, grad_fn=<NllLossBackward0>)

In [61]:
import torch
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print("Using MPS")
elif torch.cuda.is_available():
    device = "cuda:0"
    print("using cuda")
else:
    device = "cpu"
    print("using cpu")

# on my mac, at least for the BaseRNN model, mps does not speed up training
# probably because the overhead of copying the data to the GPU is too high
# so i override the device to cpu
device = "cpu"
# however, it might speed up training for larger models, with more parameters

Using MPS


Set up the settings for the trainer and the different types of logging you want

In [62]:
settings = TrainerSettings(
    epochs=80,  # allow longer training; early stopping will end sooner
    metrics=[accuracy],
    logdir=Path("gestures"),
    train_steps=len(train),
    valid_steps=len(valid),
    reporttypes=[ReportTypes.TOML, ReportTypes.TENSORBOARD, ReportTypes.MLFLOW],
    scheduler_kwargs={"factor": 0.5, "patience": 4},
    earlystop_kwargs={
        "save": False,
        "verbose": True,
        "patience": 8,
        "delta": 1e-4,
    },
)
settings


epochs: 80
metrics: [Accuracy]
logdir: gestures
train_steps: 81
valid_steps: 20
reporttypes: [<ReportTypes.TOML: 'TOML'>, <ReportTypes.TENSORBOARD: 'TENSORBOARD'>, <ReportTypes.MLFLOW: 'MLFLOW'>]
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 4}
earlystop_kwargs: {'save': False, 'verbose': True, 'patience': 8, 'delta': 0.0001}

In [63]:
import torch.nn as nn
import torch
from torch import Tensor
from dataclasses import dataclass, asdict
from typing import Tuple, Sequence, Dict, Any

@dataclass
class RNNConfig:
    input_size: int
    hidden_size: int
    num_layers: int
    output_size: int
    dropout: float = 0.0
    bidirectional: bool = False
    pooling: str = "mean"

    @property
    def hidden_factor(self) -> int:
        return 2 if self.bidirectional else 1

    def to_dict(self) -> Dict[str, Any]:
        data = asdict(self)
        data["pooling"] = self.pooling
        return data

@dataclass
class ConvRNNConfig(RNNConfig):
    conv_channels: Tuple[int, ...] = (32, 64)
    conv_kernel: int = 3
    conv_dropout: float = 0.1

def infer_lengths(batch: Tensor) -> Tensor:
    mask = batch.abs().sum(dim=-1) > 0
    lengths = mask.sum(dim=-1)
    lengths[lengths == 0] = batch.size(1)
    return lengths

def pool_sequence(outputs: Tensor, lengths: Tensor, pooling: str) -> Tensor:
    if pooling not in {"last", "mean"}:
        raise ValueError(f"Unknown pooling mode: {pooling}")
    batch_indices = torch.arange(outputs.size(0), device=outputs.device)
    if pooling == "last":
        last_indices = torch.clamp(lengths - 1, min=0)
        return outputs[batch_indices, last_indices]
    mask = (torch.arange(outputs.size(1), device=outputs.device).unsqueeze(0)
            < lengths.unsqueeze(1))
    masked = outputs * mask.unsqueeze(-1)
    summed = masked.sum(dim=1)
    return summed / lengths.unsqueeze(-1)

class GRUClassifier(nn.Module):
    def __init__(self, config: RNNConfig) -> None:
        super().__init__()
        self.config = config
        self.rnn = nn.GRU(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            num_layers=config.num_layers,
            dropout=config.dropout if config.num_layers > 1 else 0.0,
            bidirectional=config.bidirectional,
            batch_first=True,
        )
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.hidden_size * config.hidden_factor, config.output_size)

    def forward(self, x: Tensor) -> Tensor:
        lengths = infer_lengths(x)
        outputs, _ = self.rnn(x)
        pooled = pool_sequence(outputs, lengths, self.config.pooling)
        pooled = self.dropout(pooled)
        return self.fc(pooled)

class LSTMClassifier(nn.Module):
    def __init__(self, config: RNNConfig) -> None:
        super().__init__()
        self.config = config
        self.rnn = nn.LSTM(
            input_size=config.input_size,
            hidden_size=config.hidden_size,
            num_layers=config.num_layers,
            dropout=config.dropout if config.num_layers > 1 else 0.0,
            bidirectional=config.bidirectional,
            batch_first=True,
        )
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.hidden_size * config.hidden_factor, config.output_size)

    def forward(self, x: Tensor) -> Tensor:
        lengths = infer_lengths(x)
        outputs, _ = self.rnn(x)
        pooled = pool_sequence(outputs, lengths, self.config.pooling)
        pooled = self.dropout(pooled)
        return self.fc(pooled)

class ConvGRUClassifier(nn.Module):
    def __init__(self, config: ConvRNNConfig) -> None:
        super().__init__()
        self.config = config
        conv_layers: Sequence[nn.Module] = []
        in_channels = config.input_size
        for out_channels in config.conv_channels:
            conv_layers.extend(
                [
                    nn.Conv1d(
                        in_channels,
                        out_channels,
                        kernel_size=config.conv_kernel,
                        padding=config.conv_kernel // 2,
                    ),
                    nn.BatchNorm1d(out_channels),
                    nn.ReLU(inplace=True),
                    nn.Dropout(config.conv_dropout),
                ]
            )
            in_channels = out_channels
        self.feature_extractor = nn.Sequential(*conv_layers)
        self.rnn = nn.GRU(
            input_size=in_channels,
            hidden_size=config.hidden_size,
            num_layers=config.num_layers,
            dropout=config.dropout if config.num_layers > 1 else 0.0,
            bidirectional=config.bidirectional,
            batch_first=True,
        )
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.hidden_size * config.hidden_factor, config.output_size)

    def forward(self, x: Tensor) -> Tensor:
        lengths = infer_lengths(x)
        x = x.transpose(1, 2)
        x = self.feature_extractor(x)
        x = x.transpose(1, 2)
        outputs, _ = self.rnn(x)
        pooled = pool_sequence(outputs, lengths, self.config.pooling)
        pooled = self.dropout(pooled)
        return self.fc(pooled)


### Hyperparameter templates
The experiments below instantiate `RNNConfig` and `ConvRNNConfig` objects to explore GRU, LSTM, and Conv+GRU variants.

In [64]:
from datetime import datetime
from dataclasses import asdict
from typing import Callable, List, Tuple

# Use a writable location for the MLflow database
mlflow_db_path = Path.cwd() / "mlflow.db"
mlflow.set_tracking_uri(f"sqlite:///{mlflow_db_path}")

registry_root = (Path("mlruns").resolve() / "model-registry")
registry_root.mkdir(parents=True, exist_ok=True)
mlflow.set_registry_uri(f"file:{registry_root}")
mlflow.set_experiment("gestures")

modeldir = Path("gestures").resolve()
modeldir.mkdir(parents=True, exist_ok=True)

def iterate_dataset(ds):
    stream = ds.stream()
    for _ in range(len(ds)):
        yield next(stream)

def evaluate_accuracy(model: nn.Module, dataset) -> float:
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for xb, yb in iterate_dataset(dataset):
            xb = xb.to(device)
            yb = yb.to(device)
            logits = model(xb)
            preds = logits.argmax(dim=-1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return correct / total

experiment_results: List[dict] = []

def run_experiment(run_name: str, model_builder: Callable[[], nn.Module], config_dict: dict) -> float:
    torch.manual_seed(42)
    model = model_builder().to(device)
    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag("model", run_name)
        mlflow.set_tag("dev", "cvh")
        mlflow.log_params({**config_dict, "epochs": settings.epochs})
        trainer = Trainer(
            model=model,
            settings=settings,
            loss_fn=loss_fn,
            optimizer=optim.Adam,
            traindataloader=train.stream(),
            validdataloader=valid.stream(),
            scheduler=optim.lr_scheduler.ReduceLROnPlateau,
            device=device,
        )
        trainer.loop()
        val_accuracy = evaluate_accuracy(model, valid)
        mlflow.log_metric("final_val_accuracy", val_accuracy)
        example_batch, _ = next(valid.stream())
        example_tensor = example_batch[:1].to(device)
        with torch.no_grad():
            prediction = model(example_tensor)
        example_numpy = example_batch[:1].cpu().numpy()
        signature = infer_signature(example_numpy, prediction.detach().cpu().numpy())
        model_alias = f"{run_name}-model"
        mlflow.pytorch.log_model(
            model,
            name=model_alias,
            input_example=example_numpy,
            signature=signature,
        )
        if not settings.earlystop_kwargs["save"]:
            tag = datetime.now().strftime("%Y%m%d-%H%M-")
            model_path = modeldir / f"{tag}{run_name}.pt"
            torch.save(model.state_dict(), model_path)
    experiment_results.append(
        {"run": run_name, "model": config_dict.get("model_class"), "val_accuracy": val_accuracy}
    )
    print(f"{run_name}: validation accuracy {val_accuracy:.4f}")
    return val_accuracy

gru_config = RNNConfig(
    input_size=3,
    hidden_size=128,
    num_layers=2,
    output_size=20,
    dropout=0.3,
    bidirectional=True,
    pooling="mean",
)
lstm_config = RNNConfig(
    input_size=3,
    hidden_size=160,
    num_layers=2,
    output_size=20,
    dropout=0.35,
    bidirectional=True,
    pooling="mean",
)
conv_gru_config = ConvRNNConfig(
    input_size=3,
    conv_channels=(32, 64),
    conv_kernel=5,
    conv_dropout=0.2,
    hidden_size=128,
    num_layers=2,
    output_size=20,
    dropout=0.3,
    bidirectional=True,
    pooling="mean",
)

experiments: List[Tuple[str, Callable[[], nn.Module], dict]] = [
    (
        "bigru_mean_pool",
        lambda: GRUClassifier(gru_config),
        {**gru_config.to_dict(), "model_class": "GRUClassifier"},
    ),
    (
        "bilstm_dropout",
        lambda: LSTMClassifier(lstm_config),
        {**lstm_config.to_dict(), "model_class": "LSTMClassifier"},
    ),
    (
        "conv_bigru",
        lambda: ConvGRUClassifier(conv_gru_config),
        {**conv_gru_config.to_dict(), "model_class": "ConvGRUClassifier"},
    ),
]

run_summaries = []
for run_name, builder, cfg in experiments:
    val_acc = run_experiment(run_name, builder, cfg)
    run_summaries.append((run_name, val_acc))

sorted(run_summaries, key=lambda x: x[1], reverse=True)


[32m2025-09-25 22:16:36.064[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to gestures/20250925-221636[0m
[32m2025-09-25 22:16:36.065[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 22.37it/s]
[32m2025-09-25 22:16:40.003[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.2277 test 1.4582 metric ['0.5141'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 22.97it/s]
[32m2025-09-25 22:16:43.861[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.8875 test 0.3702 metric ['0.9469'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 23.04it/s]
[32m2025-09-25 22:16:47.693[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mrepor

bigru_mean_pool: validation accuracy 0.9938


100%|[38;2;30;71;6m██████████[0m| 81/81 [00:04<00:00, 17.36it/s]
[32m2025-09-25 22:18:48.633[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.1446 test 1.5029 metric ['0.4094'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:04<00:00, 17.88it/s]
[32m2025-09-25 22:18:53.685[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 1.1271 test 0.8334 metric ['0.7266'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:04<00:00, 17.76it/s]
[32m2025-09-25 22:18:58.737[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.5673 test 0.2940 metric ['0.9516'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:04<00:00, 17.74it/s]
[32m2025-09-25 22:19:03.823[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.2185 test 0.1417 metric ['0.9719'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:04

bilstm_dropout: validation accuracy 0.9938


100%|[38;2;30;71;6m██████████[0m| 81/81 [17:02<00:00, 12.62s/it]
[32m2025-09-26 00:03:45.925[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 2.0988 test 1.1628 metric ['0.5953'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [16:30<00:00, 12.23s/it]/it]
[32m2025-09-26 00:20:16.550[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.7281 test 0.2803 metric ['0.9203'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [10:21<00:00,  7.67s/it]/it]
[32m2025-09-26 00:30:38.082[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 2 train 0.3049 test 0.1582 metric ['0.9641'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:08<00:00,  9.51it/s]it] 
[32m2025-09-26 00:30:47.387[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 3 train 0.1802 test 0.1024 metric ['0.9703'][0m
100%|[38;2;30;71;6m██████████[0m| 

conv_bigru: validation accuracy 0.9938


[('bigru_mean_pool', 0.99375),
 ('bilstm_dropout', 0.99375),
 ('conv_bigru', 0.99375)]

## 3 Results overview
The cell above trains three variants (BiGRU, BiLSTM, Conv+BiGRU) and logs them to MLflow.
You can re-run it after tweaking the configs to compare new experiments.


In [65]:
import pandas as pd

if not experiment_results:
    raise RuntimeError("No experiments recorded yet. Run the training cell first.")

results_df = pd.DataFrame(experiment_results).sort_values("val_accuracy", ascending=False)
display(results_df)
best_run = results_df.iloc[0]
print(
    f"Best validation accuracy: {best_run.val_accuracy:.4f} (run={best_run.run}, model={best_run.model})"
)


Unnamed: 0,run,model,val_accuracy
0,bigru_mean_pool,GRUClassifier,0.99375
1,bilstm_dropout,LSTMClassifier,0.99375
2,conv_bigru,ConvGRUClassifier,0.99375


Best validation accuracy: 0.9938 (run=bigru_mean_pool, model=GRUClassifier)


You can launch `mlflow ui --backend-store-uri sqlite:///mlflow.db` to inspect the training curves and compare runs interactively.