In [1]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from model_utils import WideResnetLit
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import MLFlowLogger

torch.set_float32_matmul_precision("medium")

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
IMAGE_SIZE = 32
mean, std = [0.4914, 0.4822, 0.4465], [0.247, 0.243, 0.261]
# source: https://pytorch.org/vision/stable/transforms.html
transforms_train = transforms.Compose(
    [
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)


transforms_test = transforms.Compose(
    [
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)

train_dataset = datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transforms_train
)
validation_dataset = datasets.CIFAR10(
    root="./data", train=False, download=True, transform=transforms_test
)

train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=512, num_workers=31, persistent_workers=True
)
validation_loader = torch.utils.data.DataLoader(
    dataset=validation_dataset, batch_size=512, num_workers=31, persistent_workers=True
)

In [4]:
class DataModule(L.LightningDataModule):
    def __init__(self, train_loader, validation_loader):
        super().__init__()
        self.train_loader = train_loader
        self.validation_loader = validation_loader

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.validation_loader


data = DataModule(train_loader, validation_loader)
hyperparameters = {
    "depth": 4,
    "first_output": 0,
    "growth_rate": 0,
    "dropout": 0,
}
model = WideResnetLit(hyperparameters=hyperparameters)
logger = MLFlowLogger(experiment_name="WideResnet", save_dir="mlruns")
trainer = L.Trainer(
    max_epochs=50,
    logger=logger,
    callbacks=[
        ModelCheckpoint(
            monitor="val_f1_macro",
            mode="max",
            dirpath="checkpoints/wide_resnet",
            filename="{epoch:02d}-{val_f1_macro:.3f}-{val_accuracy:.3f}",
        )
    ],
    precision="16-mixed",
    num_sanity_val_steps=0,
)
trainer.fit(model, datamodule=data)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Experiment with name WideResnet not found. Creating it.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | model         | ResNet           | 66.9 M | train
1 | loss_fn       | CrossEntropyLoss | 0      | train
2 | train_metrics | MetricCollection | 0      | train
3 | val_metrics   | MetricCollection | 0      | train
4 | test_metrics  | MetricCollection | 0      | train
-----------------------------------------------------------
66.9 M    Trainable params
0         Non-trainable params
66.9 M    Total params
267.419   Total estimated model params size (MB)
170       Modules in train mode
0         Modules in eval mode


Epoch 0:   3%|▎         | 3/98 [00:01<00:35,  2.70it/s, v_num=4613]



Epoch 49: 100%|██████████| 98/98 [00:07<00:00, 13.87it/s, v_num=4613]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 98/98 [00:07<00:00, 12.61it/s, v_num=4613]


Val F1 macro is 0.799

Val accuracy is 0.798