In [1]:
import torch
import sys
import os
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import MLFlowLogger

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))


if project_root not in sys.path:
    sys.path.append(project_root)
    
from model_utils import PretrainedVisionTransformer, ClassificationData, vit_transforms, vit_no_augmentation

torch.cuda.empty_cache()
torch.set_float32_matmul_precision("medium")
L.seed_everything(111)

Seed set to 111


111

In [2]:
model = PretrainedVisionTransformer(batch_size=16)
data = ClassificationData(batch_size=16, transform=vit_transforms)
logger = MLFlowLogger(experiment_name="ViT", save_dir="mlruns")
trainer = L.Trainer(max_epochs=5, 
                    logger=logger, 
                    callbacks=[ModelCheckpoint(monitor="val_f1_macro", mode="max", dirpath="checkpoints/vit", filename="augmentation-{epoch:02d}-{val_f1_macro:.2f}"), 
                               EarlyStopping(monitor="train_loss", min_delta=0.01, patience=3, mode="min", verbose=True)], 
                    precision="16-mixed", 
                    num_sanity_val_steps=0)
trainer.fit(model, datamodule=data)
trainer.test(model, datamodule=data)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type              | Params | Mode 
------------------------------------------------------------
0 | model         | VisionTransformer | 86.1 M | train
1 | loss_fn       | CrossEntropyLoss  | 0      | train
2 | train_metrics | MetricCollection  | 0      | train
3 | val_metrics   | MetricCollection  | 0      | train
4 | test_metrics  | MetricCollection  | 0      | train
------------------------------------------------------------
86.1 M    Trainable params
0         Non-trainable params
86.1 M    Total params
344.393   Total estimated model params size (MB)
168       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 5625/5625 [41:32<00:00,  2.26it/s, v_num=5209]

Metric train_loss improved. New best score: 2.032


Epoch 1: 100%|██████████| 5625/5625 [41:14<00:00,  2.27it/s, v_num=5209]

Metric train_loss improved by 0.151 >= min_delta = 0.01. New best score: 1.880


Epoch 2: 100%|██████████| 5625/5625 [40:42<00:00,  2.30it/s, v_num=5209]

Metric train_loss improved by 0.082 >= min_delta = 0.01. New best score: 1.798


Epoch 3: 100%|██████████| 5625/5625 [41:05<00:00,  2.28it/s, v_num=5209]

Metric train_loss improved by 0.089 >= min_delta = 0.01. New best score: 1.709


Epoch 4: 100%|██████████| 5625/5625 [41:14<00:00,  2.27it/s, v_num=5209]

Metric train_loss improved by 0.082 >= min_delta = 0.01. New best score: 1.628
`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 5625/5625 [41:18<00:00,  2.27it/s, v_num=5209]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\SPCX\Desktop\github-repositories\dl-cnn\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing DataLoader 0: 100%|██████████| 5625/5625 [10:32<00:00,  8.89it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_auroc           0.8539234399795532
      test_f1_macro         0.40828847885131836
     test_precision         0.4262617230415344
       test_recall           0.423633337020874
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_auroc': 0.8539234399795532,
  'test_f1_macro': 0.40828847885131836,
  'test_precision': 0.4262617230415344,
  'test_recall': 0.423633337020874}]

## Conclusion

For our hardware, the model is too big, so we need to set really small batch size = 16. Thus, the training time is so long and the results are not good.