# 15 Minutes to Get Started with MMEngine

https://mmengine.readthedocs.io/en/latest/get_started/15_minutes.html

In [None]:
from pathlib import Path
from pprint import pprint

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

import torchvision
import torchvision.transforms.v2 as T
import torchvision.transforms.v2.functional as TF

from mmengine.model import BaseModel

  _torch_pytree._register_pytree_node(


The tutorial uses a ResNet50 from `torchvision.models`, but let's take a look at what our options are through `torchvision`.

In [None]:
# Ask torchvision for the names of the models it exposes and pretty-print
# the resulting list so we can see what there is to choose from.
available_models = torchvision.models.list_models()
pprint(available_models)

['alexnet',
 'convnext_base',
 'convnext_large',
 'convnext_small',
 'convnext_tiny',
 'deeplabv3_mobilenet_v3_large',
 'deeplabv3_resnet101',
 'deeplabv3_resnet50',
 'densenet121',
 'densenet161',
 'densenet169',
 'densenet201',
 'efficientnet_b0',
 'efficientnet_b1',
 'efficientnet_b2',
 'efficientnet_b3',
 'efficientnet_b4',
 'efficientnet_b5',
 'efficientnet_b6',
 'efficientnet_b7',
 'efficientnet_v2_l',
 'efficientnet_v2_m',
 'efficientnet_v2_s',
 'fasterrcnn_mobilenet_v3_large_320_fpn',
 'fasterrcnn_mobilenet_v3_large_fpn',
 'fasterrcnn_resnet50_fpn',
 'fasterrcnn_resnet50_fpn_v2',
 'fcn_resnet101',
 'fcn_resnet50',
 'fcos_resnet50_fpn',
 'googlenet',
 'inception_v3',
 'keypointrcnn_resnet50_fpn',
 'lraspp_mobilenet_v3_large',
 'maskrcnn_resnet50_fpn',
 'maskrcnn_resnet50_fpn_v2',
 'maxvit_t',
 'mc3_18',
 'mnasnet0_5',
 'mnasnet0_75',
 'mnasnet1_0',
 'mnasnet1_3',
 'mobilenet_v2',
 'mobilenet_v3_large',
 'mobilenet_v3_small',
 'mvit_v1_b',
 'mvit_v2_s',
 'quantized_googlenet',
 '

In [None]:
# MMEngine provides some useful model analysis tools.
# Let's analyse the number of parameters and flops of the model.
from mmengine.analysis import get_model_complexity_info

# This function returns an analysis of the model as a dict:
#   out_table (str): a summary analysis of the model as a table.
#       NB: this includes the total params, flops, and activations.
#   out_arch (str): an analysis inline with the repr of the model arch.
#   activations_str (str): the total number of activations of the model.
#   flops_str (str): the total flops of the model.
#   params_str (str): the total number of params of the model.

def summarize_model(model, input_shape=(3, 224, 224)):
    """Print MMEngine's complexity summary table for ``model``.

    Args:
        model: The network to analyse.
        input_shape: Input shape forwarded unchanged to
            ``get_model_complexity_info``. Defaults to ``(3, 224, 224)``.

    Returns:
        None. Only the ``'out_table'`` entry of the analysis is printed.
    """
    print(get_model_complexity_info(model, input_shape)['out_table'])

In [None]:
# Build a ViT-B/16 with torchvision's default arguments (no `weights` are
# requested here, so presumably the parameters are randomly initialised).
vit_b_16 = torchvision.models.vit_b_16()

In [None]:
# Print the parameter / FLOP / activation table for the ViT.
summarize_model(vit_b_16)

encoder.layers.encoder_layer_0.self_attention.out_proj, encoder.layers.encoder_layer_1.self_attention.out_proj, encoder.layers.encoder_layer_10.self_attention.out_proj, encoder.layers.encoder_layer_11.self_attention.out_proj, encoder.layers.encoder_layer_2.self_attention.out_proj, encoder.layers.encoder_layer_3.self_attention.out_proj, encoder.layers.encoder_layer_4.self_attention.out_proj, encoder.layers.encoder_layer_5.self_attention.out_proj, encoder.layers.encoder_layer_6.self_attention.out_proj, encoder.layers.encoder_layer_7.self_attention.out_proj, encoder.layers.encoder_layer_8.self_attention.out_proj, encoder.layers.encoder_layer_9.self_attention.out_proj



+------------------------------------+----------------------+-----------+--------------+
|[1m [0m[1mmodule                            [0m[1m [0m|[1m [0m[1m#parameters or shape[0m[1m [0m|[1m [0m[1m#flops   [0m[1m [0m|[1m [0m[1m#activations[0m[1m [0m|
+------------------------------------+----------------------+-----------+--------------+
| model                              | 86.568M              | 16.867G   | 16.491M      |
|  class_token                       |  (1, 1, 768)         |           |              |
|  conv_proj                         |  0.591M              |  0.116G   |  0.151M      |
|   conv_proj.weight                 |   (768, 3, 16, 16)   |           |              |
|   conv_proj.bias                   |   (768,)             |           |              |
|  encoder                           |  85.207M             |  16.751G  |  16.34M      |
|   encoder.pos_embedding            |   (1, 197, 768)      |           |              |
|   encoder.l

Well, ViT-base is pretty huge!
Let's try Swin and see if there is any difference.

In [None]:
 swin_v2_t = torchvision.models. swin_v2_t()

In [None]:
# Print the same complexity table for the Swin model, for comparison.
summarize_model(swin_v2_t)

features.1.0.attn.proj, features.1.0.attn.qkv, features.1.0.stochastic_depth, features.1.1.attn.proj, features.1.1.attn.qkv, features.3.0.attn.proj, features.3.0.attn.qkv, features.3.1.attn.proj, features.3.1.attn.qkv, features.5.0.attn.proj, features.5.0.attn.qkv, features.5.1.attn.proj, features.5.1.attn.qkv, features.5.2.attn.proj, features.5.2.attn.qkv, features.5.3.attn.proj, features.5.3.attn.qkv, features.5.4.attn.proj, features.5.4.attn.qkv, features.5.5.attn.proj, features.5.5.attn.qkv, features.7.0.attn.proj, features.7.0.attn.qkv, features.7.1.attn.proj, features.7.1.attn.qkv



+-------------------------+----------------------+------------+--------------+
|[1m [0m[1mmodule                 [0m[1m [0m|[1m [0m[1m#parameters or shape[0m[1m [0m|[1m [0m[1m#flops    [0m[1m [0m|[1m [0m[1m#activations[0m[1m [0m|
+-------------------------+----------------------+------------+--------------+
| model                   | 28.352M              | 4.956G     | 20.913M      |
|  features               |  27.581M             |  4.955G    |  20.912M     |
|   features.0            |   4.896K             |   15.956M  |   0.301M     |
|    features.0.0         |    4.704K            |    14.451M |    0.301M    |
|    features.0.2         |    0.192K            |    1.505M  |    0         |
|   features.1            |   0.23M              |   0.778G   |   7.457M     |
|    features.1.0         |    0.115M            |    0.389G  |    3.729M    |
|    features.1.1         |    0.115M            |    0.389G  |    3.729M    |
|   features.2            |   74.1

## Let's build a Model!

We'll wrap a model from `torchvision` in MMEngine's `BaseModel`.

In [None]:
# Let's try the smallest ViT we can get, a ViT-B/16.
# NOTE(review): torchvision's ViT presumably expects 224x224 inputs by default,
# so feeding it 32x32 images would need a resize step -- confirm before use.
vit_b_16 = torchvision.models.vit_b_16()


class MMViT_B16(BaseModel):
    """Expose a torchvision classifier through MMEngine's ``BaseModel`` API.

    Args:
        model: Network that maps a batch of images to per-class scores.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, imgs, labels, mode):
        """Run the wrapped network and shape the result according to ``mode``.

        Args:
            imgs: Batch of inputs passed straight to the wrapped network.
            labels: Targets for ``imgs`` in a form ``F.cross_entropy`` accepts.
            mode: One of ``'loss'``, ``'predict'`` or ``'tensor'``.

        Returns:
            ``{'loss': <cross entropy>}`` in ``'loss'`` mode, the tuple
            ``(scores, labels)`` in ``'predict'`` mode, and the raw network
            output in ``'tensor'`` mode.

        Raises:
            ValueError: If ``mode`` is none of the supported values. The
                original implementation silently returned ``None`` here.
        """
        x = self.model(imgs)
        if mode == 'loss':
            return {'loss': F.cross_entropy(x, labels)}
        if mode == 'predict':
            return x, labels
        if mode == 'tensor':
            return x
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'loss', 'predict' or 'tensor'."
        )

In [None]:
# Build a ResNet-50 with torchvision's default arguments.
resnet50 = torchvision.models.resnet50()


class MMResNet50(BaseModel):
    """Expose a torchvision classifier through MMEngine's ``BaseModel`` API.

    Args:
        model: Network that maps a batch of images to per-class scores.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, imgs, labels, mode):
        """Run the wrapped network and shape the result according to ``mode``.

        Args:
            imgs: Batch of inputs passed straight to the wrapped network.
            labels: Targets for ``imgs`` in a form ``F.cross_entropy`` accepts.
            mode: One of ``'loss'``, ``'predict'`` or ``'tensor'``.

        Returns:
            ``{'loss': <cross entropy>}`` in ``'loss'`` mode, the tuple
            ``(scores, labels)`` in ``'predict'`` mode, and the raw network
            output in ``'tensor'`` mode.

        Raises:
            ValueError: If ``mode`` is none of the supported values. The
                original implementation silently returned ``None`` here.
        """
        x = self.model(imgs)
        if mode == 'loss':
            return {'loss': F.cross_entropy(x, labels)}
        if mode == 'predict':
            return x, labels
        if mode == 'tensor':
            return x
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'loss', 'predict' or 'tensor'."
        )


model = MMResNet50(resnet50)

In [None]:
# This is the network handed to the Runner further down, where it is trained
# on CIFAR-10. Size the classification head for its 10 classes instead of
# keeping torchvision's default 1000-way output.
resnet18 = torchvision.models.resnet18(num_classes=10)


class ModelWrapper(BaseModel):
    """Expose a torchvision classifier through MMEngine's ``BaseModel`` API.

    Args:
        model: Network that maps a batch of images to per-class scores.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, imgs, labels, mode):
        """Run the wrapped network and shape the result according to ``mode``.

        Args:
            imgs: Batch of inputs passed straight to the wrapped network.
            labels: Targets for ``imgs`` in a form ``F.cross_entropy`` accepts.
            mode: One of ``'loss'``, ``'predict'`` or ``'tensor'``.

        Returns:
            ``{'loss': <cross entropy>}`` in ``'loss'`` mode, the tuple
            ``(scores, labels)`` in ``'predict'`` mode, and the raw network
            output in ``'tensor'`` mode.

        Raises:
            ValueError: If ``mode`` is none of the supported values. The
                original implementation silently returned ``None`` here.
        """
        x = self.model(imgs)
        if mode == 'loss':
            return {'loss': F.cross_entropy(x, labels)}
        if mode == 'predict':
            return x, labels
        if mode == 'tensor':
            return x
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'loss', 'predict' or 'tensor'."
        )


model = ModelWrapper(resnet18)

## Let's get some data!

We need to create a `Dataset` and `DataLoader` for training and validation.
Let's grab a dataset from `torchvision`.

In [None]:
# We'll be going for CIFAR10.

# Directory the dataset is downloaded to / read from.
dpath = Path.home() / 'Data' / 'cifar10'

# First, let's define the transforms.
# Per-channel mean / std passed to `T.Normalize` for both splits.
norm_cfg = dict(
    mean=[0.491, 0.482, 0.447],
    std=[0.202, 0.199, 0.201],
)

# NOTE: `T.ToTensor()` is deprecated in `torchvision.transforms.v2`; the
# documented replacement is `T.ToImage()` followed by
# `T.ToDtype(torch.float32, scale=True)`, which is what is used below.

# ---------------------------------  train  --------------------------------- #

# Light augmentation (random crop with padding + horizontal flip), then
# conversion to a scaled float image and normalisation.
train_transforms = T.Compose([
    T.RandomCrop(32, padding=4),
    T.RandomHorizontalFlip(),
    T.ToImage(),
    T.ToDtype(torch.float32, scale=True),
    T.Normalize(**norm_cfg),
])

train_dataset = torchvision.datasets.CIFAR10(
    dpath,
    train=True,
    download=True,
    transform=train_transforms,
)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# ----------------------------------  val  ---------------------------------- #

# No augmentation for validation: only the conversion and normalisation.
val_transforms = T.Compose([
    T.ToImage(),
    T.ToDtype(torch.float32, scale=True),
    T.Normalize(**norm_cfg),
])

val_dataset = torchvision.datasets.CIFAR10(
    dpath,
    train=False,
    download=True,
    transform=val_transforms,
)

val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)



Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /home/evan/Data/cifar10/cifar-10-python.tar.gz


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170498071/170498071 [00:19<00:00, 8547401.31it/s]


Extracting /home/evan/Data/cifar10/cifar-10-python.tar.gz to /home/evan/Data/cifar10
Files already downloaded and verified


## Build an Evaluation Metric
To evaluate model performance, we need to define a **Metric** called accuracy. 
This metric needs to inherit from `BaseMetric` and implement the
`process` and `compute_metrics` methods.

`process`: 
This method receives a batch of data from the dataloader and outputs from the
model (in `mode="predict"`). 
After processing this data, we save the information to the `self.results` property.

```{note}
In a distributed environment, `self.results` is the information collected 
from all the processes.
```

`compute_metrics`:
This method receives the `results` processed by `process` to calculate an
evaluation metric, returning the information as a `dict`.

In [None]:
from mmengine.evaluator import BaseMetric

class Accuracy(BaseMetric):
    """Top-1 classification accuracy, reported as a percentage."""

    def process(self, data_batch, data_samples):
        """Record the size and number of correct predictions of one batch.

        Args:
            data_batch: The batch from the dataloader (unused here).
            data_samples: The ``(score, gt)`` pair from the model; ``score``
                is arg-maxed over dim 1 and compared element-wise with ``gt``.
        """
        score, gt = data_samples
        # Store plain Python ints so the final metric is an ordinary float
        # rather than a 0-dim tensor.
        correct = (score.argmax(dim=1) == gt).sum().item()
        # Save the intermediate result of a batch to `self.results`.
        self.results.append({
            'batch_size': len(gt),
            'correct': correct,
        })

    def compute_metrics(self, results):
        """Aggregate the per-batch records into a single accuracy value.

        Args:
            results: The dicts appended by ``process``.

        Returns:
            dict: ``{'accuracy': <percentage>}``; ``0.0`` when no samples were
            processed (the original raised ``ZeroDivisionError`` in that case).
        """
        total_correct = sum(item['correct'] for item in results)
        total_size = sum(item['batch_size'] for item in results)
        if total_size == 0:
            return dict(accuracy=0.0)
        accuracy = 100 * total_correct / total_size
        # The key is the name of the metric.
        return dict(accuracy=accuracy)

## Build a Runner and SEND IT 🤙🏻

Now we can build a Runner with the previously defined Model, DataLoader, and Metric, plus some other configs shown as follows:

In [None]:
# Directory under the user's home that is handed to the Runner as `work_dir`.
work_dir = Path.home().joinpath('Experiments', 'ShibaTown')

In [None]:
from torch.optim import SGD
from mmengine.runner import Runner


runner = Runner(
    # The model used for training and validation.
    # It needs to meet MMEngine's model interface requirements.
    model=model,
    # Working directory in which training logs and weight files are saved.
    work_dir=work_dir,
    # The train dataloader needs to meet the PyTorch data loader protocol.
    train_dataloader=train_dataloader,
    # Optimizer wrapper: optimization with additional features like
    # AMP, gradient accumulation, etc.
    optim_wrapper=dict(optimizer=dict(type=SGD, lr=0.001, momentum=0.9)),
    # Training configs: number of training epochs, validation interval, etc.
    train_cfg=dict(by_epoch=True, max_epochs=2, val_interval=1),
    # The validation dataloader also needs to meet the PyTorch data loader protocol.
    val_dataloader=val_dataloader,
    # Validation configs for any additional parameters required for validation.
    val_cfg=dict(),
    # Validation evaluator: the custom `Accuracy` metric is used here.
    val_evaluator=dict(type=Accuracy),
    # Fix the random seed so that repeated runs are comparable; without this
    # the Runner draws a fresh random seed on every run.
    randomness=dict(seed=0),
)

runner.train()

02/05 22:51:06 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.11.7 (main, Dec 20 2023, 09:04:31) [GCC 12.3.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 1743727311
    GPU 0: NVIDIA RTX A500 Laptop GPU
    CUDA_HOME: /usr/local/cuda
    NVCC: Cuda compilation tools, release 12.3, V12.3.107
    GCC: gcc (Ubuntu 12.3.0-1ubuntu1~22.04) 12.3.0
    PyTorch: 2.2.0+cu121
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.3.2 (Git Hash 2dc95a2ad0841e29db8b22fbccaf3e5da7992b01)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.1
  - NVCC architecture flags: -gencode;arch=compu

ModelWrapper(
  (data_preprocessor): BaseDataPreprocessor()
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64

In [None]:
# Final cell: print a marker once everything above has run.
print('Fin!')

Fin!
