Setup and Environment

In [None]:
!pip install ptflops

In [None]:
!pip install -U fvcore

In [None]:
!pip install albumentations

In [None]:
## Import and setup ##

import os, time, zipfile, gc, glob, torch
import numpy as np
from PIL import Image
from collections import deque

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import albumentations as A
from albumentations.pytorch import ToTensorV2

from ptflops import get_model_complexity_info
from google.colab import drive

from fvcore.nn import FlopCountAnalysis, flop_count_table


# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Mount Google Drive under /content/drive so the dataset, pretrained weights and
# checkpoints referenced later in the notebook are reachable.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted - confirm against the Colab documentation.
drive.mount("/content/drive", force_remount=True)


Mounted at /content/drive


Image preprocessing for training and validation datasets.
The transformations include:
1. Resizing all input images to 512x1024 pixels.
2. Normalizing pixel values using ImageNet statistics
 (mean and standard deviation per channel).
3. Converting images to PyTorch tensors

In [None]:
## Transform ##

# Preprocessing settings shared by the training and validation pipelines.
_resize_hw = dict(height=512, width=1024)
_imagenet_stats = dict(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))

# Training pipeline: resize -> normalize -> convert to tensor.
train_transform = A.Compose([
    A.Resize(**_resize_hw),
    A.Normalize(**_imagenet_stats),
    ToTensorV2(),
])

# Validation pipeline: identical steps, built as an independent object.
val_transform = A.Compose([
    A.Resize(**_resize_hw),
    A.Normalize(**_imagenet_stats),
    ToTensorV2(),
])


Cityscapes Dataset and DataLoader Setup.

This section defines a PyTorch Dataset class for loading the Cityscapes
dataset.
DataLoaders are then created for both the training and the validation split.

In [None]:
## Dataset Cityscapes ##

class CityscapesDataset(Dataset):
    """Cityscapes image / label-mask pairs for semantic segmentation.

    Images are collected from ``{root}/images/{split}`` (any ``*.png``, searched
    recursively) and label masks from ``{root}/gtFine/{split}`` (files ending in
    ``_labelTrainIds.png``). Both listings are sorted and paired by position, so
    the pairing is only meaningful when the two trees hold the same samples.
    """

    def __init__(self, root, split="train", transform=None):
        """
        :param root: dataset root containing the ``images`` and ``gtFine`` folders
        :param split: sub-folder name to read (e.g. ``"train"`` or ``"val"``)
        :param transform: optional callable invoked as
            ``transform(image=..., mask=...)`` that returns a mapping with the
            keys ``'image'`` and ``'mask'``
        """
        import warnings  # local import keeps this class self-contained

        self.transform = transform
        self.images = sorted(glob.glob(f"{root}/images/{split}/**/*.png", recursive=True))
        self.labels = sorted(glob.glob(f"{root}/gtFine/{split}/**/*_labelTrainIds.png", recursive=True))   # TrainId are extracted

        # Pairing is positional: a count mismatch means some image has no mask
        # (or vice versa) and the truncated lists may no longer line up.
        # The truncation is kept for compatibility but is no longer silent.
        if len(self.images) != len(self.labels):
            warnings.warn(
                f"CityscapesDataset: image/label count mismatch for split '{split}' "
                f"({len(self.images)} images vs {len(self.labels)} labels); "
                "truncating to the shorter list, pairs may be misaligned.",
                stacklevel=2,
            )

        min_length = min(len(self.images), len(self.labels))
        self.images = self.images[:min_length]
        self.labels = self.labels[:min_length]

        if min_length == 0:
            warnings.warn(
                f"CityscapesDataset: no samples found for split '{split}' under '{root}'.",
                stacklevel=2,
            )

    def __len__(self):
        """Return the number of image/label pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load, optionally transform, and return the ``(image, label)`` pair at ``idx``.

        The label is always returned as a ``torch.long`` tensor. The image is
        returned as produced by ``transform`` (or as an RGB ``numpy`` array when
        no transform is set).
        """
        # Context managers release the file handles as soon as the pixels are read.
        with Image.open(self.images[idx]) as image_file:
            img = np.array(image_file.convert("RGB"))
        with Image.open(self.labels[idx]) as label_file:
            label = np.array(label_file, dtype=np.int64)

        if self.transform:
            # Apply the same spatial transformation to image and mask
            transformed_img_lab = self.transform(image=img, mask=label)
            # Extract the transformed image and mask
            img = transformed_img_lab['image']
            label = transformed_img_lab['mask']

        # Converting label to LongTensor (the transform may already return a tensor)
        if isinstance(label, np.ndarray):
            label = torch.from_numpy(label).long()
        else:
            label = label.long()

        return img, label


## Create datasets for training and validation ##

# Both splits live under the same dataset root on the mounted drive.
cityscapes_root = "/content/drive/MyDrive/Cityscapes/Cityspaces"

train_dataset = CityscapesDataset(root=cityscapes_root, split="train", transform=train_transform)
val_dataset = CityscapesDataset(root=cityscapes_root, split="val", transform=val_transform)


## DataLoader setup ##

# Settings shared by both loaders; only the shuffling differs.
loader_options = dict(batch_size=2, num_workers=2, pin_memory=True)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)   # No shuffle for validation



Definition of DeepLab v2 model with pre-trained ResNet backbone

In [None]:
## MODEL DeepLabV2 ##

# Passed as ``affine=`` to every BatchNorm2d layer built below.
affine_par = True


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 conv -> 3x3 (dilated) conv -> 1x1 conv.

    Each convolution is followed by BatchNorm whose parameters are excluded
    from gradient updates. The block output has ``planes * 4`` channels.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None):
        """
        :param inplanes: number of input channels
        :param planes: number of channels of the two inner convolutions
        :param stride: stride applied by the first 1x1 convolution
        :param dilation: dilation (and padding) of the 3x3 convolution
        :param downsample: optional module applied to the input to build the
            shortcut branch; the raw input is used when it is ``None``
        """
        super().__init__()
        widened = planes * 4

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, affine=affine_par)
        # padding == dilation keeps the spatial size unchanged for a 3x3 kernel
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=dilation, dilation=dilation, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, affine=affine_par)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened, affine=affine_par)

        # Exclude every BatchNorm parameter of this block from gradient updates.
        for norm in (self.bn1, self.bn2, self.bn3):
            norm.requires_grad_(False)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the three conv/BN stages, add the shortcut and apply the final ReLU."""
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)


class ClassifierModule(nn.Module):
    """Classification head made of parallel dilated 3x3 convolutions.

    One convolution is created per ``(dilation, padding)`` pair; the forward
    pass returns the element-wise sum of all branch outputs.
    """

    def __init__(self, inplanes, dilation_series, padding_series, num_classes):
        """
        :param inplanes: number of input channels
        :param dilation_series: dilation rate of each branch
        :param padding_series: padding of each branch (paired with the dilations)
        :param num_classes: number of output channels of every branch
        """
        super().__init__()
        self.conv2d_list = nn.ModuleList([
            nn.Conv2d(inplanes, num_classes, kernel_size=3, stride=1,
                      padding=pad, dilation=rate, bias=True)
            for rate, pad in zip(dilation_series, padding_series)
        ])

        # Re-draw the branch weights from N(0, 0.01) once all branches exist.
        for branch in self.conv2d_list:
            branch.weight.data.normal_(0, 0.01)

    def forward(self, x):
        """Return the sum of all branch outputs for ``x``."""
        branches = list(self.conv2d_list)
        out = branches[0](x)
        for branch in branches[1:]:
            out += branch(x)
        return out


class ResNetMulti(nn.Module):
    """DeepLab v2 segmentation network: dilated ResNet backbone plus classifier head.

    The backbone is ``conv1``/``bn1``/``maxpool`` followed by four residual
    stages (``layer1`` .. ``layer4``); the last two stages use dilation instead
    of striding. ``layer6`` is the multi-dilation classifier. The logits are
    bilinearly resized to the spatial size of the input.
    """

    def __init__(self, block, layers, num_classes):
        """
        :param block: residual block class; must expose an ``expansion``
            attribute and accept ``(inplanes, planes, stride, dilation=...,
            downsample=...)``
        :param layers: number of blocks in each of the four stages
        :param num_classes: number of output channels of the classifier head
        """
        super(ResNetMulti, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, affine=affine_par)
        for i in self.bn1.parameters():
            i.requires_grad = False
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True)  # change
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)
        # layer4 emits 512 * expansion channels (2048 for the Bottleneck block).
        self.layer6 = ClassifierModule(512 * block.expansion, [6, 12, 18, 24], [6, 12, 18, 24], num_classes)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                m.weight.data.normal_(0, 0.01)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
        """Build one residual stage of ``blocks`` consecutive ``block`` modules.

        A 1x1 conv + BatchNorm projection is used as the shortcut of the first
        block whenever the stage changes stride or channel count, or uses
        dilation 2 or 4. ``self.inplanes`` is updated to the stage's output width.
        """
        downsample = None
        if (stride != 1
                or self.inplanes != planes * block.expansion
                or dilation == 2
                or dilation == 4):
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion, affine=affine_par))
            # Freeze the projection's BatchNorm. This must stay inside the
            # branch: when no projection is needed ``downsample`` is None.
            for i in downsample[1].parameters():
                i.requires_grad = False
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return per-pixel class logits resized to the input's height and width.

        In training mode the result is the tuple ``(logits, None, None)``;
        in evaluation mode it is the logits tensor alone.
        """
        _, _, H, W = x.size()

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer6(x)

        x = torch.nn.functional.interpolate(x, size=(H, W), mode='bilinear')

        if self.training:
            return x, None, None

        return x

    def get_1x_lr_params_no_scale(self):
        """
        Yield every trainable backbone parameter (everything except the
        classification head) exactly once. BatchNorm parameters have
        ``requires_grad`` set to False when the network is built and are
        therefore not returned.
        """
        backbone = (self.conv1, self.bn1, self.layer1,
                    self.layer2, self.layer3, self.layer4)

        # ``parameters()`` already recurses into sub-modules; walking
        # ``modules()`` as well would yield each nested parameter repeatedly
        # and hand duplicates to the optimizer.
        for module in backbone:
            for param in module.parameters():
                if param.requires_grad:
                    yield param

    def get_10x_lr_params(self):
        """
        Yield the parameters of the classification head (``layer6``), plus
        those of ``layer5`` when the instance defines a truthy ``multi_level``
        attribute (this class itself defines neither).
        """
        b = []
        if getattr(self, 'multi_level', False):
            b.append(self.layer5.parameters())
        b.append(self.layer6.parameters())

        for group in b:
            for param in group:
                yield param

    def optim_parameters(self, lr):
        """Return optimizer parameter groups: backbone at ``lr``, head at ``10 * lr``."""
        return [{'params': self.get_1x_lr_params_no_scale(), 'lr': lr},
                {'params': self.get_10x_lr_params(), 'lr': 10 * lr}]


def get_deeplab_v2(num_classes=19, pretrain=True, pretrain_model_path='DeepLab_resnet_pretrained_imagenet.pth'):
    """Build a DeepLab v2 model with block counts [3, 4, 23, 3] and optionally load pretrained weights.

    :param num_classes: number of output classes of the segmentation head
    :param pretrain: when True, load weights from ``pretrain_model_path``
    :param pretrain_model_path: path to a saved state dict whose keys carry one
        extra leading component (stripped before loading)
    :return: the constructed model (left on the CPU; callers move it to a device)
    """
    model = ResNetMulti(Bottleneck, [3, 4, 23, 3], num_classes)

    if pretrain:
        print('Deeplab pretraining loading...')
        # map_location='cpu' lets a checkpoint saved from a GPU load on a
        # CPU-only runtime; load_state_dict copies the values into the model.
        saved_state_dict = torch.load(pretrain_model_path, map_location='cpu')

        new_params = model.state_dict().copy()
        for key in saved_state_dict:
            # Drop the first dot-separated component of every checkpoint key so
            # the remainder lines up with this model's parameter names.
            key_parts = key.split('.')
            new_params['.'.join(key_parts[1:])] = saved_state_dict[key]
        # strict=False: checkpoint entries without a counterpart in the model are skipped.
        model.load_state_dict(new_params, strict=False)

    return model



This section defines two helper functions that:
- compute the per-class IoU and the mean IoU (mIoU);
- measure the model's inference latency and FPS.

In [None]:
def compute_miou(preds, labels, num_classes=19, device=None):
    """
    Compute the mean Intersection over Union (mIoU) for semantic segmentation.

    Pixels labelled 255 ("void") never count as false positives, and since 255
    is not a class id they never count as true positives or false negatives.

    :param preds: integer tensor of predicted class ids
    :param labels: integer tensor of ground-truth class ids, same shape as ``preds``
    :param num_classes: number of classes; ids ``0 .. num_classes - 1`` are scored
    :param device: device for the TP/FP/FN counters; defaults to ``preds.device``
        so the function also works on CPU-only machines
    :return: ``(mean_iou, iou_per_class)``. ``iou_per_class`` only holds the
        classes with at least one TP, FP or FN pixel, in ascending class order,
        so its indices are not class ids when a class is absent. ``mean_iou``
        is 0.0 when no class is present.
    """
    if device is None:
        device = preds.device

    # Per-class counters for True Positives, False Positives and False Negatives
    tp = torch.zeros(num_classes, dtype=torch.int64, device=device)
    fp = torch.zeros(num_classes, dtype=torch.int64, device=device)
    fn = torch.zeros(num_classes, dtype=torch.int64, device=device)

    # The void mask does not depend on the class, so build it once.
    not_void = labels != 255

    for cls in range(num_classes):
        label_is_cls = labels == cls
        pred_is_cls = preds == cls
        # True Positive
        tp[cls] += (label_is_cls & pred_is_cls).sum()
        # False Positive (void pixels excluded)
        fp[cls] += (~label_is_cls & not_void & pred_is_cls).sum()
        # False Negative
        fn[cls] += (label_is_cls & ~pred_is_cls).sum()

    iou_per_class = []

    # Compute IoU for each class and store in a list
    for cls in range(num_classes):
        denom = tp[cls] + fp[cls] + fn[cls]
        iou = tp[cls].float() / (denom.float() + 1e-10)
        print(f"Class {cls}: TP={tp[cls].item()}, FP={fp[cls].item()}, FN={fn[cls].item()}, IoU={iou.item():.4f}")
        if denom > 0:  # only include classes with at least one pixel
            iou_per_class.append(iou.item())

    mean_iou = np.mean(iou_per_class) if iou_per_class else 0.0
    return mean_iou, iou_per_class


def measure_latency_and_fps(model, device, iterations=1000, input_size=(3, 512, 1024), warmup=10):
    """
    Measure inference latency and FPS of a model.

    The model is switched to eval mode, moved to ``device`` and run on a single
    random input. Mean and standard deviation of the per-pass latency (ms) and
    of the FPS are printed; nothing is returned.

    :param model: module to benchmark
    :param device: ``torch.device`` or device string (e.g. ``"cuda"``, ``"cpu"``)
    :param iterations: number of timed forward passes
    :param input_size: ``(C, H, W)`` of the random input; a batch of 1 is used
    :param warmup: untimed forward passes run first so one-off start-up costs
        do not skew the statistics
    """
    # Normalise to torch.device: comparing a torch.device object with the
    # string "cuda" is never true, which would silently skip synchronisation.
    device = torch.device(device)
    on_cuda = device.type == "cuda"

    model.eval().to(device)

    # Create a random tensor
    image = torch.randn(1, *input_size).to(device)

    latencies = []
    fps_list = []

    with torch.no_grad():
        for _ in range(warmup):
            _ = model(image)

        for _ in range(iterations):
            # CUDA kernels run asynchronously: drain pending work before
            # starting the clock and wait for this pass before stopping it.
            if on_cuda:
                torch.cuda.synchronize(device)
            start = time.perf_counter()
            _ = model(image)    # forward pass
            if on_cuda:
                torch.cuda.synchronize(device)
            elapsed = time.perf_counter() - start

            latencies.append(elapsed)
            fps_list.append(1.0 / elapsed)

    mean_latency = np.mean(latencies) * 1000
    std_latency = np.std(latencies) * 1000
    mean_fps = np.mean(fps_list)
    std_fps = np.std(fps_list)

    print(f"Latency : {mean_latency:.2f} ms ± {std_latency:.2f} ms")
    print(f"FPS: {mean_fps:.2f} ± {std_fps:.2f}")

Learning-rate schedule: polynomial decay of the learning rate.

In [None]:
## learning rate ##

def poly_lr_scheduler(optimizer, init_lr, iter, lr_decay_iter=1,
                      max_iter=300, power=0.9):
    """
    Polynomial decay of learning rate: ``init_lr * (1 - iter / max_iter) ** power``.

    Only the first parameter group of ``optimizer`` is updated.

        :param optimizer is the optimizer whose first param group receives the new rate
        :param init_lr is base learning rate
        :param iter is a current iteration
        :param lr_decay_iter is accepted for compatibility but currently unused
        :param max_iter is number of maximum iterations
        :param power is a polynomial power
      Returns the scalar learning rate
    """
    # Clamp progress at 1: past max_iter the base (1 - iter/max_iter) turns
    # negative, and a negative float raised to a fractional power yields a
    # complex number in Python. The rate stays at 0 once the schedule is over.
    progress = min(iter / max_iter, 1.0)

    lr = init_lr * (1 - progress) ** power
    optimizer.param_groups[0]['lr'] = lr
    return lr

This section sets up the DeepLab v2 model with a pre-trained ResNet backbone,
defines the loss function, and configures the optimizer.

In [None]:
## Model, loss function and optimizer ##

# DeepLab v2 initialised from the pre-trained ResNet weights, moved to the active device
pretrained_weights_path = '/content/drive/MyDrive/2aMachineLearning/deeplab_resnet_pretrained_imagenet.pth'
model = get_deeplab_v2(
    num_classes=19,
    pretrain=True,
    pretrain_model_path=pretrained_weights_path,
).to(device)

# Cross-entropy loss; pixels labelled 255 are ignored
criterion = nn.CrossEntropyLoss(ignore_index=255)

# SGD optimizer with momentum and weight decay
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.0025,
    momentum=0.9,
    weight_decay=5e-4,
)

Deeplab pretraining loading...


Checkpoint Management, Device Setup, and AMP Initialization.

This section prepares the training environment by:
   - Creating a directory to save model checkpoints
   - Selecting the computation device (GPU if available, else CPU).
   - Initializing PyTorch’s Automatic Mixed Precision (AMP) GradScaler

In [None]:
## Setup directories, device, and AMP scaler ##

# Directory to save model checkpoints
checkpoint_dir = "/content/drive/MyDrive/2aMachineLearning/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
max_checkpoints = 2


# Select device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Automatic Mixed Precision: GradScaler (None when running on the CPU)
scaler = torch.amp.GradScaler(device='cuda') if device.type == 'cuda' else None

## Restore from latest checkpoint if available ##

# Collect (epoch number, path) for every periodic checkpoint already on disk.
existing_checkpoints = []
for fname in os.listdir(checkpoint_dir):
    if fname.startswith("checkpoint_epoch") and fname.endswith(".pt"):
        try:
            epoch_num = int(fname.split("_epoch")[1].split(".")[0])
        except ValueError:
            # Not a "checkpoint_epoch<N>.pt" file (non-numeric suffix): ignore it.
            continue
        existing_checkpoints.append((epoch_num, os.path.join(checkpoint_dir, fname)))
existing_checkpoints.sort()

# Seed the rotation queue (oldest first) with the files found on disk so that
# checkpoints written before a restart are still pruned down to max_checkpoints.
saved_checkpoints = deque(path for _, path in existing_checkpoints)

if existing_checkpoints:
    latest_epoch, latest_checkpoint = existing_checkpoints[-1]
else:
    latest_epoch, latest_checkpoint = -1, None

start_epoch = 0
if latest_checkpoint:
    # weights_only=False unpickles arbitrary Python objects: only load
    # checkpoint files produced by this notebook.
    checkpoint = torch.load(latest_checkpoint, map_location=device, weights_only=False)     # Load checkpoint

    # Restore model and optimizer state:
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Restore AMP scaler if used
    if scaler and checkpoint.get('scaler_state_dict'):
        scaler.load_state_dict(checkpoint['scaler_state_dict'])

    # Resume from next epoch (the checkpoint stores the 0-based epoch it finished)
    start_epoch = checkpoint['epoch'] + 1
    print(f" Restored from {latest_checkpoint}")
else:
    print(" No checkpoint found. Starting from scratch")

 Restored from /content/drive/MyDrive/2aMachineLearning/checkpoints/checkpoint_epoch48.pt


Training of the model

In [None]:

## Variables for training progress ##

num_epochs = 50   # number of epochs
num_classes = 19    # Number of segmentation classes

# Best mIoU, corresponding checkpoint and list for storing evaluation of loss function on training dataset during epochs
best_miou = 0.0
best_epoch_ckpt = None
epoch_loss_list = []

# When resuming, start from the best validation mIoU recorded so far; otherwise
# the first resumed epoch would overwrite the best model even if it is worse.
best_model_path = os.path.join(checkpoint_dir, "2a_best_epoch.pt")
if start_epoch > 0 and os.path.exists(best_model_path):
    # weights_only=False unpickles arbitrary objects: only load files written by this notebook.
    previous_best = torch.load(best_model_path, map_location="cpu", weights_only=False)
    best_miou = float(previous_best.get('miou', 0.0))
    best_epoch_ckpt = best_model_path
    del previous_best


## hyperparameters ##

# NOTE(review): the optimizer is created with lr=0.0025 but the scheduler below
# overrides it starting from 0.025 - confirm which value is intended.
initial_lr = 0.025    # Initial learning rate
alpha = 0.4   # Weight coefficient for auxiliary loss

# Reuse the GradScaler created (and possibly restored from the checkpoint) in the
# setup cell instead of replacing it with a fresh one; mixed precision is only
# used on CUDA.
if device.type == 'cuda' and scaler is None:
    scaler = torch.amp.GradScaler(device='cuda')
use_amp = device.type == 'cuda'


def _checkpoint_state(epoch, epoch_loss, epoch_miou):
    """Collect everything needed to resume training after ``epoch`` (0-based)."""
    return {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scaler_state_dict': scaler.state_dict() if scaler is not None else None,
        'loss': epoch_loss,
        'miou': float(epoch_miou),
    }


## training loop ##

for epoch in range(start_epoch, num_epochs):
    model.train()
    total_loss = 0.0
    # The schedule is stepped once per epoch, so its horizon is num_epochs
    # (the scheduler's default max_iter=300 would leave the rate almost flat).
    lr = poly_lr_scheduler(optimizer, initial_lr, epoch, max_iter=num_epochs)

    for batch_idx, (images, labels) in enumerate(train_loader):
        images = images.to(device, non_blocking=True)
        labels = labels.long().to(device, non_blocking=True)

        optimizer.zero_grad()

        # Forward step
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(images)

            if isinstance(outputs, (list, tuple)):
                main_pred, *aux_preds = outputs
                main_pred = F.interpolate(main_pred, size=labels.shape[1:], mode='bilinear', align_corners=False)

                # Main loss
                loss = criterion(main_pred, labels)

                # Auxiliary losses (None entries are skipped)
                for aux in aux_preds:
                    if aux is not None:
                        aux = F.interpolate(aux, size=labels.shape[1:], mode='bilinear', align_corners=False)
                        loss += alpha * criterion(aux, labels)
            else:
                main_pred = F.interpolate(outputs, size=labels.shape[1:], mode='bilinear', align_corners=False)
                loss = criterion(main_pred, labels)

        # Backward step
        if use_amp:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()

        del images, labels, outputs, loss
        torch.cuda.empty_cache()
        gc.collect()

    epoch_loss = total_loss/len(train_loader)
    epoch_loss_list.append(epoch_loss)


    ## Miou on validation set ##

    all_predictions = []
    all_labels = []

    model.eval()
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device, non_blocking=True)
            labels = labels.long().to(device, non_blocking=True)

            outputs = model(images)
            if isinstance(outputs, (list, tuple)):
                main_pred = outputs[0]
            else:
                main_pred = outputs

            main_pred = F.interpolate(main_pred, size=labels.shape[1:], mode="bilinear", align_corners=False)

            # argmax over the logits: softmax is monotonic, so it is not needed here
            predictions = torch.argmax(main_pred, dim=1)

            all_predictions.append(predictions)
            all_labels.append(labels)

    all_predictions = torch.cat(all_predictions, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    # Pass the device explicitly so the metric also runs on CPU-only machines.
    epoch_miou, IoU_per_class = compute_miou(all_predictions, all_labels, num_classes=num_classes, device=device)
    print(f" End Epoch {epoch+1} — Loss: {epoch_loss:.4f}, mIoU: {epoch_miou:.4f}, LR at the end of epoch: {lr:.6f}")

    ## Saving checkpoints every 3 epochs ##
    if (epoch + 1) % 3 == 0:
        checkpoint_filename = f"checkpoint_epoch{epoch+1}.pt"
        checkpoint_path = os.path.join(checkpoint_dir, checkpoint_filename)

        # Save model, optimizer, scaler, loss, and mIoU
        torch.save(_checkpoint_state(epoch, epoch_loss, epoch_miou), checkpoint_path)
        saved_checkpoints.append(checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_filename}")

        # Keep only the most recent max_checkpoints periodic checkpoints
        while len(saved_checkpoints) > max_checkpoints:
            old_ckpt = saved_checkpoints.popleft()
            if os.path.exists(old_ckpt):
                os.remove(old_ckpt)
                print(f"Removed old checkpoint: {os.path.basename(old_ckpt)}")

    ## Saving the best model ##
    if epoch_miou > best_miou:  # Update best mIoU
        best_miou = epoch_miou
        best_epoch_ckpt = best_model_path

        # Save model, optimizer, scaler, loss, and mIoU as the best checkpoint
        torch.save(_checkpoint_state(epoch, epoch_loss, epoch_miou), best_epoch_ckpt)
        print(f"New best model saved: {best_epoch_ckpt} con mIoU {best_miou:.4f}")


print(f"epoch_loss_list: {epoch_loss_list}")



  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Computing metrics such as FLOPs, latency and FPS of the final model.

In [None]:
## FLOPs, latency and FPS on the trained model ##
model.eval()

# All-zero dummy batch (1 x 3 x 512 x 1024) used to trace the model for FLOP counting
image = torch.zeros(1, 3, 512, 1024).to(device)

# FLOPs of the model, reported as a per-module table
flops = FlopCountAnalysis(model, image)
flops_table = flop_count_table(flops)
print(flops_table)

# Measure model latency and FPS
measure_latency_and_fps(model, device=device)