In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches
from collections import Counter
import cv2
from glob import glob
from tqdm import tqdm
from termcolor import colored

import torch
from torch import nn, optim
from torch.utils.data import DataLoader

import torchvision
from torchvision import transforms

import albumentations as A
from albumentations.pytorch import ToTensorV2


  check_for_updates()


In [3]:
import torch
import torchvision
import numpy as np


class CustomVOCDataset(torchvision.datasets.VOCDetection):
    def init_config_yolo(self, class_mapping, S=7, B=2, C=20, custom_transforms=None):
        """
        Initialize YOLO-specific configuration parameters.
        Args:
            class_mapping (dict): Mapping of class names to indices.
            S (int): Grid size (S x S).
            B (int): Number of bounding boxes per grid cell.
            C (int): Number of classes.
            custom_transforms (callable): Optional transformations for the dataset.
        """
        self.S = S  # Grid size S x S
        self.B = B  # Number of bounding boxes
        self.C = C  # Number of classes
        self.class_mapping = class_mapping  # Class-to-index mapping
        self.custom_transforms = custom_transforms  # Custom transformations

    def __getitem__(self, index):
        """
        Get an image and its corresponding YOLO-style label matrix.
        """
        # Get an image and its target (annotations) from the VOC dataset
        image, target = super(CustomVOCDataset, self).__getitem__(index)
        img_width, img_height = image.size

        # Convert target annotations to YOLO format bounding boxes
        boxes = convert_to_yolo_format(target, img_width, img_height, self.class_mapping)

        # Separate boxes and labels
        just_boxes = boxes[:, 1:]
        labels = boxes[:, 0]

        # Apply custom transformations if provided
        if self.custom_transforms:
            sample = {
                'image': np.array(image),
                'bboxes': just_boxes,
                'labels': labels,
            }
            sample = self.custom_transforms(**sample)
            image = sample['image']
            boxes = sample['bboxes']
            labels = sample['labels']

        # Create an empty label matrix for YOLO ground truth
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))

        # Convert boxes and labels to PyTorch tensors
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)
        image = torch.as_tensor(image, dtype=torch.float32)

        # Iterate through each bounding box in YOLO format
        for box, class_label in zip(boxes, labels):
            x, y, width, height = box.tolist()
            class_label = int(class_label)

            # Calculate the grid cell (i, j) that this box belongs to
            i, j = int(self.S * y), int(self.S * x)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # Calculate the width and height of the box relative to the grid cell
            width_cell, height_cell = width * self.S, height * self.S

            # If no object has been found in this specific cell (i, j) before
            if label_matrix[i, j, 20] == 0:
                # Mark that an object exists in this cell
                label_matrix[i, j, 20] = 1

                # Store the box coordinates as an offset from the cell boundaries
                box_coordinates = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )

                # Set the box coordinates in the label matrix
                label_matrix[i, j, 21:25] = box_coordinates

                # Set the one-hot encoding for the class label
                label_matrix[i, j, class_label] = 1

        return image, label_matrix


In [4]:
import numpy as np

def convert_to_yolo_format(target, img_width, img_height, class_mapping):
    """
    Convert annotation data from VOC format to YOLO format.

    Parameters:
        target (dict): Annotation data from VOCDetection dataset.
        img_width (int): Width of the original image.
        img_height (int): Height of the original image.
        class_mapping (dict): Mapping from class names to integer IDs.

    Returns:
        np.ndarray: Array of shape [N, 5] for N bounding boxes,
        each with [class_id, x_center, y_center, width, height].
    """
    # Extract the list of annotations from the target dictionary
    annotations = target['annotation']['object']

    # Get the real width and height of the image from the annotation
    real_width = int(target['annotation']['size']['width'])
    real_height = int(target['annotation']['size']['height'])

    # Ensure that annotations is a list, even if there’s only one object
    if not isinstance(annotations, list):
        annotations = [annotations]

    # Initialize an empty list to store the converted bounding boxes
    boxes = []

    # Loop through each annotation and convert it to YOLO format
    for anno in annotations:
        xmin = int(anno['bndbox']['xmin']) / real_width
        xmax = int(anno['bndbox']['xmax']) / real_width
        ymin = int(anno['bndbox']['ymin']) / real_height
        ymax = int(anno['bndbox']['ymax']) / real_height

        # Calculate the center coordinates, width, and height of the bounding box
        x_center = (xmin + xmax) / 2
        y_center = (ymin + ymax) / 2
        width = xmax - xmin
        height = ymax - ymin

        # Retrieve the class name from the annotation and map it to an integer ID
        class_name = anno['name']
        class_id = class_mapping[class_name] if class_name in class_mapping else 0

        # Append the YOLO formatted bounding box to the list
        boxes.append([class_id, x_center, y_center, width, height])

    # Convert the list of boxes to a numpy array
    return np.array(boxes)


In [5]:
import torch

def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Calculate the Intersection over Union (IoU) between bounding boxes.

    Parameters:
        boxes_preds (tensor): Predicted bounding boxes (BATCH_SIZE, 4)
        boxes_labels (tensor): Ground truth bounding boxes (BATCH_SIZE, 4)
        box_format (str): Box format, can be "midpoint" or "corners".

    Returns:
        tensor: Intersection over Union scores for each example.
    """
    # If box format is "midpoint", convert to "corners" format
    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2

        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    # If box format is "corners", use the coordinates directly
    elif box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]

        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]

    # Calculate the coordinates of the intersection rectangle
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # Compute the area of the intersection rectangle, clamp(0) to handle non-overlapping boxes
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # Calculate the area of both bounding boxes
    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # Calculate Intersection over Union (IoU)
    iou = intersection / (box1_area + box2_area - intersection + 1e-6)

    return iou


In [6]:
import torch

def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Perform Non-Maximum Suppression (NMS) on a list of bounding boxes.

    Parameters:
        bboxes (list): List of bounding boxes, each represented as
                       [class_pred, prob_score, x1, y1, x2, y2].
        iou_threshold (float): IoU threshold to determine overlap for suppression.
        threshold (float): Probability threshold to filter bounding boxes.
        box_format (str): Format of bounding boxes, either "midpoint" or "corners".

    Returns:
        list: Bounding boxes after applying NMS.
    """
    # Ensure bboxes is a list
    assert isinstance(bboxes, list), "bboxes must be a list"

    # Filter bounding boxes by probability threshold
    bboxes = [box for box in bboxes if box[1] > threshold]

    # Sort bounding boxes by probability score in descending order
    bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)

    # List to store bounding boxes after NMS
    bboxes_after_nms = []

    # Perform NMS
    while bboxes:
        # Select the bounding box with the highest probability
        chosen_box = bboxes.pop(0)

        # Remove bounding boxes with IoU greater than the threshold
        bboxes = [
            box
            for box in bboxes
            if box[0] != chosen_box[0]  # Keep boxes of different classes
            or intersection_over_union(
                torch.tensor(chosen_box[2:]),
                torch.tensor(box[2:]),
                box_format=box_format,
            )
            < iou_threshold  # Remove boxes with high IoU overlap
        ]

        # Add the chosen box to the result list
        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms


In [7]:
import torch
from collections import Counter


def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Calculate the mean average precision (mAP).

    Parameters:
        pred_boxes (list): List of predicted bounding boxes, each defined as
                           [train_idx, class_pred, prob_score, x1, y1, x2, y2].
        true_boxes (list): List of ground truth bounding boxes, each defined as
                           [train_idx, class_label, x1, y1, x2, y2].
        iou_threshold (float): IoU threshold to consider a prediction as correct.
        box_format (str): "midpoint" or "corners" format for bounding boxes.
        num_classes (int): Total number of classes.

    Returns:
        float: mAP value across all classes at the specified IoU threshold.
    """
    # List to store average precision for each class
    average_precisions = []

    # Small epsilon to avoid division by zero
    epsilon = 1e-6

    for c in range(num_classes):
        # Filter predictions and ground truths for the current class
        detections = [detection for detection in pred_boxes if detection[1] == c]
        ground_truths = [gt for gt in true_boxes if gt[1] == c]

        # Count the number of ground truth boxes for each image
        amount_bboxes = Counter([gt[0] for gt in ground_truths])
        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        # Sort detections by probability in descending order
        detections.sort(key=lambda x: x[2], reverse=True)

        # Initialize True Positive (TP) and False Positive (FP) arrays
        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))

        # Total ground truth boxes for this class
        total_true_bboxes = len(ground_truths)

        # Skip if there are no ground truth boxes for this class
        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            # Get all ground truth boxes for the same image as the detection
            ground_truth_img = [gt for gt in ground_truths if gt[0] == detection[0]]

            best_iou = 0
            best_gt_idx = -1

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            # If IoU is greater than the threshold, mark as True Positive
            if best_iou > iou_threshold:
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1
            else:
                FP[detection_idx] = 1

        # Cumulative sum of True Positives and False Positives
        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)

        # Recall and Precision
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)

        # Add a starting point for recalls and precisions
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))

        # Average Precision for this class (using trapezoidal rule for integration)
        average_precision = torch.trapz(precisions, recalls)
        average_precisions.append(average_precision)

    # Return the mean of average precisions across all classes
    return sum(average_precisions) / len(average_precisions)


In [8]:
import torch
import torch.nn as nn

# YOLOv1 architecture configuration
architecture_config = [
    (7, 64, 2, 3),  # Convolutional block 1
    "M",            # Max-pooling layer 1
    (3, 192, 1, 1),  # Convolutional block 2
    "M",            # Max-pooling layer 2
    (1, 128, 1, 0),  # Convolutional block 3
    (3, 256, 1, 1),  # Convolutional block 4
    (1, 256, 1, 0),  # Convolutional block 5
    (3, 512, 1, 1),  # Convolutional block 6
    "M",            # Max-pooling layer 3
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],  # Convolutional block 7 (repeated 4 times)
    (1, 512, 1, 0),  # Convolutional block 8
    (3, 1024, 1, 1), # Convolutional block 9
    "M",            # Max-pooling layer 4
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],  # Convolutional block 10 (repeated 2 times)
    (3, 1024, 1, 1), # Convolutional block 11
    (3, 1024, 2, 1), # Convolutional block 12
    (3, 1024, 1, 1), # Convolutional block 13
    (3, 1024, 1, 1), # Convolutional block 14
]

# A convolutional block with Conv2d, BatchNorm2d, and LeakyReLU layers
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(CNNBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))

# YOLOv1 model class
class Yolov1(nn.Module):
    def __init__(self, in_channels=3, **kwargs):
        super(Yolov1, self).__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        x = self.darknet(x)
        return self.fcs(torch.flatten(x, start_dim=1))

    def _create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if type(x) == tuple:
                layers += [
                    CNNBlock(
                        in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3],
                    )
                ]
                in_channels = x[1]

            elif type(x) == str:
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]

            elif type(x) == list:
                conv1 = x[0]
                conv2 = x[1]
                num_repeats = x[2]

                for _ in range(num_repeats):
                    layers += [
                        CNNBlock(
                            in_channels,
                            conv1[1],
                            kernel_size=conv1[0],
                            stride=conv1[2],
                            padding=conv1[3],
                        )
                    ]
                    layers += [
                        CNNBlock(
                            conv1[1],
                            conv2[1],
                            kernel_size=conv2[0],
                            stride=conv2[2],
                            padding=conv2[3],
                        )
                    ]
                    in_channels = conv2[1]

        return nn.Sequential(*layers)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        S, B, C = split_size, num_boxes, num_classes

        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 4096),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, S * S * (C + B * 5)),  # Final output layer
        )


In [9]:
import torch
import torch.nn as nn

class YoloLoss(nn.Module):
    """
    Calculate the loss for the YOLO (v1) model.
    """
    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")  # Mean Squared Error loss with sum reduction
        self.S = S  # Grid size
        self.B = B  # Number of bounding boxes
        self.C = C  # Number of classes
        self.lambda_noobj = 0.5  # Weight for no object loss
        self.lambda_coord = 5  # Weight for box coordinate loss

    def forward(self, predictions, target):
        """
        Compute YOLOv1 loss.

        Args:
            predictions: Tensor of shape (BATCH_SIZE, S*S*(C + B*5)).
            target: Tensor of same shape as predictions.

        Returns:
            loss: Total loss (scalar).
        """
        # Reshape predictions to (BATCH_SIZE, S, S, C + B*5)
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)

        # Calculate IoU for the two predicted bounding boxes with the target box
        iou_b1 = intersection_over_union(predictions[..., 21:25], target[..., 21:25])
        iou_b2 = intersection_over_union(predictions[..., 26:30], target[..., 21:25])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # Determine which bounding box has the highest IoU
        iou_maxes, bestbox = torch.max(ious, dim=0)
        exists_box = target[..., 20].unsqueeze(3)  # Indicator for object presence

        # ======================== #
        #   FOR BOX COORDINATES    #
        # ======================== #
        box_predictions = exists_box * (
            bestbox * predictions[..., 26:30] + (1 - bestbox) * predictions[..., 21:25]
        )
        box_targets = exists_box * target[..., 21:25]

        # Square root of width and height to stabilize gradients
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4]) + 1e-6
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ==================== #
        #   FOR OBJECT LOSS    #
        # ==================== #
        pred_box = (
            bestbox * predictions[..., 25:26] + (1 - bestbox) * predictions[..., 20:21]
        )
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., 20:21]),
        )

        # ======================= #
        #   FOR NO OBJECT LOSS    #
        # ======================= #
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., 20:21], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1),
        )
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., 25:26], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1),
        )

        # ================== #
        #   FOR CLASS LOSS   #
        # ================== #
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :20], end_dim=-2),
            torch.flatten(exists_box * target[..., :20], end_dim=-2),
        )

        # ================== #
        #   TOTAL LOSS       #
        # ================== #
        loss = (
            self.lambda_coord * box_loss  # Box coordinate loss
            + object_loss                 # Object presence loss
            + self.lambda_noobj * no_object_loss  # No object loss
            + class_loss                  # Classification loss
        )

        return loss


In [10]:
# Set the random seed for reproducibility
seed = 123
torch.manual_seed(seed)

# Hyperparameters and configurations
LEARNING_RATE = 2e-5  # Learning rate for the optimizer
DEVICE = "cuda"  # Use "cuda" for GPU or "cpu" for CPU
BATCH_SIZE = 16  # Batch size (originally 64 in the paper, reduced for GPU limitations)
EPOCHS = 300  # Number of training epochs
NUM_WORKERS = 2  # Number of worker processes for data loading
PIN_MEMORY = True  # Pin memory for faster data transfer to GPU
LOAD_MODEL = False  # Set to True to load a pre-trained model
LOAD_MODEL_FILE = "yolov1.pth.tar"  # Pre-trained model file (if LOAD_MODEL is True)


In [11]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Image dimensions
WIDTH = 448
HEIGHT = 448

# Training transformations
def get_train_transforms():
    return A.Compose(
        [
            A.OneOf(
                [
                    A.HueSaturationValue(
                        hue_shift_limit=0.2, 
                        sat_shift_limit=0.2, 
                        val_shift_limit=0.2, 
                        p=0.9
                    ),
                    A.RandomBrightnessContrast(
                        brightness_limit=0.2, 
                        contrast_limit=0.2, 
                        p=0.9
                    ),
                ],
                p=0.9,
            ),
            A.ToGray(p=0.01),
            A.HorizontalFlip(p=0.2),
            A.VerticalFlip(p=0.2),
            A.Resize(height=HEIGHT, width=WIDTH, p=1.0),
            # Uncomment if you want to use Cutout for data augmentation
            # A.Cutout(
            #     num_holes=8, 
            #     max_h_size=64, 
            #     max_w_size=64, 
            #     fill_value=0, 
            #     p=0.5
            # ),
            ToTensorV2(p=1.0),
        ],
        p=1.0,
        bbox_params=A.BboxParams(
            format='yolo', 
            min_area=0, 
            min_visibility=0, 
            label_fields=['labels']
        ),
    )

# Validation transformations
def get_valid_transforms():
    return A.Compose(
        [
            A.Resize(height=HEIGHT, width=WIDTH, p=1.0),
            ToTensorV2(p=1.0),
        ],
        p=1.0,
        bbox_params=A.BboxParams(
            format='yolo', 
            min_area=0, 
            min_visibility=0, 
            label_fields=['labels']
        ),
    )


In [12]:
class_mapping = {
    'aeroplane': 0,
    'bicycle': 1,
    'bird': 2,
    'boat': 3,
    'bottle': 4,
    'bus': 5,
    'car': 6,
    'cat': 7,
    'chair': 8,
    'cow': 9,
    'diningtable': 10,
    'dog': 11,
    'horse': 12,
    'motorbike': 13,
    'person': 14,
    'pottedplant': 15,
    'sheep': 16,
    'sofa': 17,
    'train': 18,
    'tvmonitor': 19,
}


In [13]:
import torch
from termcolor import colored

def train_fn(train_loader, model, optimizer, loss_fn, epoch):
    """
    Training function for YOLO.

    Args:
        train_loader: DataLoader for training data.
        model: YOLO model.
        optimizer: Optimizer for model training.
        loss_fn: Loss function for YOLO.
        epoch: Current epoch number.

    Returns:
        avg_mAP: Average mean Average Precision (mAP) for the epoch.
    """
    mean_loss = []
    mean_mAP = []

    total_batches = len(train_loader)
    display_interval = total_batches // 5  # Log progress at 20% intervals

    model.train()

    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(DEVICE), y.to(DEVICE)
        out = model(x)
        loss = loss_fn(out, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Generate predicted and true bounding boxes
        pred_boxes, true_boxes = get_bboxes_training(
            out, y, iou_threshold=0.5, threshold=0.4
        )
        mAP = mean_average_precision(
            pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint"
        )

        mean_loss.append(loss.item())
        mean_mAP.append(mAP.item())

        if batch_idx % display_interval == 0 or batch_idx == total_batches - 1:
            print(
                f"Epoch: {epoch:3} \t Iter: {batch_idx:3}/{total_batches:3} \t"
                f"Loss: {loss.item():3.10f} \t mAP: {mAP.item():3.10f}"
            )

    avg_loss = sum(mean_loss) / len(mean_loss)
    avg_mAP = sum(mean_mAP) / len(mean_mAP)

    print(colored(f"Train \t loss: {avg_loss:3.10f} \t mAP: {avg_mAP:3.10f}", "green"))

    return avg_mAP


In [14]:
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
import torch.optim as optim
from termcolor import colored

def train():
    # Initialize model, optimizer, and loss function
    model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_fn = YoloLoss()

    # Load pre-trained model checkpoint if required
    if LOAD_MODEL:
        load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)

    # Create and configure training dataset
    train_dataset = CustomVOCDataset(
        root='./data',
        year='2012',
        image_set='train',
        download=True,
    )
    train_dataset.init_config_yolo(class_mapping=class_mapping, custom_transforms=get_train_transforms())

    # Create and configure validation and test datasets
    testval_dataset = CustomVOCDataset(
        root='./data',
        year='2012',
        image_set='val',
        download=True,
    )
    testval_dataset.init_config_yolo(class_mapping=class_mapping, custom_transforms=get_valid_transforms())

    # Split dataset into validation and test sets
    dataset_size = len(testval_dataset)
    val_size = int(0.15 * dataset_size)
    test_size = dataset_size - val_size

    val_indices = list(range(val_size))
    test_indices = list(range(val_size, val_size + test_size))

    # Create SubsetRandomSamplers for validation and test sets
    val_sampler = SubsetRandomSampler(val_indices)
    test_sampler = SubsetRandomSampler(test_indices)

    # Create DataLoaders
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=testval_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        sampler=val_sampler,
        drop_last=False,
    )

    test_loader = DataLoader(
        dataset=testval_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        sampler=test_sampler,
        drop_last=False,
    )

    # Initialize best mAP trackers
    best_mAP_train = 0
    best_mAP_val = 0
    best_mAP_test = 0

    # Training loop
    for epoch in range(EPOCHS):
        # Train and evaluate on train, validation, and test sets
        train_mAP = train_fn(train_loader, model, optimizer, loss_fn, epoch)
        val_mAP = test_fn(val_loader, model, loss_fn, epoch)  # Using test_fn for validation
        test_mAP = test_fn(test_loader, model, loss_fn, epoch)

        # Update best mAP values
        if train_mAP > best_mAP_train:
            best_mAP_train = train_mAP
        if val_mAP > best_mAP_val:
            best_mAP_val = val_mAP
            # Save checkpoint when validation mAP improves
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)
        if test_mAP > best_mAP_test:
            best_mAP_test = test_mAP

        # Print best mAP values
        print(colored(f"Best Train mAP: {best_mAP_train:3.10f}", 'green'))
        print(colored(f"Best Val mAP: {best_mAP_val:3.10f}", 'blue'))
        print(colored(f"Best Test mAP: {best_mAP_test:3.10f}", 'yellow'))


In [15]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def plot_image_with_labels(image, ground_truth_boxes, predicted_boxes, class_mapping):
    """
    Plot an image with both ground truth and predicted bounding boxes.

    Args:
        image: The image to plot (PyTorch tensor or NumPy array).
        ground_truth_boxes: List of ground truth boxes in YOLO format
                            [class_id, confidence, x_center, y_center, width, height].
        predicted_boxes: List of predicted boxes in YOLO format
                         [class_id, confidence, x_center, y_center, width, height].
        class_mapping: Dictionary mapping class IDs to class names.
    """
    # Invert the class mapping for easier access to class names
    inverted_class_mapping = {v: k for k, v in class_mapping.items()}

    # Convert the image to a NumPy array and get its dimensions
    if isinstance(image, torch.Tensor):
        image = image.permute(1, 2, 0).cpu().numpy()
    height, width, _ = image.shape

    # Create the plot
    fig, ax = plt.subplots(1)
    ax.imshow(image)

    # Plot ground truth boxes (green)
    for box in ground_truth_boxes:
        label_index, box = box[0], box[2:]
        upper_left_x = box[0] - box[2] / 2
        upper_left_y = box[1] - box[3] / 2
        rect = patches.Rectangle(
            (upper_left_x * width, upper_left_y * height),
            box[2] * width,
            box[3] * height,
            linewidth=1,
            edgecolor="green",
            facecolor="none",
        )
        ax.add_patch(rect)
        class_name = inverted_class_mapping.get(label_index, "Unknown")
        ax.text(
            upper_left_x * width,
            upper_left_y * height,
            class_name,
            color="white",
            fontsize=12,
            bbox=dict(facecolor="green", alpha=0.2),
        )

    # Plot predicted boxes (red)
    for box in predicted_boxes:
        label_index, box = box[0], box[2:]
        upper_left_x = box[0] - box[2] / 2
        upper_left_y = box[1] - box[3] / 2
        rect = patches.Rectangle(
            (upper_left_x * width, upper_left_y * height),
            box[2] * width,
            box[3] * height,
            linewidth=1,
            edgecolor="red",
            facecolor="none",
        )
        ax.add_patch(rect)
        class_name = inverted_class_mapping.get(label_index, "Unknown")
        ax.text(
            upper_left_x * width,
            upper_left_y * height,
            class_name,
            color="white",
            fontsize=12,
            bbox=dict(facecolor="red", alpha=0.2),
        )

    plt.show()


In [16]:
import torch
from torch.utils.data import DataLoader

def test():
    """
    Test the YOLO model on the validation set and visualize predictions.
    """
    # Initialize the YOLO model
    model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)

    # Load pre-trained model weights
    if LOAD_MODEL:
        model.load_state_dict(torch.load(LOAD_MODEL_FILE)["state_dict"])

    # Prepare test dataset and DataLoader
    test_dataset = CustomVOCDataset(root='./data', image_set='val', download=False)
    test_dataset.init_config_yolo(class_mapping=class_mapping, custom_transforms=get_valid_transforms())
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=False,
        drop_last=False,
    )

    model.eval()
    with torch.no_grad():
        # Process the first batch from the test loader
        for x, y in test_loader:
            x = x.to(DEVICE)
            out = model(x)

            # Convert outputs and ground truth to bounding boxes
            pred_bboxes = cellboxes_to_boxes(out)
            gt_bboxes = cellboxes_to_boxes(y)

            # Visualize the first 8 images with bounding boxes
            for idx in range(min(8, len(x))):
                pred_box = non_max_suppression(
                    pred_bboxes[idx], iou_threshold=0.5, threshold=0.4, box_format="midpoint"
                )
                gt_box = non_max_suppression(
                    gt_bboxes[idx], iou_threshold=0.5, threshold=0.4, box_format="midpoint"
                )
                image = x[idx].permute(1, 2, 0).cpu() / 255.0
                plot_image_with_labels(image, gt_box, pred_box, class_mapping)

            break  # Process only the first batch


In [17]:
train()

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to ./data/VOCtrainval_11-May-2012.tar


100%|██████████| 2.00G/2.00G [04:26<00:00, 7.51MB/s]


Extracting ./data/VOCtrainval_11-May-2012.tar to ./data
Using downloaded and verified file: ./data/VOCtrainval_11-May-2012.tar
Extracting ./data/VOCtrainval_11-May-2012.tar to ./data


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [16, 7, 7, 2]], which is output 0 of AsStridedBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [None]:
test()