## Building YOLO from Scratch in PyTorch

## Workshop Outline
0. Dependencies and Libraries
1. Going over the YOLO architecture in code
2. Preparing the dataset
3. Training the model
4. Evaluating the model
5. Looking at its detections/results

## Dependencies and Libraries
The following is the list of libraries you need to install:
- Python (version >= 3.9)
- [PyTorch (CUDA version / CPU-Only)](https://pytorch.org/get-started/locally/)
    - Torchvision
    - Torchaudio
- [CUDA](https://developer.nvidia.com/cuda-toolkit-archive)
    - [Checking GPU compatibility](https://developer.nvidia.com/cuda-gpus)

### Importing Libraries

In [None]:
import torch
import os
import pandas as pd
from PIL import Image

# Report whether PyTorch can see a CUDA device on this machine
print("CUDA is available:", torch.cuda.is_available())
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# For Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches

%load_ext autoreload
%autoreload 2

## YOLO Architecture Implementation
- *Check implementation in model.py and loss.py*

### Loading the Model
- This initializes the YOLO model

In [None]:
from model import YOLOv1

# Build the network for a 7x7 grid, 2 boxes per cell and 20 classes,
# then move its parameters to the selected device.
model = YOLOv1(
    split_size=7,
    num_boxes=2,
    num_classes=20,
)
model = model.to(device)

# Show the layer-by-layer structure
print(model)

## Preparing the Dataset

In [None]:
import torchvision.transforms as transforms

class Compose:
    """Chain image transforms while carrying bounding boxes along unchanged.

    Every callable in ``transforms`` is applied to the image only, in order;
    the bounding boxes are returned exactly as they were passed in.
    """

    def __init__(self, transforms):
        # Sequence of callables, each taking an image and returning an image
        self.transforms = transforms

    def __call__(self, img, bboxes):
        """Return ``(transformed_img, bboxes)``."""
        transformed = img
        for step in self.transforms:
            transformed = step(transformed)
        return transformed, bboxes
    
# Resize every image to 448x448, then convert it to a tensor
transform = Compose(
    [
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
    ]
)

### Pascal VOC
- The dataset is the Pascal VOC dataset, which contains the following 20 classes:
    - aeroplane, bicycle, bird, boat, bottle
    - bus, car, cat, chair, cow
    - diningtable, dog, horse, motorbike, person
    - pottedplant, sheep, sofa, train, tvmonitor

- A copy of the dataset can be downloaded here: [PascalVOC_YOLO](https://www.kaggle.com/datasets/734b7bcb7ef13a045cbdd007a3c19874c2586ed0b02b4afc86126e89d00af8d2)


In [None]:
# Draw one sample image together with its hand-copied ground-truth boxes.
image_path = "dataset/PascalVOC/images/000003.jpg"

image = Image.open(image_path)
# PIL reports size as (width, height)
img_width, img_height = image.size


# Each box is (Class ID, Center_X, Center_Y, Width, Height); the coordinates
# are fractions of the image size (they are scaled by it below).
bboxes = [
    (17, 0.338, 0.4666666666666667, 0.184, 0.10666666666666666),
    (8, 0.546, 0.48133333333333334, 0.136, 0.13066666666666665)
]

# RGBA colour used for each class ID present in this sample
CLASS_COLORS = {
    8: (0.55, 0.27, 0.07, 1.0),
    17: (1.0, 0.41, 0.71, 1.0)
}

# Human-readable name for each class ID present in this sample
CLASS_MAP = {
    8: 'chair',
    17: 'sofa'
}

fig, ax = plt.subplots(1, figsize=(8, 8))
ax.imshow(image)

for bbox in bboxes:
    class_id, x_center, y_center, width, height = bbox

    # Convert the normalized midpoint box into a pixel-space upper-left
    # corner plus pixel width/height, which is what Rectangle expects.
    upper_left_x = (x_center - width / 2) * img_width
    upper_left_y = (y_center - height / 2) * img_height
    bbox_width = width * img_width
    bbox_height = height * img_height
    rect = patches.Rectangle(
        (upper_left_x, upper_left_y),
        bbox_width,
        bbox_height,
        linewidth=2,
        edgecolor=CLASS_COLORS.get(class_id, "black"),
        facecolor="none"
    )

    # Class-name label placed 5 pixels above the box's upper-left corner.
    # NOTE: CLASS_MAP[class_id] raises KeyError for an unmapped class ID,
    # whereas the colour lookups fall back to "black".
    ax.text(
        upper_left_x, upper_left_y - 5, f"{CLASS_MAP[class_id]}",
        color="white",
        fontsize=10, weight="bold",
        bbox=dict(facecolor=CLASS_COLORS.get(class_id, "black"), alpha=0.75)
    )

    ax.add_patch(rect)

# Hide the axis ticks so only the image and boxes are shown
ax.set_xticks([])
ax.set_yticks([])
plt.show()

- Create a dataset object for the PascalVOC dataset

In [None]:
# Source: https://github.com/aladdinpersson/Machine-Learning-Collection/blob/master/ML/Pytorch/object_detection/YOLO #
class VOCDataset(torch.utils.data.Dataset):
    """Image / label-file pairs encoded as YOLOv1 grid targets.

    ``csv_file`` lists one sample per row: column 0 is the image file name
    (relative to ``img_dir``) and column 1 is the label file name (relative
    to ``label_dir``). Every non-blank line of a label file holds five
    whitespace-separated numbers: ``class x y width height``. The box values
    are expected to be fractions of the image size in ``[0, 1]`` with
    ``(x, y)`` being the box centre.

    Each item is ``(image, label_matrix)`` where ``label_matrix`` has shape
    ``(S, S, C + 5 * B)``. For a cell that contains an object:

    * indices ``[0, C)``        one-hot class label
    * index ``C``               objectness flag (1)
    * indices ``[C + 1, C + 5)`` ``x_cell, y_cell, width_cell, height_cell``

    The remaining slots are left at zero.

    Args:
        csv_file: Path of the CSV file listing the samples.
        img_dir: Directory containing the images.
        label_dir: Directory containing the label text files.
        S: Number of grid cells along each image side.
        B: Number of boxes per cell (only affects the target's last dim).
        C: Number of classes.
        transform: Optional callable ``(image, boxes) -> (image, boxes)``.
    """

    def __init__(
        self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        """Return the number of rows in the annotation CSV."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load sample ``index`` and return ``(image, label_matrix)``."""
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = self._read_boxes(label_path)

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)
        boxes = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        return image, self._encode_boxes(boxes)

    @staticmethod
    def _read_boxes(label_path):
        """Parse a label file into ``[class, x, y, width, height]`` rows."""
        boxes = []
        with open(label_path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline)
                    continue
                # Parse through float() first so integral-looking values
                # such as "1.0" or "11.0" do not break int() conversion.
                class_label, x, y, width, height = (float(v) for v in fields)
                boxes.append([int(class_label), x, y, width, height])
        return boxes

    def _encode_boxes(self, boxes):
        """Convert image-relative boxes into the ``(S, S, C + 5B)`` target."""
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        last_cell = self.S - 1

        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # i, j are the cell row and cell column holding the box centre.
            # Clamp so a centre lying exactly on the far image edge
            # (coordinate == 1.0) falls into the last cell, not out of range.
            i = min(max(int(self.S * y), 0), last_cell)
            j = min(max(int(self.S * x), 0), last_cell)

            # Centre offset measured from the cell's upper-left corner,
            # in units of one cell.
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # Box size in units of one cell. With width as the example:
            #   width_pixels = width * image_width
            #   cell_pixels  = image_width / S
            #   width_pixels / cell_pixels = width * S
            width_cell, height_cell = width * self.S, height * self.S

            # Only the first object whose centre lands in a cell is kept,
            # i.e. the target is restricted to ONE object per cell.
            if label_matrix[i, j, self.C] == 0:
                # Mark that an object exists in this cell
                label_matrix[i, j, self.C] = 1

                # Box coordinates
                label_matrix[i, j, self.C + 1:self.C + 5] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )

                # One-hot encoding of the class label
                label_matrix[i, j, class_label] = 1

        return label_matrix

- Initialize the train and test sets

In [None]:
# Both splits read from the same image/label folders and share the same
# preprocessing; only the CSV listing the samples differs.
_voc_shared_args = dict(
    img_dir="dataset/PascalVOC/images",
    label_dir="dataset/PascalVOC/labels",
    transform=transform,
)

train_dataset = VOCDataset(csv_file="dataset/PascalVOC/train.csv", **_voc_shared_args)
test_dataset = VOCDataset(csv_file="dataset/PascalVOC/test.csv", **_voc_shared_args)

### Dataloaders

In [None]:
from torch.utils.data import DataLoader

# Identical loader settings for both splits: batches of 16, pinned host
# memory, reshuffled every pass, and the final incomplete batch dropped.
_loader_args = dict(
    batch_size=16,
    pin_memory=True,
    shuffle=True,
    drop_last=True,
)

train_loader = DataLoader(dataset=train_dataset, **_loader_args)
test_loader = DataLoader(dataset=test_dataset, **_loader_args)

## Training and Evaluation

In [None]:
import tqdm
import torch.optim as optim
from loss import YoloLoss
from utils import (
    non_max_suppression,
    mean_average_precision,
    cellboxes_to_boxes,
    get_bboxes,
    plot_image,
    save_checkpoint,
    load_checkpoint
)

# Optimisation hyperparameters
learning_rate, weight_decay = 2e-5, 0

# Loss defined in loss.py
loss_fn = YoloLoss()

# Adam over every model parameter
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [None]:
def train(train_loader, model, optimizer, loss_fn):
    """Run one training pass over ``train_loader`` and report the mean loss.

    Batches are moved to the notebook-level ``device`` before the forward
    pass. A tqdm progress bar shows the loss of the latest batch.

    Args:
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network to optimise; switched to training mode.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Callable ``(predictions, targets) -> scalar loss tensor``.

    Returns:
        The mean of the per-batch losses, or ``nan`` when the loader yields
        no batches.
    """
    model.train()
    loop = tqdm.tqdm(train_loader, leave=True)
    batch_losses = []

    for x, y in loop:
        x, y = x.to(device), y.to(device)
        out = model(x)
        loss = loss_fn(out, y)
        batch_loss = loss.item()
        batch_losses.append(batch_loss)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  # Update weights

        loop.set_postfix(loss=batch_loss)

    # An empty loader (e.g. drop_last=True with fewer samples than one
    # batch) must not end the pass with a ZeroDivisionError.
    mean_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float("nan")
    print(f"Loss: {mean_loss}")
    return mean_loss

### Training Loop
- If `checkpoint` is set to `True`, the checkpoint weights are loaded before training starts. The cell then runs the training loop for the model; alongside the loss, it computes the mAP on the train set after each epoch.
    - You can download my checkpoint weights here: [weights/checkpoint.pth.tar](https://drive.google.com/file/d/1konF4j8UeFrea-3E3qgc49QWai30b-rs/view?usp=sharing)

In [None]:
epochs = 2
best_map = 0
checkpoint = True  # Set to False to skip loading the saved weights

if checkpoint:
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint saved on a GPU still loads on a CPU-only machine.
    load_checkpoint(
        torch.load("weights/checkpoint.pth.tar", map_location=device),
        model,
        optimizer,
    )
    # mAP of the loaded weights on the train set, kept as the score to beat
    pred_boxes, target_boxes = get_bboxes(train_loader, model, iou_threshold=0.5, score_threshold=0.4)
    best_map = mean_average_precision(pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint")

for epoch in range(epochs):
    print(f"Epoch {epoch}:")
    train(train_loader, model, optimizer, loss_fn)

    # Train-set mAP after this epoch
    pred_boxes, target_boxes = get_bboxes(train_loader, model, iou_threshold=0.5, score_threshold=0.4)
    mean_avg_prec = mean_average_precision(pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint")
    print(f"Train mAP: {mean_avg_prec}")

    # Saving is intentionally left disabled; uncomment to overwrite the
    # checkpoint file whenever the train mAP improves.
    # if mean_avg_prec > best_map:
    #     best_map = mean_avg_prec
    #     checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    #     save_checkpoint(checkpoint, filename="weights/checkpoint.pth.tar")

### Evaluation and Visualization
- Compute the mAP of the trained model on the training set and the test set, then visualize its detections

In [None]:
# Restore the saved weights and optimizer state. map_location remaps the
# stored tensors onto the active device, so a checkpoint saved on a GPU
# still loads on a CPU-only machine.
load_checkpoint(
    torch.load("weights/checkpoint.pth.tar", map_location=device),
    model,
    optimizer,
)

#### Train Set

In [None]:
# Gather predicted and ground-truth boxes over the training loader
pred_boxes, target_boxes = get_bboxes(
    train_loader,
    model,
    iou_threshold=0.5,
    score_threshold=0.4,
)

# Score the predictions against the targets at IoU 0.5
mean_avg_prec = mean_average_precision(
    pred_boxes,
    target_boxes,
    iou_threshold=0.5,
    box_format="midpoint",
)

print(f"Train mAP: {mean_avg_prec}")

In [None]:
num_images = 3  # Total number of images to plot
max_per_batch = 8  # Plot at most this many images from any single batch
count = 0

# Plotting is inference only, so skip autograd bookkeeping
with torch.no_grad():
    for x, _ in train_loader:
        x = x.to(device)

        # One forward pass per batch; the result is reused for every image
        # plotted from that batch.
        batch_bboxes = cellboxes_to_boxes(model(x))

        # Never index past the end of a batch smaller than max_per_batch
        for idx in range(min(max_per_batch, x.shape[0])):
            if count >= num_images:
                break

            bboxes = non_max_suppression(batch_bboxes[idx], iou_threshold=0.5, threshold=0.4, box_format="midpoint")
            # permute reorders (C, H, W) to (H, W, C) before plotting
            plot_image(x[idx].permute(1, 2, 0).to("cpu"), bboxes)

            count += 1

        if count >= num_images:
            break

#### Test Set

In [None]:
# Gather predicted and ground-truth boxes over the test loader
pred_boxes, target_boxes = get_bboxes(
    test_loader,
    model,
    iou_threshold=0.5,
    score_threshold=0.4,
)

# Score the predictions against the targets at IoU 0.5
mean_avg_prec = mean_average_precision(
    pred_boxes,
    target_boxes,
    iou_threshold=0.5,
    box_format="midpoint",
)

print(f"Test mAP: {mean_avg_prec}")

In [None]:
num_images = 3  # Total number of images to plot
max_per_batch = 8  # Plot at most this many images from any single batch
count = 0

# Plotting is inference only, so skip autograd bookkeeping
with torch.no_grad():
    for x, _ in test_loader:
        x = x.to(device)

        # One forward pass per batch; the result is reused for every image
        # plotted from that batch.
        batch_bboxes = cellboxes_to_boxes(model(x))

        # Never index past the end of a batch smaller than max_per_batch
        for idx in range(min(max_per_batch, x.shape[0])):
            if count >= num_images:
                break

            bboxes = non_max_suppression(batch_bboxes[idx], iou_threshold=0.5, threshold=0.4, box_format="midpoint")
            # permute reorders (C, H, W) to (H, W, C) before plotting
            plot_image(x[idx].permute(1, 2, 0).to("cpu"), bboxes)

            count += 1

        if count >= num_images:
            break