# DataLab Cup 2: CNN for Object Detection

Sao-Hsuan Lin

113062532

In [None]:
import os

# device
# Preferred torch device string. A later cell falls back to "cpu" when CUDA is
# not available.
DEVICE = "cuda:0"  # "cuda:i" or "cpu"

# common params
# Images are resized to IMAGE_SIZE x IMAGE_SIZE before being fed to the model.
IMAGE_SIZE = 300
BATCH_SIZE = 8
NUM_CLASSES = 20
# Forwarded to the training dataset generator.
MAX_OBJECTS_PER_IMAGE = 20

# dataset params
# The two commented-out entries are the original (non-augmented) training set;
# the active pair points at the augmented copy.
# TRAIN_DATA_PATH = "./dataset/pascal_voc_training_data.txt"
# TRAIN_IMAGE_DIR = "./dataset/VOCdevkit_train/VOC2007/JPEGImages/"
TRAIN_DATA_PATH = "./dataset/augmented_data.txt"
TRAIN_IMAGE_DIR = "./dataset/AugmentedImage/"
TEST_DATA_PATH = "./dataset/pascal_voc_testing_data.txt"
TEST_IMAGE_DIR = "./dataset/VOCdevkit_test/VOC2007/JPEGImages/"

# model params
# The network output is interpreted as a CELL_SIZE x CELL_SIZE grid; each cell
# carries NUM_CLASSES class scores, BOXES_PER_CELL confidences and
# BOXES_PER_CELL * 4 box values. With the values here that is
# 7 * 7 * (20 + 2 + 8) = 1470 numbers per image, which matches the default
# output width of YOLOEfficientNetB3.
CELL_SIZE = 7
BOXES_PER_CELL = 2
# Weights handed to YoloLossV2 (presumably the per-term loss multipliers --
# confirm against models/yolo/layers.py).
OBJECT_SCALE = 1
NOOBJECT_SCALE = 0.5
CLASS_SCALE = 3
COORD_SCALE = 5

# training params
# set epochs to 0 to prevent training
# (the training loop iterates over range(START_EPOCH, START_EPOCH + EPOCHS)).
# A START_EPOCH greater than 0 makes the training cell load a checkpoint first.
START_EPOCH = 0
LEARNING_RATE = 1e-4
EPOCHS = 0
# True -> model.freeze_backbone() is called before training, False -> unfreeze.
FREEZE_BACKBONE = True

# checkpoint params
CHECKPOINT_DIR = "./ckpts/yolo-efficientnet-b3/"
CHECKPOINT_NAME = "yolo_checkpoint"

# evaluation params
# Predictions and evaluation results are both written below OUTPUT_DIR.
OUTPUT_DIR = "./output/yolo-efficientnet-b3/"
PRED_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "yolo_predictions.csv")
EVAL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "yolo_eval_results.csv")

In [None]:
import torch

# Resolve the torch device: honour DEVICE when CUDA is usable, otherwise use CPU.
cuda_ok = torch.cuda.is_available()
if not cuda_ok:
    print("No GPU available, using the CPU instead.")
else:
    gpus = torch.cuda.device_count()
    print(f"Number of GPUs: {gpus}")
device = torch.device(DEVICE if cuda_ok else "cpu")
print(f"Device: {device}")

In [None]:
import warnings

# Install a blanket "ignore" filter: no Python warning (including deprecation
# warnings raised by libraries) is shown for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
from torch.utils.data import DataLoader, random_split
from models.yolo.data import (
    TrainDatasetGenerator,
    AugmentedTrainDatasetGenerator,
)


def create_data_loader(
    data_path,
    image_dir,
    batch_size,
    max_objects_per_image,
    image_size,
    shuffle=True,
    num_workers=8,
    pin_memory=False,
    drop_last=False,
    device: str = "",
    seed=42,
):
    """Build the training and validation loaders from one annotated dataset.

    The dataset is split 90 % / 10 % into train / validation subsets. The split
    is drawn from a dedicated, seeded generator so that re-running the cell
    (for example when resuming from a checkpoint) yields the same partition as
    long as the dataset itself is unchanged; otherwise samples trained on in an
    earlier run would end up in the validation subset of the next one.

    Args:
        data_path: Annotation file passed to ``TrainDatasetGenerator``.
        image_dir: Image directory passed to ``TrainDatasetGenerator``.
        batch_size: Batch size used by both loaders.
        max_objects_per_image: Passed to ``TrainDatasetGenerator``.
        image_size: Passed to ``TrainDatasetGenerator``.
        shuffle: Whether the training loader shuffles; the validation loader
            never does.
        num_workers: Worker processes per loader.
        pin_memory: Forwarded to both loaders.
        drop_last: Forwarded to both loaders.
        device: Forwarded to both loaders as ``pin_memory_device``.
        seed: Seed for the train/validation split. Pass ``None`` to draw the
            split from torch's global RNG instead (the previous behaviour).

    Returns:
        A ``(train_loader, val_loader)`` tuple.
    """
    # Local import keeps this cell runnable on its own.
    import torch

    dataset = TrainDatasetGenerator(
        data_path,
        image_dir,
        max_objects_per_image,
        image_size,
        # apply_rotation=True,
    )
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size

    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], **split_kwargs
    )

    # Options shared by both loaders; only `shuffle` differs between them.
    loader_kwargs = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        pin_memory_device=device,
    )
    train_loader = DataLoader(train_dataset, shuffle=shuffle, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    return train_loader, val_loader

In [None]:
import torch
from torch import nn
from torchvision import models
import torch.nn.functional as F
from models.yolo.layers import ConvLeakyReLU


class YOLOEfficientNetB3(nn.Module):
    """YOLO-style detector built on a pretrained EfficientNet-B3 backbone.

    The backbone is torchvision's EfficientNet-B3 with its last two child
    modules removed. It is followed by four ``ConvLeakyReLU`` blocks and a
    two-layer fully connected head that emits one flat vector per image.

    Args:
        num_classes: Width of the final linear layer, i.e. the length of the
            flat prediction vector (not the number of object classes).
    """

    def __init__(self, num_classes=1470):
        super().__init__()

        # EfficientNet-B3 from torchvision with its default pretrained weights.
        pretrained = models.efficientnet_b3(
            weights=models.EfficientNet_B3_Weights.DEFAULT
        )

        # Keep everything except the last two children of the torchvision model.
        # Original author's note on the output shape: [batch_size, 1536, 10, 10].
        self.backbone = nn.Sequential(*list(pretrained.children())[:-2])
        # The backbone starts out frozen; see unfreeze_backbone().
        self.freeze_backbone()

        # Convolutional head. Only layer2 uses stride 2.
        self.layer1 = ConvLeakyReLU(
            1536, 1024, kernel_size=3, stride=1, padding=1
        )
        self.layer2 = ConvLeakyReLU(
            1024, 1024, kernel_size=3, stride=2, padding=1
        )
        self.layer3 = ConvLeakyReLU(
            1024, 1024, kernel_size=3, stride=1, padding=1
        )
        self.layer4 = ConvLeakyReLU(
            1024, 1024, kernel_size=3, stride=1, padding=1
        )

        # Fully connected head; fc1 expects a flattened 1024 x 5 x 5 feature map.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(1024 * 5 * 5, 4096)
        self.leaky_relu = nn.LeakyReLU(0.1)
        self.fc2 = nn.Linear(4096, num_classes)

        # Xavier-normal weights and zero biases for both linear layers.
        for linear in (self.fc1, self.fc2):
            nn.init.xavier_normal_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, x):
        """Return the flat prediction vector for a batch of images ``x``."""
        features = self.backbone(x)
        for conv_block in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = conv_block(features)

        hidden = self.leaky_relu(self.fc1(self.flatten(features)))
        return self.fc2(hidden)

    def _set_backbone_trainable(self, trainable):
        """Switch ``requires_grad`` on every backbone parameter."""
        # NOTE(review): this only toggles gradients. The backbone still follows
        # the model's train()/eval() mode, so layers that behave differently in
        # train mode are not affected by freezing -- confirm that is intended.
        for param in self.backbone.parameters():
            param.requires_grad_(trainable)

    def freeze_backbone(self):
        """Stop gradient updates for all backbone parameters."""
        self._set_backbone_trainable(False)

    def unfreeze_backbone(self):
        """Allow gradient updates for all backbone parameters."""
        self._set_backbone_trainable(True)

In [None]:
# Instantiate the detector with its default output width and move it to the
# device selected earlier in the notebook.
model = YOLOEfficientNetB3().to(device)

## Training

In [None]:
import os
import math
import torch
from torch import optim
from torch.optim import lr_scheduler
from datetime import datetime
from models.yolo.layers import YoloLossV2
from utils.training import load_checkpoint, save_checkpoint

# Directory for saving checkpoints
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Train / validation loaders over the configured training set. All optional
# arguments of create_data_loader keep their defaults.
train_loader, val_loader = create_data_loader(
    TRAIN_DATA_PATH,
    TRAIN_IMAGE_DIR,
    BATCH_SIZE,
    MAX_OBJECTS_PER_IMAGE,
    IMAGE_SIZE,
)
# Loss object used by train_step / val_step. Arguments are positional, so their
# order must match YoloLossV2's signature (defined in models/yolo/layers.py).
yolo_loss = YoloLossV2(
    CELL_SIZE,
    NUM_CLASSES,
    BOXES_PER_CELL,
    IMAGE_SIZE,
    CLASS_SCALE,
    OBJECT_SCALE,
    NOOBJECT_SCALE,
    COORD_SCALE,
    device,
)
# Adam over all model parameters (frozen ones included; they receive no
# gradient and are therefore left untouched by the optimizer).
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# StepLR that would multiply the rate by 0.95 on every scheduler.step().
# The step call in the training loop below is commented out, so the schedule
# is currently only used for reporting the learning rate.
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

def _reshape_predictions(outputs, cell_size, num_classes, boxes_per_cell):
    """Turn the model's flat output into a per-cell prediction grid.

    The flat vector of every sample is read as three consecutive segments:
    class scores, box confidences, then box values. Each segment is reshaped
    to the cell grid and the three are concatenated on the last axis.

    Args:
        outputs: 2-D tensor ``[batch, cell_size**2 * (num_classes + 5 * boxes_per_cell)]``.
        cell_size: Number of grid cells along each side.
        num_classes: Number of class scores per cell.
        boxes_per_cell: Number of boxes predicted per cell.

    Returns:
        Tensor of shape
        ``[batch, cell_size, cell_size, num_classes + 5 * boxes_per_cell]``.
    """
    cells = cell_size * cell_size
    class_end = cells * num_classes
    conf_end = class_end + cells * boxes_per_cell

    class_probs = outputs[:, :class_end].view(
        -1, cell_size, cell_size, num_classes
    )
    confs = outputs[:, class_end:conf_end].view(
        -1, cell_size, cell_size, boxes_per_cell
    )
    boxes = outputs[:, conf_end:].view(
        -1, cell_size, cell_size, boxes_per_cell * 4
    )
    return torch.cat([class_probs, confs, boxes], dim=3)


# Training step function
def train_step(model, optimizer, images, labels, object_nums):
    """Run one optimisation step on a batch and return the loss as a float.

    Args:
        model: Network producing the flat prediction vector.
        optimizer: Optimizer updating ``model``'s parameters.
        images: Input batch, already on the target device.
        labels: Ground-truth labels for the batch, passed to the loss.
        object_nums: Per-image object counts, passed to the loss.

    Returns:
        The batch loss as a Python float (value before the parameter update).
    """
    model.train()  # val_step switches to eval mode, so re-enable training mode
    optimizer.zero_grad()

    # Forward pass
    predicts = _reshape_predictions(
        model(images), CELL_SIZE, NUM_CLASSES, BOXES_PER_CELL
    )

    # Compute loss
    loss = yolo_loss(predicts, labels, object_nums)
    loss_metric = loss.item()

    # Backward pass
    loss.backward()
    optimizer.step()

    return loss_metric


def val_step(model, images, labels, object_nums):
    """Compute the loss on a validation batch without updating the model.

    Args:
        model: Network producing the flat prediction vector.
        images: Input batch, already on the target device.
        labels: Ground-truth labels for the batch, passed to the loss.
        object_nums: Per-image object counts, passed to the loss.

    Returns:
        The batch loss as a Python float.
    """
    model.eval()
    with torch.no_grad():  # no gradients are needed for validation
        predicts = _reshape_predictions(
            model(images), CELL_SIZE, NUM_CLASSES, BOXES_PER_CELL
        )
        loss_metric = yolo_loss(predicts, labels, object_nums).item()

    return loss_metric


# Load checkpoint if available
if START_EPOCH > 0:
    load_checkpoint(
        model, CHECKPOINT_DIR, CHECKPOINT_NAME, optimizer, START_EPOCH
    )

# Set lr. This runs after the optional checkpoint load, so LEARNING_RATE is the
# rate actually used by the first parameter group.
optimizer.param_groups[0]["lr"] = LEARNING_RATE

if FREEZE_BACKBONE:
    model.freeze_backbone()
else:
    model.unfreeze_backbone()

# Training loop
print(f"{datetime.now()}, start training.")
for epoch in range(START_EPOCH, EPOCHS + START_EPOCH):
    loss_metric_list = []
    val_loss_metric_list = []
    # Set when a batch produces a NaN / infinite / negative loss.
    diverged = False

    for idx, (images, labels, object_nums) in enumerate(train_loader):
        images, labels, object_nums = (
            images.to(device),
            labels.to(device),
            object_nums.to(device),
        )
        loss_metric = train_step(model, optimizer, images, labels, object_nums)
        loss_metric_list.append(loss_metric)

        if not math.isfinite(loss_metric) or loss_metric < 0:
            print("Loss is {:.4f}, stop training.".format(loss_metric))
            diverged = True
            break

        if idx % 100 == 0:
            # get_last_lr() is the supported way to read the scheduler's rate;
            # get_lr() is only meant to be called from inside step().
            lr = scheduler.get_last_lr()[0]
            print(
                "epoch {:3d}/{:3d}, batch: {:4d}/{:4d}, loss {:10.4f}, lr {:10.4e}".format(
                    epoch + 1,
                    EPOCHS + START_EPOCH,
                    idx + 1,
                    len(train_loader),
                    loss_metric,
                    lr,
                )
            )

    # Abort the whole run (no validation, no checkpoint) once the loss diverged.
    if diverged:
        break

    # # Scheduler step after each epoch
    # scheduler.step()

    for images, labels, object_nums in val_loader:
        images, labels, object_nums = (
            images.to(device),
            labels.to(device),
            object_nums.to(device),
        )
        val_loss_metric = val_step(model, images, labels, object_nums)
        val_loss_metric_list.append(val_loss_metric)

    # Print info. An empty loader yields NaN instead of a ZeroDivisionError.
    avg_train_loss = (
        sum(loss_metric_list) / len(loss_metric_list)
        if loss_metric_list
        else float("nan")
    )
    avg_val_loss = (
        sum(val_loss_metric_list) / len(val_loss_metric_list)
        if val_loss_metric_list
        else float("nan")
    )
    lr = scheduler.get_last_lr()[0]
    print(
        "epoch {:3d}/{:3d}, train loss {:10.4f}, val loss {:10.4f}, lr {:10.4e}".format(
            epoch + 1, EPOCHS + START_EPOCH, avg_train_loss, avg_val_loss, lr
        )
    )

    # Save checkpoint
    save_checkpoint(
        epoch + 1, model, optimizer, CHECKPOINT_DIR, CHECKPOINT_NAME
    )

## Predict Test data

In [None]:
# Load model from checkpoint
# Called without optimizer or epoch arguments, unlike the call in the training
# cell; which checkpoint file that selects is decided by
# utils.training.load_checkpoint (presumably the most recent one -- confirm).
load_checkpoint(model, CHECKPOINT_DIR, CHECKPOINT_NAME)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import cv2
import numpy as np
from utils.data import CLASS_NAMES
from utils.process_output import process_outputs

# Run the detector on a single test image and draw the predicted boxes on it.
sample_path = './dataset/VOCdevkit_test/VOC2007/JPEGImages/000002.jpg'
np_img = cv2.imread(sample_path)
if np_img is None:
    # cv2.imread signals a missing / unreadable file by returning None.
    raise FileNotFoundError(f"Could not read image: {sample_path}")

# Resize to the network input size and convert OpenCV's BGR order to RGB.
# `resized_img` stays uint8 and is the canvas the boxes are drawn on.
resized_img = cv2.cvtColor(
    cv2.resize(np_img, (IMAGE_SIZE, IMAGE_SIZE)), cv2.COLOR_BGR2RGB
)

# Network input: float32 scaled to [-1, 1], laid out as [1, 3, H, W].
np_img = resized_img.astype(np.float32)
np_img = np_img / 255.0 * 2 - 1
np_img = np.reshape(np_img, (1, IMAGE_SIZE, IMAGE_SIZE, 3))
np_img = np.transpose(np_img, (0, 3, 1, 2))

model.eval()
with torch.no_grad():  # inference only, no autograd graph needed
    y_pred = model(torch.tensor(np_img).to(device))

bboxes, classes, confidences = process_outputs(y_pred, CELL_SIZE, NUM_CLASSES, BOXES_PER_CELL, IMAGE_SIZE, conf_threshold=0.04)
for bbox, class_idx, conf in zip(bboxes, classes, confidences):
    xmin, ymin, xmax, ymax = bbox
    cv2.rectangle(resized_img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 255, 255), 3)
    txt = f"{CLASS_NAMES[int(class_idx)]}: {conf:.2f}"
    cv2.putText(resized_img, txt, (int(xmin) + 5, int(ymin) + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 220, 0), 1, cv2.LINE_8)

plt.imshow(resized_img)
plt.show()

In [None]:
import os
from torch.utils.data import DataLoader
from models.yolo.data import TestDatasetGenerator

# Test data loader
data_loader = DataLoader(
    TestDatasetGenerator(TEST_DATA_PATH, TEST_IMAGE_DIR, IMAGE_SIZE),
    batch_size=8,
    shuffle=False,
    num_workers=8,
    pin_memory=False,
)

# Test the model
# Output format: image_name {xmin_i ymin_i xmax_i ymax_i class_i confidence_score} (repeat number of objects times)
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.eval()  # the mode does not change inside the loop, so set it once

# `with` closes the predictions file even if inference fails part-way;
# no_grad avoids building an autograd graph for every batch.
with open(PRED_OUTPUT_PATH, "w") as output_file, torch.no_grad():
    for image_names, images, image_heights, image_widths in data_loader:
        images, image_heights, image_widths = (
            images.to(device),
            image_heights.to(device),
            image_widths.to(device),
        )
        outputs = model(images)

        for i in range(images.size(0)):
            answers = []
            bboxes, classes, confidences = process_outputs(
                outputs[i : i + 1],
                CELL_SIZE,
                NUM_CLASSES,
                BOXES_PER_CELL,
                IMAGE_SIZE,
                conf_threshold=0.03,
            )
            # Boxes are in IMAGE_SIZE x IMAGE_SIZE coordinates; rescale them to
            # the original image's width and height.
            x_scale = image_widths[i] / IMAGE_SIZE
            y_scale = image_heights[i] / IMAGE_SIZE
            for bbox, class_idx, conf in zip(bboxes, classes, confidences):
                xmin, ymin, xmax, ymax = bbox
                xmin, ymin, xmax, ymax = (
                    xmin * x_scale,
                    ymin * y_scale,
                    xmax * x_scale,
                    ymax * y_scale,
                )
                answers.append(
                    "%d %d %d %d %d %f"
                    % (xmin, ymin, xmax, ymax, class_idx, conf)
                )
            output_file.write(image_names[i] + " " + " ".join(answers) + "\n")

In [None]:
import sys

# Make the modules in ./evaluate importable. This must run before the two
# imports below, which is why they are not at the top of the cell.
sys.path.insert(0, "./evaluate")

import evaluate

# Reads the predictions file written by the previous cell and writes the
# evaluation results to EVAL_OUTPUT_PATH.
evaluate.evaluate(PRED_OUTPUT_PATH, EVAL_OUTPUT_PATH)

from compute_score import compute_score

# Print the score computed from the evaluation results file.
print(compute_score(EVAL_OUTPUT_PATH))