In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
import torchvision
import matplotlib.pyplot as plt
import dataset
from model import ResNet18YOLOv1
from loss import YOLOv1Loss
from tqdm import tqdm
from evaluate import get_bboxes, mean_average_precision

# About
This is an implementation of YOLOv1 from ***You Only Look Once: Unified, Real-Time Object Detection by Joseph Redmon, Santosh Divvala, Ross Girshick, and Ali Firhadi.*** Object detection is figuring out what objects are in an image and where they are. Another way to look at this problem is how can we write a computer program that draws bounding boxes around objects and predicts what kind of objects they are. YOLO solves this problem and does it super fast, like state of the art fast! I made slight modifications to the architecture and loss function which I'll discuss further down.

Let's talk about R-CNN, the predecessor to YOLO. It proposed regions, ran a classifier on every region, and did some post-processing to produce the final result. In simple language this translates to:
1. Lemme draw a lot of bounding boxes where I think objects are
2. Lemme figure out what are in the bounding boxes I drew
3. Ok, I drew too many bounding boxes, lemme remove most of them and keep the important ones

This is a lot of steps. What YOLO does instead is ***unified detection***. Unified detection combines the different components of object detection (where are the objects and what kind of objects are they) into one Convolutional Neural Network. You give it an image and in one swoop, it tells you exactly that.

Here's how it does it:
1. Divide the image into a SxS grid
2. Each cell in the grid predicts B bounding boxes and C class probabilities (what it thinks the object is)

We represent bounding boxes with 5 numbers: x, y, w, h, p.
- (x, y): center of the bounding box
- w: width
- h: height
- p: confidence (a measure of how confident we are that this box captures an object and matches the ground truth)

Accordingly, YOLOv1 produces a SxSx(5B+C) tensor. Each cell predicts B bounding boxes, how do we choose which one is the "true" predictor? How do we measure how good our bounding box and classification predictions are? 

We check which bounding box has the greatest overlap (IOU: Intersection Over Union) with the ground truth and choose that one as a predictor. We use this loss function to measure the "goodness" of our predictions:

![yolo loss function](https://i.stack.imgur.com/IddFu.png)

On a high level, it is the squared error between our prediction and the ground truth. Let's start training!

# Data (PASCAL VOC 2007)
PASCAL VOC Detection Dataset contains annotated images with 20 labelled classes and bounding boxes. There are 2,501 images in the training set, 2,510 images in the validation set, and 4,952 images in the test set.

In [2]:
# original dataset
pascal_voc_train = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="train",
    download=False
)

pascal_voc_val = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="val",
    download=False
)

pascal_voc_test = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="test",
    download=False
)

# resize to 448x448, normalize, and convert annotations to target tensors
voc_train = dataset.PascalVOC(pascal_voc=pascal_voc_train)
voc_val = dataset.PascalVOC(pascal_voc=pascal_voc_val)
voc_test = dataset.PascalVOC(pascal_voc=pascal_voc_test)

In [3]:
# dataloaders
BATCH_SIZE = 64

train_loader = DataLoader(voc_train, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(voc_val, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(voc_test, batch_size=BATCH_SIZE, shuffle=False)

In [4]:
for x, y in train_loader:
    print(x.shape, y.shape)
    break

torch.Size([64, 3, 448, 448]) torch.Size([64, 7, 7, 25])


# Device

In [5]:
DEVICE = "cpu"

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    DEVICE = torch.device("mps")

DEVICE

device(type='cuda')

# Hyperparameters
- S: dimensions of SxS grid
- B: number of bounding boxes predicted per cell
- C: number of classes
- lambda_coord: penalty on incorrect localization loss
- lambda_noobj: penalty on incorrect noobj confidence loss

In [6]:
S = 7
B = 2
C = 20
lambda_coord = 5.0
lambda_noobj = 0.5

# Training Setup
Model, loss function, optimizer, scheduler and evaluation utils

- Model: ResNet18 convolutional layers pretrained on ImageNet with 2 feedforward layers outputting a (N x S x S x (5B + C)) tensor
- Loss: Squared Error Loss
- Optimizer + Scheduler: Stochastic Gradient Descent with momentum of 0.9 and weight decay of 0.0005. We train with learning rate set to 1e-3 for the first 75 epochs, 1e-4 for the next 30 epochs, and 1e-5 for the final 30 epochs. 

In [7]:
yolo = ResNet18YOLOv1(S=S, B=B, C=C).to(DEVICE)
yolo_loss = YOLOv1Loss(S=S, B=B, C=C, lambda_coord=lambda_coord, lambda_noobj=lambda_noobj)

In [18]:
MOMENTUM = 0.9
WEIGHT_DECAY = 0.0005
LEARNING_RATE = 1e-4

optimizer = torch.optim.SGD(yolo.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = MultiStepLR(optimizer,
                        milestones=[75, 105],
                        gamma=0.1)

## Evaluating Loss

In [19]:
def compute_loss(model, criterion, dataloader):
    total_loss = 0
    model.eval()
    
    with torch.no_grad():
        for X, Y in dataloader:
            X = X.to(DEVICE)
            Y = Y.to(DEVICE)

            pred = model(X)
            loss = criterion(pred, Y)
            total_loss += loss.item()
            break
            
    N = len(dataloader)
    # loss = total_loss / N
    loss = total_loss
    return loss

## Evaluating mAP

In [20]:
CONFIDENCE_THRESHOLD = 0.5
IOU_THRESHOLD = 0.5

In [21]:
def bboxes(model, dataloader):
    model.eval()
    pred_bboxes = []
    target_bboxes = []
    
    with torch.no_grad():
        for X, Y in dataloader:
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            pred = model(X)
            
            for i in range(len(X)):
                x = pred[i]
                y = Y[i]
                
                pred_bbox = get_bboxes(x, confidence_threshold=CONFIDENCE_THRESHOLD, iou_threshold=IOU_THRESHOLD, S=S, B=B, C=C)
                target_bbox = get_bboxes(y, confidence_threshold=CONFIDENCE_THRESHOLD, iou_threshold=IOU_THRESHOLD, S=S, B=1, C=C)
                
                pred_bboxes.append(pred_bbox)
                target_bboxes.append(target_bbox)
    
    return pred_bboxes, target_bboxes

In [22]:
def compute_mAP(model, dataloader):
    pred_bboxes, target_bboxes = bboxes(model, dataloader)
    mAP = mean_average_precision(pred_bboxes, target_bboxes, iou_threshold=IOU_THRESHOLD, C=C)
    
    return mAP

# Training

In [23]:
torch.cuda.empty_cache()

In [24]:
EPOCHS = 40

def train(model, criterion, train_loader, val_loader, optimizer, scheduler):
    train_losses = []
    val_losses = []
    val_mAPs = []
    
    best_mAP = -float("inf")
    
    N = len(train_loader)
    
    for epoch in range(EPOCHS):
        # set to train mode
        model.train()
        total_loss = 0
        
        lr = optimizer.param_groups[0]["lr"]
        pbar = tqdm(train_loader, leave=False, desc=f"Epoch [{epoch+1}/{EPOCHS}]: lr={lr}")
        
        for i, (X, Y) in enumerate(pbar):
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            pred = model(X)
            loss = criterion(pred, Y)
            total_loss += loss.item()
            
            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # update progress bar
            pbar.set_postfix(batch_loss=loss.item())
        
        # update learning rate with scheduler
        scheduler.step()  
        
        # calculate metrics
        train_loss = total_loss / N
        # train_loss = total_loss
        val_loss = compute_loss(model, criterion, val_loader)
        # val_mAP = compute_mAP(model, val_loader)
        
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        # val_mAPs.append(val_mAP)
        
#         if val_mAP > best_mAP:
#             best_mAP = val_mAP
            
#             # save best model
#             print("=> saving best model")
#             torch.save(model.state_dict(), "best_model.pth")
        
        # print(f"Epoch [{epoch+1}/{EPOCHS}]: Loss={train_loss}, Val Loss={val_loss}, mAP={val_mAP}")
        print(f"Epoch [{epoch+1}/{EPOCHS}]: Loss={train_loss}, Val Loss={val_loss}")
    
    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "val_mAP": val_mAP,
        "epochs": EPOCHS
    }
    

In [None]:
train_result = train(yolo, 
      yolo_loss, 
      train_loader=train_loader, 
      val_loader=val_loader, 
      optimizer=optimizer, 
      scheduler=scheduler)

                                                                                         

Epoch [1/40]: Loss=3.1734261333942415, Val Loss=5.162011623382568


                                                                                         

Epoch [2/40]: Loss=3.1328529953956603, Val Loss=5.118073463439941


                                                                                         

Epoch [3/40]: Loss=3.0633435606956483, Val Loss=5.193297386169434


Epoch [4/40]: lr=0.0001:  55%|█████▌    | 22/40 [00:35<00:29,  1.61s/it, batch_loss=2.58]

In [None]:
train_result