In [1]:
%load_ext autoreload
%autoreload 2

In [61]:
import torch
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
import torchvision
import matplotlib.pyplot as plt
import dataset
from model import ResNet18YOLOv1
from loss import YOLOv1Loss
from tqdm import tqdm
from evaluate import get_bboxes, mean_average_precision

# About
This is an implementation of YOLOv1 from ***You Only Look Once: Unified, Real-Time Object Detection by Joseph Redmon, Santosh Divvala, Ross Girshick, and Ali Firhadi.*** Object detection is figuring out what objects are in an image and where they are. Another way to look at this problem is how can we write a computer program that draws bounding boxes around objects and predicts what kind of objects they are. YOLO solves this problem and does it super fast, like state of the art fast! I made slight modifications to the architecture and loss function which I'll discuss further down.

Let's talk about R-CNN, the predecessor to YOLO. It proposed regions, ran a classifier on every region, and did some post-processing to produce the final result. In simple language this translates to:
1. Lemme draw a lot of bounding boxes where I think objects are
2. Lemme figure out what are in the bounding boxes I drew
3. Ok, I drew too many bounding boxes, lemme remove most of them and keep the important ones

This is a lot of steps. What YOLO does instead is ***unified detection***. Unified detection combines the different components of object detection (where are the objects and what kind of objects are they) into one Convolutional Neural Network. You give it an image and in one swoop, it tells you exactly that.

Here's how it does it:
1. Divide the image into a SxS grid
2. Each cell in the grid predicts B bounding boxes and C class probabilities (what it thinks the object is)

We represent bounding boxes with 5 numbers: x, y, w, h, p.
- (x, y): center of the bounding box
- w: width
- h: height
- p: confidence (a measure of how confident we are that this box captures an object and matches the ground truth)

Accordingly, YOLOv1 produces a SxSx(5B+C) tensor. Each cell predicts B bounding boxes, how do we choose which one is the "true" predictor? How do we measure how good our bounding box and classification predictions are? 

We check which bounding box has the greatest overlap (IOU: Intersection Over Union) with the ground truth and choose that one as a predictor. We use this loss function to measure the "goodness" of our predictions:

![yolo loss function](https://i.stack.imgur.com/IddFu.png)

On a high level, it is the squared error between our prediction and the ground truth. Let's start training!

# Data (PASCAL VOC 2007)
PASCAL VOC Detection Dataset contains annotated images with 20 labelled classes and bounding boxes. There are 2,501 images in the training set, 2,510 images in the validation set, and 4,952 images in the test set.

In [62]:
# original dataset
pascal_voc_train = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="train",
    download=False
)

pascal_voc_val = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="val",
    download=False
)

pascal_voc_test = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="test",
    download=False
)

# resize to 448x448, normalize, and convert annotations to target tensors
voc_train = dataset.PascalVOC(pascal_voc=pascal_voc_train)
voc_val = dataset.PascalVOC(pascal_voc=pascal_voc_val)
voc_test = dataset.PascalVOC(pascal_voc=pascal_voc_test)

In [63]:
# dataloaders
BATCH_SIZE = 32

train_loader = DataLoader(voc_train, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(voc_val, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(voc_test, batch_size=BATCH_SIZE, shuffle=False)

In [64]:
for x, y in train_loader:
    print(x.shape, y.shape)
    break

torch.Size([32, 3, 448, 448]) torch.Size([32, 7, 7, 25])


# Device

In [65]:
DEVICE = "cpu"

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    DEVICE = torch.device("mps")

DEVICE

device(type='cuda')

# Hyperparameters
- S: dimensions of SxS grid
- B: number of bounding boxes predicted per cell
- C: number of classes
- lambda_coord: penalty on incorrect localization loss
- lambda_noobj: penalty on incorrect noobj confidence loss

In [66]:
S = 7
B = 2
C = 20
lambda_coord = 5.0
lambda_noobj = 0.5

# Training Setup
Model, loss function, optimizer, scheduler and evaluation utils

- Model: ResNet18 convolutional layers pretrained on ImageNet with 2 feedforward layers outputting a (N x S x S x (5B + C)) tensor
- Loss: Squared Error Loss
- Optimizer + Scheduler: Stochastic Gradient Descent with momentum of 0.9 and weight decay of 0.0005. We train with learning rate set to 1e-3 for the first 75 epochs, 1e-4 for the next 30 epochs, and 1e-5 for the final 30 epochs. 

In [67]:
yolo = ResNet18YOLOv1(S=S, B=B, C=C).to(DEVICE)
yolo_loss = YOLOv1Loss(S=S, B=B, C=C, lambda_coord=lambda_coord, lambda_noobj=lambda_noobj)

In [77]:
MOMENTUM = 0.9
WEIGHT_DECAY = 0.0005
# LEARNING_RATE = 1e-3
LEARNING_RATE = 1e-4

# optimizer = torch.optim.SGD(yolo.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
optimizer = torch.optim.SGD(yolo.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
# scheduler = MultiStepLR(optimizer,
#                         milestones=[75, 105],
#                         gamma=0.1)
scheduler = MultiStepLR(optimizer,
                        milestones=[17],
                        gamma=0.1)

## Evaluating Loss

In [78]:
def compute_loss(model, criterion, dataloader):
    total_loss = 0
    model.eval()
    
    with torch.no_grad():
        for X, Y in dataloader:
            X = X.to(DEVICE)
            Y = Y.to(DEVICE)

            pred = model(X)
            loss = criterion(pred, Y)
            total_loss += loss.item()
            break
            
    N = len(dataloader)
    # loss = total_loss / N
    loss = total_loss
    return loss

## Evaluating mAP

In [79]:
CONFIDENCE_THRESHOLD = 0.5
IOU_THRESHOLD = 0.5

In [80]:
def bboxes(model, dataloader):
    model.eval()
    pred_bboxes = []
    target_bboxes = []
    
    with torch.no_grad():
        for X, Y in dataloader:
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            pred = model(X)
            
            for i in range(len(X)):
                x = pred[i]
                y = Y[i]
                
                pred_bbox = get_bboxes(x, confidence_threshold=CONFIDENCE_THRESHOLD, iou_threshold=IOU_THRESHOLD, S=S, B=B, C=C)
                target_bbox = get_bboxes(y, confidence_threshold=CONFIDENCE_THRESHOLD, iou_threshold=IOU_THRESHOLD, S=S, B=1, C=C)
                
                pred_bboxes.append(pred_bbox)
                target_bboxes.append(target_bbox)
    
    return pred_bboxes, target_bboxes

In [81]:
def compute_mAP(model, dataloader):
    pred_bboxes, target_bboxes = bboxes(model, dataloader)
    mAP = mean_average_precision(pred_bboxes, target_bboxes, iou_threshold=IOU_THRESHOLD, C=C)
    
    return mAP

# Training

In [82]:
torch.cuda.empty_cache()

In [83]:
EPOCHS = 47

def train(model, criterion, train_loader, val_loader, optimizer, scheduler):
    train_losses = []
    val_losses = []
    val_mAPs = []
    
    best_mAP = -float("inf")
    
    N = len(train_loader)
    
    for epoch in range(EPOCHS):
        # set to train mode
        model.train()
        total_loss = 0
        
        lr = optimizer.param_groups[0]["lr"]
        pbar = tqdm(train_loader, leave=False, desc=f"Epoch [{epoch+1}/{EPOCHS}]: lr={lr}")
        
        for i, (X, Y) in enumerate(pbar):
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            pred = model(X)
            loss = criterion(pred, Y)
            total_loss += loss.item()
            
            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # update progress bar
            pbar.set_postfix(batch_loss=loss.item())
        
        # update learning rate with scheduler
        scheduler.step()  
        
        # calculate metrics
        train_loss = total_loss / N
        val_loss = compute_loss(model, criterion, val_loader)
        # val_mAP = compute_mAP(model, val_loader)
        
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        # val_mAPs.append(val_mAP)
        
#         if val_mAP > best_mAP:
#             best_mAP = val_mAP
            
#             # save best model
#             print("=> saving best model")
#             torch.save(model.state_dict(), "best_model.pth")
        
        # print(f"Epoch [{epoch+1}/{EPOCHS}]: Loss={train_loss}, Val Loss={val_loss}, mAP={val_mAP}")
        print(f"Epoch [{epoch+1}/{EPOCHS}]: Loss={train_loss}, Val Loss={val_loss}")
        # print(f"Epoch [{epoch+1}/{EPOCHS}]: Loss={train_loss}")
    
    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        # "val_mAP": val_mAP,
        "epochs": EPOCHS
    }
    

In [84]:
train_result = train(yolo, 
      yolo_loss, 
      train_loader=train_loader, 
      val_loader=val_loader, 
      optimizer=optimizer, 
      scheduler=scheduler)

                                                                                          

Epoch [1/47]: Loss=0.41366252567194683, Val Loss=4.3801164627075195


                                                                                          

Epoch [2/47]: Loss=0.40954702828503864, Val Loss=4.38690185546875


                                                                                          

Epoch [3/47]: Loss=0.4109177383818204, Val Loss=4.371411323547363


                                                                                          

Epoch [4/47]: Loss=0.40219785782355294, Val Loss=4.3650712966918945


                                                                                          

Epoch [5/47]: Loss=0.3980318743594085, Val Loss=4.370936393737793


                                                                                          

Epoch [6/47]: Loss=0.4051446548745602, Val Loss=4.36860466003418


                                                                                          

Epoch [7/47]: Loss=0.40083092998100234, Val Loss=4.366539478302002


                                                                                          

Epoch [8/47]: Loss=0.39550705650184725, Val Loss=4.3622541427612305


                                                                                          

Epoch [9/47]: Loss=0.395018798829634, Val Loss=4.384642124176025


                                                                                           

Epoch [10/47]: Loss=0.3903928287798845, Val Loss=4.391739368438721


                                                                                           

Epoch [11/47]: Loss=0.3879205463807794, Val Loss=4.3895673751831055


                                                                                           

Epoch [12/47]: Loss=0.38464951156815397, Val Loss=4.386770248413086


                                                                                           

Epoch [13/47]: Loss=0.38615043216113804, Val Loss=4.387179374694824


                                                                                           

Epoch [14/47]: Loss=0.3843506352056431, Val Loss=4.382532596588135


                                                                                           

Epoch [15/47]: Loss=0.3860694952780687, Val Loss=4.387601852416992


                                                                                           

Epoch [16/47]: Loss=0.3816681413710872, Val Loss=4.382450103759766


                                                                                           

Epoch [17/47]: Loss=0.3787918854740602, Val Loss=4.389592170715332


                                                                                          

Epoch [18/47]: Loss=0.3824614865493171, Val Loss=4.390236854553223


                                                                                          

Epoch [19/47]: Loss=0.38036430844023256, Val Loss=4.39033842086792


                                                                                          

Epoch [20/47]: Loss=0.37659589550163175, Val Loss=4.389893531799316


                                                                                          

Epoch [21/47]: Loss=0.3776979293627075, Val Loss=4.389444351196289


                                                                                          

Epoch [22/47]: Loss=0.37722086736673044, Val Loss=4.389580726623535


                                                                                          

Epoch [23/47]: Loss=0.3762362812516056, Val Loss=4.390537738800049


                                                                                          

Epoch [24/47]: Loss=0.3804168774734569, Val Loss=4.390586853027344


                                                                                          

Epoch [25/47]: Loss=0.3766538016026533, Val Loss=4.390893936157227


                                                                                          

Epoch [26/47]: Loss=0.3763019536869435, Val Loss=4.391341209411621


                                                                                          

Epoch [27/47]: Loss=0.3776222833349735, Val Loss=4.391352653503418


                                                                                          

Epoch [28/47]: Loss=0.377379163911071, Val Loss=4.392122268676758


                                                                                          

Epoch [29/47]: Loss=0.37795083639742455, Val Loss=4.391934394836426


                                                                                          

Epoch [30/47]: Loss=0.37918470535851734, Val Loss=4.392219543457031


                                                                                          

Epoch [31/47]: Loss=0.37744864048082616, Val Loss=4.392506122589111


                                                                                          

Epoch [32/47]: Loss=0.37390614706504194, Val Loss=4.391820907592773


                                                                                          

Epoch [33/47]: Loss=0.3743356722819654, Val Loss=4.39193868637085


                                                                                          

Epoch [34/47]: Loss=0.375195148436329, Val Loss=4.390186786651611


                                                                                          

Epoch [35/47]: Loss=0.3779275373944753, Val Loss=4.389100074768066


                                                                                          

Epoch [36/47]: Loss=0.3762403081489515, Val Loss=4.388833045959473


                                                                                          

Epoch [37/47]: Loss=0.37418102369278294, Val Loss=4.389584541320801


                                                                                          

Epoch [38/47]: Loss=0.3763273296099675, Val Loss=4.3899312019348145


                                                                                          

Epoch [39/47]: Loss=0.36977565005610263, Val Loss=4.389260292053223


                                                                                          

Epoch [40/47]: Loss=0.37279316367982307, Val Loss=4.389886856079102


                                                                                          

Epoch [41/47]: Loss=0.3770729942789561, Val Loss=4.390512466430664


                                                                                          

Epoch [42/47]: Loss=0.37287919223308563, Val Loss=4.390302658081055


                                                                                          

Epoch [43/47]: Loss=0.37403447247004207, Val Loss=4.389866352081299


                                                                                          

Epoch [44/47]: Loss=0.3790361426676376, Val Loss=4.390549659729004


                                                                                          

Epoch [45/47]: Loss=0.3753308700987055, Val Loss=4.392234802246094


                                                                                          

Epoch [46/47]: Loss=0.3763999663576295, Val Loss=4.3906402587890625


                                                                                          

Epoch [47/47]: Loss=0.37199418473092816, Val Loss=4.389849662780762


In [None]:
train_result

In [88]:
torch.save(yolo.state_dict(), "best_model.pth")