In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR, StepLR, ExponentialLR
import torchvision
import matplotlib.pyplot as plt
import dataset
from model import ResNet18YOLOv1
from loss import YOLOv1Loss
from tqdm import tqdm
from evaluate import get_bboxes, mean_average_precision

# About
This is an implementation of YOLOv1 from ***You Only Look Once: Unified, Real-Time Object Detection by Joseph Redmon, Santosh Divvala, Ross Girshick, and Ali Firhadi.*** Object detection is figuring out what objects are in an image and where they are. Another way to look at this problem is how can we write a computer program that draws bounding boxes around objects and predicts what kind of objects they are. YOLO solves this problem and does it super fast, like state of the art fast! I made slight modifications to the architecture and loss function which I'll discuss further down.

Let's talk about R-CNN, the predecessor to YOLO. It proposed regions, ran a classifier on every region, and did some post-processing to produce the final result. In simple language this translates to:
1. Lemme draw a lot of bounding boxes where I think objects are
2. Lemme figure out what are in the bounding boxes I drew
3. Ok, I drew too many bounding boxes, lemme remove most of them and keep the important ones

This is a lot of steps. What YOLO does instead is ***unified detection***. Unified detection combines the different components of object detection (where are the objects and what kind of objects are they) into one Convolutional Neural Network. You give it an image and in one swoop, it tells you exactly that.

Here's how it does it:
1. Divide the image into a SxS grid
2. Each cell in the grid predicts B bounding boxes and C class probabilities (what it thinks the object is)

We represent bounding boxes with 5 numbers: x, y, w, h, p.
- (x, y): center of the bounding box
- w: width
- h: height
- p: confidence (a measure of how confident we are that this box captures an object and matches the ground truth)

Accordingly, YOLOv1 produces a SxSx(5B+C) tensor. Each cell predicts B bounding boxes, how do we choose which one is the "true" predictor? How do we measure how good our bounding box and classification predictions are? 

We check which bounding box has the greatest overlap (IOU: Intersection Over Union) with the ground truth and choose that one as a predictor. We use this loss function to measure the "goodness" of our predictions:

![yolo loss function](https://i.stack.imgur.com/IddFu.png)

On a high level, it is the squared error between our prediction and the ground truth. Let's start training!

# Data (PASCAL VOC 2007)
PASCAL VOC Detection Dataset contains annotated images with 20 labelled classes and bounding boxes. There are 2,501 images in the training set, 2,510 images in the validation set, and 4,952 images in the test set.

In [4]:
# original dataset
pascal_voc_train = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="train",
    download=False
)

pascal_voc_val = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="val",
    download=False
)

pascal_voc_test = torchvision.datasets.VOCDetection(
    root="data",
    year="2007",
    image_set="test",
    download=False
)

# resize to 448x448, normalize, and convert annotations to target tensors
voc_train = dataset.PascalVOC(pascal_voc=pascal_voc_train)
voc_val = dataset.PascalVOC(pascal_voc=pascal_voc_val)
voc_test = dataset.PascalVOC(pascal_voc=pascal_voc_test)

In [5]:
# dataloaders
BATCH_SIZE = 64

train_loader = DataLoader(voc_train, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(voc_val, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(voc_test, batch_size=BATCH_SIZE, shuffle=True)

In [6]:
for x, y in train_loader:
    print(x.shape, y.shape)
    break

torch.Size([64, 3, 448, 448]) torch.Size([64, 7, 7, 25])


# Device

In [7]:
DEVICE = "cpu"

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    DEVICE = torch.device("mps")

DEVICE

'cpu'

# Hyperparameters
- S: dimensions of SxS grid
- B: number of bounding boxes predicted per cell
- C: number of classes
- lambda_coord: penalty on incorrect localization loss
- lambda_noobj: penalty on incorrect noobj confidence loss

In [8]:
S = 7
B = 2
C = 20
lambda_coord = 5.0
lambda_noobj = 0.5

# Training Setup
Model, loss function, optimizer, scheduler and evaluation utils

- Model: ResNet18 convolutional layers pretrained on ImageNet with 2 feedforward layers outputting a (N x S x S x (5B + C)) tensor
- Loss: Squared Error Loss
- Optimizer + Scheduler: Stochastic Gradient Descent with momentum of 0.9 and weight decay of 0.0005. We train with learning rate set to 1e-3 for the first 75 epochs, 1e-4 for the next 30 epochs, and 1e-5 for the final 30 epochs. 

In [9]:
yolo = ResNet18YOLOv1(S=S, B=B, C=C).to(DEVICE)
yolo_loss = YOLOv1Loss(S=S, B=B, C=C, lambda_coord=lambda_coord, lambda_noobj=lambda_noobj)

In [10]:
MOMENTUM = 0.9
WEIGHT_DECAY = 0.0005
PRE_LEARNING_RATE = 1e-4
LEARNING_RATE = 1e-3
PRE_EPOCHS = 5
EPOCHS = 135

optimizer = torch.optim.SGD(yolo.parameters(), lr=PRE_LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
pre_exp_factor = (LEARNING_RATE / PRE_LEARNING_RATE) ** (1 / PRE_EPOCHS)
pre_scheduler = ExponentialLR(optimizer, gamma=pre_exp_factor)
scheduler = MultiStepLR(optimizer, milestones=[75, 105], gamma=0.1)

## Evaluating Loss

In [25]:
def compute_loss(model, criterion, dataloader):
    total_loss = 0
    model.eval()
    
    with torch.no_grad():
        for X, Y in dataloader:
            X = X.to(DEVICE)
            Y = Y.to(DEVICE)

            pred = model(X)
            loss = criterion(pred, Y)
            total_loss += loss.item()
            break
            
    N = len(dataloader)
    # loss = total_loss / N
    loss = total_loss
    return loss

## Evaluating mAP

In [26]:
CONFIDENCE_THRESHOLD = 0.25
IOU_THRESHOLD = 0.5

In [27]:
def bboxes(model, dataloader):
    model.eval()
    pred_bboxes = []
    target_bboxes = []
    
    with torch.no_grad():
        for X, Y in dataloader:
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            pred = model(X)
            
            for i in range(len(X)):
                x = pred[i]
                y = Y[i]
                
                pred_bbox = get_bboxes(x, confidence_threshold=CONFIDENCE_THRESHOLD, iou_threshold=IOU_THRESHOLD, S=S, B=B, C=C)
                target_bbox = get_bboxes(y, confidence_threshold=CONFIDENCE_THRESHOLD, iou_threshold=IOU_THRESHOLD, S=S, B=1, C=C)
                
                pred_bboxes.append(pred_bbox)
                target_bboxes.append(target_bbox)
    
    return pred_bboxes, target_bboxes

In [28]:
def compute_mAP(model, dataloader):
    pred_bboxes, target_bboxes = bboxes(model, dataloader)
    mAP = mean_average_precision(pred_bboxes, target_bboxes, iou_threshold=IOU_THRESHOLD, C=C)
    
    return mAP

# Training

In [29]:
torch.cuda.empty_cache()

In [30]:
def train(model, criterion, train_loader, val_loader, optimizer, scheduler, pre_scheduler, pre_epochs, epochs):
    train_losses = []
    val_losses = []
    best_val_loss = float("inf")
    active_scheduler = pre_scheduler
    
    N = len(train_loader)
    
    for epoch in range(pre_epochs + epochs):
        # set to train mode
        model.train()
        total_loss = 0
        
        lr = optimizer.param_groups[0]["lr"]
        pbar = tqdm(train_loader, leave=False, desc=f"Epoch [{epoch+1}/{pre_epochs + epochs}]: lr={lr}")
        
        for i, (X, Y) in enumerate(pbar):
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            pred = model(X)
            loss = criterion(pred, Y)
            train_losses.append(loss.item())
            total_loss += loss.item()
            # val_loss = compute_loss(model, criterion, val_loader)
            # val_losses.append(val_loss)
            
            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # update progress bar
            pbar.set_postfix(batch_loss=loss.item())
        
        # update learning rate with scheduler
        active_scheduler.step()  
        
        # calculate metrics
        train_loss = total_loss / N
        val_loss = compute_loss(model, criterion, val_loader)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # save best model
            print("=> saving model")
            torch.save(model.state_dict(), "best_model.pth")
        
        if epoch == pre_epochs - 1:
            active_scheduler = scheduler
        
        print(f"Epoch [{epoch+1}/{EPOCHS}]: Loss={train_loss}, Val Loss={val_loss}")
    
    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "pre_epochs": pre_epochs,
        "epochs": epochs
    }
    

In [31]:
train_result = train(yolo, 
      yolo_loss, 
      train_loader=train_loader, 
      val_loader=val_loader, 
      optimizer=optimizer, 
      pre_scheduler=pre_scheduler,
      scheduler=scheduler,
      pre_epochs=PRE_EPOCHS,
      epochs=EPOCHS
    )

                                                                                          

=> saving model
Epoch [1/135]: Loss=18.438377451896667, Val Loss=6.989580154418945


                                                                                                          

=> saving model
Epoch [2/135]: Loss=8.007345354557037, Val Loss=5.3820953369140625


                                                                                                          

=> saving model
Epoch [3/135]: Loss=6.713325822353363, Val Loss=5.198844909667969


                                                                                                          

=> saving model
Epoch [4/135]: Loss=6.016474312543869, Val Loss=5.052807807922363


                                                                                                         

=> saving model
Epoch [5/135]: Loss=5.434785181283951, Val Loss=4.992927551269531


                                                                                                         

=> saving model
Epoch [6/135]: Loss=4.875706040859223, Val Loss=4.884123802185059


                                                                                                         

=> saving model
Epoch [7/135]: Loss=4.21184828877449, Val Loss=4.815620422363281


                                                                                                         

=> saving model
Epoch [8/135]: Loss=3.8218248069286345, Val Loss=4.735628128051758


                                                                                                         

Epoch [9/135]: Loss=3.4411068379879, Val Loss=4.891972541809082


                                                                                                          

Epoch [10/135]: Loss=3.1556669950485228, Val Loss=4.972096920013428


                                                                                                          

Epoch [11/135]: Loss=3.030849117040634, Val Loss=4.87185525894165


                                                                                                          

Epoch [12/135]: Loss=2.7462028205394744, Val Loss=4.743162631988525


                                                                                                          

=> saving model
Epoch [13/135]: Loss=2.559248313307762, Val Loss=4.699849605560303


                                                                                                          

=> saving model
Epoch [14/135]: Loss=2.365251213312149, Val Loss=4.554368019104004


                                                                                                          

=> saving model
Epoch [15/135]: Loss=2.217328372597694, Val Loss=4.534677982330322


                                                                                                          

Epoch [16/135]: Loss=2.129265856742859, Val Loss=4.605022430419922


                                                                                                          

Epoch [17/135]: Loss=2.0250596582889555, Val Loss=4.616058826446533


                                                                                                          

Epoch [18/135]: Loss=1.9047053337097168, Val Loss=4.643465042114258


                                                                                                          

Epoch [19/135]: Loss=1.8305783420801163, Val Loss=4.63784122467041


                                                                                                          

Epoch [20/135]: Loss=1.7734659790992737, Val Loss=4.582029342651367


                                                                                                          

Epoch [21/135]: Loss=1.697898694872856, Val Loss=4.712367057800293


                                                                                                          

Epoch [22/135]: Loss=1.6990468353033066, Val Loss=4.88136625289917


                                                                                                          

Epoch [23/135]: Loss=1.7102133721113204, Val Loss=4.799613952636719


                                                                                                          

Epoch [24/135]: Loss=1.6193825393915176, Val Loss=4.661437034606934


                                                                                                          

Epoch [25/135]: Loss=1.5192356407642365, Val Loss=4.717012405395508


                                                                                                          

Epoch [26/135]: Loss=1.5209031015634538, Val Loss=4.58297061920166


                                                                                                          

Epoch [27/135]: Loss=1.4396533459424972, Val Loss=4.7719526290893555


                                                                                                          

Epoch [28/135]: Loss=1.3692256718873979, Val Loss=4.725193023681641


                                                                                                          

Epoch [29/135]: Loss=1.2999969244003295, Val Loss=4.629217147827148


                                                                                                           

=> saving model
Epoch [30/135]: Loss=1.2728725776076317, Val Loss=4.513859272003174


                                                                                                           

=> saving model
Epoch [31/135]: Loss=1.2516739875078202, Val Loss=4.5059099197387695


                                                                                                           

=> saving model
Epoch [32/135]: Loss=1.2544240176677703, Val Loss=4.470510482788086


                                                                                                           

Epoch [33/135]: Loss=1.2463082030415535, Val Loss=4.544309616088867


                                                                                                           

Epoch [34/135]: Loss=1.2328982591629027, Val Loss=4.526529312133789


                                                                                                           

Epoch [35/135]: Loss=1.1947389557957648, Val Loss=4.575481414794922


                                                                                                            

=> saving model
Epoch [36/135]: Loss=1.1573955044150352, Val Loss=4.42613410949707


                                                                                                            

=> saving model
Epoch [37/135]: Loss=1.0422888562083243, Val Loss=4.385383605957031


                                                                                                            

=> saving model
Epoch [38/135]: Loss=1.0121290519833566, Val Loss=4.380411148071289


                                                                                                            

Epoch [39/135]: Loss=0.9902253568172454, Val Loss=4.3873610496521


                                                                                                            

Epoch [40/135]: Loss=0.9798985123634338, Val Loss=4.3965888023376465


                                                                                                            

Epoch [41/135]: Loss=0.9556765392422676, Val Loss=4.386624336242676


                                                                                                            

=> saving model
Epoch [42/135]: Loss=0.9503527671098709, Val Loss=4.374203681945801


                                                                                                            

Epoch [43/135]: Loss=0.9327093988656998, Val Loss=4.394341468811035


                                                                                                            

Epoch [44/135]: Loss=0.9262596666812897, Val Loss=4.385965347290039


                                                                                                            

Epoch [45/135]: Loss=0.9012362152338028, Val Loss=4.388501167297363


                                                                                                            

Epoch [46/135]: Loss=0.9161386415362358, Val Loss=4.387277126312256


                                                                                                            

Epoch [47/135]: Loss=0.9028321355581284, Val Loss=4.3760666847229


                                                                                                            

=> saving model
Epoch [48/135]: Loss=0.8938896223902703, Val Loss=4.365889549255371


                                                                                                            

=> saving model
Epoch [49/135]: Loss=0.8903761774301528, Val Loss=4.364752769470215


                                                                                                            

Epoch [50/135]: Loss=0.8766697153449059, Val Loss=4.378524303436279


                                                                                                            

=> saving model
Epoch [51/135]: Loss=0.8657611012458801, Val Loss=4.351946830749512


                                                                                                            

Epoch [52/135]: Loss=0.8627159699797631, Val Loss=4.382003307342529


                                                                                                            

Epoch [53/135]: Loss=0.8674574047327042, Val Loss=4.364062786102295


                                                                                                            

Epoch [54/135]: Loss=0.856218321621418, Val Loss=4.377956867218018


                                                                                                            

Epoch [55/135]: Loss=0.8415544688701629, Val Loss=4.395225524902344


                                                                                                            

Epoch [56/135]: Loss=0.8417152047157288, Val Loss=4.392341136932373


                                                                                                            

Epoch [57/135]: Loss=0.8397885590791703, Val Loss=4.370244026184082


                                                                                                            

Epoch [58/135]: Loss=0.8337332144379616, Val Loss=4.377070426940918


                                                                                                            

Epoch [59/135]: Loss=0.8340521842241287, Val Loss=4.353064060211182


                                                                                                            

=> saving model
Epoch [60/135]: Loss=0.8205058261752128, Val Loss=4.332716464996338


                                                                                                            

Epoch [61/135]: Loss=0.8144300445914269, Val Loss=4.3733673095703125


                                                                                                            

Epoch [62/135]: Loss=0.8234325975179673, Val Loss=4.332716941833496


                                                                                                            

=> saving model
Epoch [63/135]: Loss=0.8220765084028244, Val Loss=4.330210208892822


                                                                                                            

Epoch [64/135]: Loss=0.8050012469291687, Val Loss=4.3347601890563965


                                                                                                            

=> saving model
Epoch [65/135]: Loss=0.8122555375099182, Val Loss=4.328667640686035


                                                                                                            

Epoch [66/135]: Loss=0.7992564469575882, Val Loss=4.329145431518555


                                                                                                            

Epoch [67/135]: Loss=0.80242008715868, Val Loss=4.329507827758789


                                                                                                            

Epoch [68/135]: Loss=0.8084523305296898, Val Loss=4.329780101776123


                                                                                                            

Epoch [69/135]: Loss=0.8069570437073708, Val Loss=4.329713821411133


                                                                                                            

Epoch [70/135]: Loss=0.7978807762265205, Val Loss=4.331157684326172


                                                                                                           

KeyboardInterrupt: 

In [33]:
train_result

{'train_losses': [0.8605380043387413,
  0.8637479037046433,
  0.8546822085976601,
  0.8509144648909569,
  0.8564467936754226,
  0.8534302830696106,
  0.8544076174497605,
  0.850600641965866,
  0.8588794693350792,
  0.8570745304226876,
  0.8504234209656716,
  0.8501313015818596,
  0.8508121773600579,
  0.8593935877084732,
  0.8524240866303444,
  0.8544886112213135,
  0.849775955080986,
  0.8467658802866935,
  0.8504383265972137,
  0.8455603152513504,
  0.8545303136110306,
  0.8436845406889916,
  0.8530150756239891,
  0.8486282780766488,
  0.8435310542583465,
  0.8553196310997009,
  0.8475099980831147,
  0.836984796822071,
  0.8407960385084152,
  0.8525115951895714],
 'val_losses': [4.38873815536499,
  4.387818813323975,
  4.388105869293213,
  4.387928009033203,
  4.388128757476807,
  4.389425754547119,
  4.3873491287231445,
  4.3884382247924805,
  4.388421058654785,
  4.386437892913818,
  4.385745048522949,
  4.385434150695801,
  4.385828018188477,
  4.385298728942871,
  4.3854475021362

# Evaluate

In [34]:
compute_mAP(yolo, val_loader)

0.15139479707577266

In [20]:
compute_mAP(yolo, train_loader)

0.5534197650849819