In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import torch
import torch.nn as nn
import torchvision
from torchvision.transforms import ToTensor, Resize, Compose
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import sys
sys.path.append('../')
import dataset
from torchvision.ops import box_iou
import torch.nn.functional as F

In [15]:
pascal_voc_train = torchvision.datasets.VOCDetection(
    root="../data",
    year="2007",
    image_set="train",
    download=False
)

In [16]:
voc_train = dataset.PascalVOC(pascal_voc=pascal_voc_train)

TRANSFORMING PASCAL VOC


In [17]:
def iou():
    pass

In [220]:
class YOLOv1Loss(nn.Module):
    """
    YOLOv1 Loss
    """
    def __init__(self, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
        """
        S: dimension of the S x S grid
        B: number of bounding boxes predicted by network
        C: number of classes
        lambda_coord: penalty for coord loss
        lambda_noobj: penalty for confidence loss when no object is present in target
        """
        super().__init__()
        
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        
    def xywh_to_x1y1x2y2(self, boxes: torch.Tensor) -> torch.Tensor:
        """
        Converts YOLO bounding box format to (x1, y1, x2, y2)
        
        pred: (X, 4)
        
        returns (X, 4)
        """
        x = boxes[..., 0] # (N, S^2, X)
        y = boxes[..., 1]
        w = boxes[..., 2]
        h = boxes[..., 3]
        
        x1 = x - w / 2
        y1 = y - h / 2
        x2 = x + w / 2
        y2 = y + h / 2
        
        x1y1x2y2 = torch.stack((x1, y1, x2, y2), dim=1)
        
        return x1y1x2y2
    
    def forward(self, pred, target):
        """
        pred: (N x S x S x (5 * B + C))
        target: (N x S x S x (5 + C))
        """    
        print("CALCULATING YOLO LOSS")
        
        # check pred and target are in the correct shape
        assert len(pred) == len(target)
        N = pred.size(0)
        
        print(f"BATCH SIZE: {N}")
        
        # get parameters of YOLO loss
        S = self.S
        B = self.B
        C = self.C
        lambda_coord = self.lambda_coord
        lambda_noobj = self.lambda_noobj
        
        assert pred.shape == torch.Size((N, S, S, 5 * B + C))
        assert target.shape == torch.Size((N, S, S, 5 + C))
        
        # set requires_grad=True
        # pred = pred.clone().requires_grad_()
        # target = target.clone().requires_grad_()
        
        # obj, noobj mask: select bounding boxes whose target bounding box has a confidence=1 for obj and confidence=0
        # for noobj
        obj_mask = target[:, :, :, 0] == 1
        noobj_mask = target[:, :, :, 0] == 0
        
        # select predictions and targets where ground truth contains an object
        obj_pred = pred[obj_mask] # (num_obj, 5*B+C)
        obj_target = target[obj_mask] # (num_obj, 5+C)
        
        # get bounding boxes
        obj_pred_bndbox = obj_pred[:, :5*B].view(-1, B, 5) # (num_obj, 5*B+C) -> (num_obj, B, 5)
        obj_target_bndbox = obj_target[:, :5].view(-1, 1, 5) # (num_obj, 5*B+C) -> (num_obj, 1, 5)
        
        print("BOUNDING BOXES")
        print(obj_pred_bndbox, obj_pred_bndbox.shape)
        print(obj_target_bndbox, obj_target_bndbox.shape)
        
        print("OBJ PRED")
        print(obj_pred, obj_pred.shape)
        
        print("OBJ TARGET")
        print(obj_target, obj_target.shape)
        
        # select predictions and targets where ground grouth does not contain an object
        noobj_pred = pred[noobj_mask] # (num_noobj, 5*B+C)
        noobj_target = target[noobj_mask] # (num_obj, 5+C)
        
        # get bounding boxes for target's whose confidenc=0
        noobj_pred_bndbox = noobj_pred[:, :5*B].view(-1, B, 5) # (num_noobj, 5*B+C) -> (num_noobj, B, 5)
        noobj_target_bndbox = noobj_target[:, :5].view(-1, 1, 5)  # (num_noobj, 5*B+C) -> (num_noobj, 1, 5)
        
        # calculate ious
        max_iou_mask = torch.BoolTensor(obj_pred_bndbox.size())
        
        print("SELECTING PREDICTOR BOXES")
        for i in range(obj_pred_bndbox.size(0)):
            # get proposed boxes and target box
            pred_bndbox = obj_pred_bndbox[i][:, 1:] # (B, 4)
            target_bndbox = obj_target_bndbox[i][:, 1:] # (1, 4)
            
            # convert (x, y, w, h) -> (x1, y1, x2, y2)
            pred_bndbox = self.xywh_to_x1y1x2y2(pred_bndbox)
            target_bndbox = self.xywh_to_x1y1x2y2(target_bndbox)
        
            # get box ious
            ious = box_iou(pred_bndbox, target_bndbox).squeeze(-1) # (B)
            
            # get the box with the max iou and keep in mask
            max_iou, max_idx = ious.max(dim=0)
            max_iou_mask[i, max_idx] = 1

        # responsible predictors
        obj_pred_bndbox = obj_pred_bndbox[max_iou_mask].view(-1, 5) # (num_obj, 5)    
        obj_target_bndbox = obj_target_bndbox.squeeze(1) # (num_obj, 5)
        
        print("RESPONSIBLE PREDICTORS")
        print(obj_pred_bndbox, obj_pred_bndbox.shape)
        print(obj_target_bndbox, obj_target_bndbox.shape)
     
        ###
        # Bounding Box Loss
        ###
        print("BOUNDING BOX Loss")
        pred_xy = obj_pred_bndbox[:, 1:3]
        target_xy = obj_target_bndbox[:, 1:3]
        xy_loss = lambda_coord * F.mse_loss(pred_xy, target_xy, reduction="sum")
        print(xy_loss)
          
        pred_wh = torch.sqrt(obj_pred_bndbox[:, 3:5])
        target_wh = torch.sqrt(obj_target_bndbox[:, 3:5])
        wh_loss = lambda_coord * F.mse_loss(pred_wh, target_wh, reduction="sum")
        print(wh_loss)
        
        localization_loss = xy_loss + wh_loss
        
        ###
        # Confidence Loss
        ###
        print("CONFIDENCE Loss")
        obj_pred_confidence = obj_pred_bndbox[:, 0]
        obj_target_confidence = obj_target_bndbox.squeeze(1)[:, 0]
        obj_confidence_loss = F.mse_loss(obj_pred_confidence, obj_target_confidence, reduction="sum")        
        
        print(noobj_pred_bndbox, noobj_pred_bndbox.shape)
        print(noobj_target_bndbox, noobj_target_bndbox.shape)
        print(obj_confidence_loss)
        
        noobj_pred_confidence = noobj_pred_bndbox[:, :, 0] # (num_noobj, 2)
        noobj_target_confidence = noobj_target_bndbox[:, :, 0][:, [0, 0]] # (num_noobj, 2) -> duplicated target for every bounding box
        
        print("NOOBJ CONFIDENCE Loss")
        print(noobj_pred_confidence)
        print(noobj_target_confidence)
        
        noobj_confidence_loss = lambda_noobj * F.mse_loss(noobj_pred_confidence, noobj_target_confidence, reduction="sum")
        print(noobj_confidence_loss)
        
        confidence_loss = obj_confidence_loss + noobj_confidence_loss
        
        ###
        # Classification Loss
        ###
        print("CLASSIFICATION LOSS")
        obj_pred_classification = obj_pred[:, -C:] # (num_obj, C)
        obj_target_classification = obj_target[:, -C:] # (num_obj, C)
        classification_loss = F.mse_loss(obj_pred_classification, obj_target_classification, reduction="sum")
        
        print(obj_pred_classification)
        print(obj_target_classification)
        print(classification_loss)
        
        # total loss
        print(localization_loss, confidence_loss, classification_loss)
        loss = (localization_loss + confidence_loss + classification_loss) / N
        
        return loss
        
        

In [221]:
# S = 1
# B = 2
# C = 2

S = 2
B = 2
C = 2

In [226]:
pred = torch.zeros((S, S, 5 * B + C))
pred[0, 0] = torch.tensor([1, 0.5, 0.5, 0.5, 0.5, 0.95, 0.5, 0.5, 1/7, 1/7, 0, 0.95])
pred[0, 1] = torch.tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0.95])
pred = pred.unsqueeze(0)
pred.shape
pred.requires_grad_()

# pred = torch.zeros((2, S, S, 5 * B + C))
# pred[0, 0, 0] = torch.tensor([1, 0.5, 0.5, 0.5, 0.5, 1, 0.5, 0.5, 1/7, 1/7, 0, 0.95])
# pred.shape

print(pred)

tensor([[[[1.0000, 0.5000, 0.5000, 0.5000, 0.5000, 0.9500, 0.5000, 0.5000,
           0.1429, 0.1429, 0.0000, 0.9500],
          [0.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
           1.0000, 1.0000, 0.0000, 0.9500]],

         [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000]]]], requires_grad=True)


In [229]:
target = torch.zeros((S, S, 5 + C))
target[0, 0] = torch.tensor([1, 0.5, 0.5, 1/7, 1/7, 0, 1])
target = target.unsqueeze(0)
target.shape

target.requires_grad_()

# target = torch.zeros((2, S, S, 5 + C))
# target[0, 0, 0] = torch.tensor([1, 0.5, 0.5, 1/7, 1/7, 0, 1])
# target.shape

tensor([[[[1.0000, 0.5000, 0.5000, 0.1429, 0.1429, 0.0000, 1.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],

         [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]],
       requires_grad=True)

In [230]:
yolo_loss = YOLOv1Loss(
    S=S,
    B=B,
    C=C
)

loss = yolo_loss(pred, target)
loss

CALCULATING YOLO LOSS
BATCH SIZE: 1
BOUNDING BOXES
tensor([[[1.0000, 0.5000, 0.5000, 0.5000, 0.5000],
         [0.9500, 0.5000, 0.5000, 0.1429, 0.1429]]], grad_fn=<ViewBackward0>) torch.Size([1, 2, 5])
tensor([[[1.0000, 0.5000, 0.5000, 0.1429, 0.1429]]], grad_fn=<ViewBackward0>) torch.Size([1, 1, 5])
OBJ PRED
tensor([[1.0000, 0.5000, 0.5000, 0.5000, 0.5000, 0.9500, 0.5000, 0.5000, 0.1429,
         0.1429, 0.0000, 0.9500]], grad_fn=<IndexBackward0>) torch.Size([1, 12])
OBJ TARGET
tensor([[1.0000, 0.5000, 0.5000, 0.1429, 0.1429, 0.0000, 1.0000]],
       grad_fn=<IndexBackward0>) torch.Size([1, 7])
SELECTING PREDICTOR BOXES
RESPONSIBLE PREDICTORS
tensor([[0.9500, 0.5000, 0.5000, 0.1429, 0.1429]], grad_fn=<ViewBackward0>) torch.Size([1, 5])
tensor([[1.0000, 0.5000, 0.5000, 0.1429, 0.1429]], grad_fn=<SqueezeBackward1>) torch.Size([1, 5])
BOUNDING BOX Loss
tensor(0., grad_fn=<MulBackward0>)
tensor(0., grad_fn=<MulBackward0>)
CONFIDENCE Loss
tensor([[[0., 1., 1., 1., 1.],
         [1., 1., 1.

tensor(0.5050, grad_fn=<DivBackward0>)

In [231]:
loss.backward()

In [107]:
a = torch.zeros((1, 2, 2))

In [108]:
a.shape == torch.Size([1, 2, 2])

True

In [109]:
a.shape

torch.Size([1, 2, 2])