In [2]:
%load_ext autoreload
%autoreload 2

In [222]:
import torch
import torch.nn as nn
import torchvision
from torchvision.transforms import ToTensor, Resize, Compose
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import sys
sys.path.append('../')
import dataset
from torchvision.ops import box_iou
import torch.nn.functional as F

In [4]:
pascal_voc_train = torchvision.datasets.VOCDetection(
    root="../data",
    year="2007",
    image_set="train",
    download=False
)

In [5]:
voc_train = dataset.PascalVOC(pascal_voc=pascal_voc_train)

TRANSFORMING PASCAL VOC


In [6]:
def iou():
    pass

In [281]:
class YOLOv1Loss(nn.Module):
    """
    YOLOv1 Loss
    """
    def __init__(self, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
        """
        S: dimension of the S x S grid
        B: number of bounding boxes predicted by network
        C: number of classes
        lambda_coord: penalty for coord loss
        lambda_noobj: penalty for confidence loss when no object is present in target
        """
        super().__init__()
        
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        
    def xywh_to_x1y1x2y2(self, boxes: torch.Tensor) -> torch.Tensor:
        """
        Converts YOLO bounding box format to (x1, y1, x2, y2)
        
        pred: (X, 4)
        
        returns (X, 4)
        """
        x = boxes[..., 0] # (N, S^2, X)
        y = boxes[..., 1]
        w = boxes[..., 2]
        h = boxes[..., 3]
        
        x1 = x - w / 2
        y1 = y - h / 2
        x2 = x + w / 2
        y2 = y + h / 2
        
        x1y1x2y2 = torch.stack((x1, y1, x2, y2), dim=1)
        
        return x1y1x2y2
    
    def forward(self, pred, target):
        """
        pred: (N x S x S x (5 * B + C))
        target: (N x S x S x (5 + C))
        """    
        # check pred and target are in the correct shape
        assert len(pred) == len(target)
        N = len(pred)
        
        # get parameters of YOLO loss
        S = self.S
        B = self.B
        C = self.C
        lambda_coord = self.lambda_coord
        lambda_noobj = self.lambda_noobj
        
        assert pred.shape == torch.Size((N, S, S, 5 * B + C))
        assert target.shape == torch.Size((N, S, S, 5 + C))
        
        print("PRED SHAPE")
        print(pred.shape)
        
        print("TARGET SHAPE")
        print(target.shape)
        
        # obj, noobj mask: select bounding boxes whose target bounding box has a confidence=1 for obj and confidence=0
        # for noobj
        obj_mask = target[:, :, :, 0] == 1
        noobj_mask = target[:, :, :, 0] == 0
        
        # select predictions and targets where ground truth contains an object
        obj_pred = pred[obj_mask] # (num_obj, 5*B+C)
        obj_target = target[obj_mask] # (num_obj, 5+C)
        
        print("OBJ PRED + TARGET")
        print(obj_pred, obj_pred.shape)
        print(obj_target, obj_target.shape)
        
        # get bounding boxes
        obj_pred_bndbox = obj_pred[:, :5*B].view(-1, B, 5) # (num_obj, 5*B+C) -> (num_obj, B, 5)
        obj_target_bndbox = obj_target[:, :5].view(-1, 1, 5) # (num_obj, 5*B+C) -> (num_obj, 1, 5)
        
        print("OBJ PRED BNDBOX")
        print(obj_pred_bndbox, obj_pred_bndbox.shape)
        print(obj_target_bndbox, obj_target_bndbox.shape)
        
        # calculate ious
        max_iou_mask = torch.BoolTensor(obj_pred_bndbox.size())
        
        print(max_iou_mask, max_iou_mask.shape)
        
        print("CALCULATING IOUS")
        for i in range(obj_pred_bndbox.size(0)):
            pred_bndbox = obj_pred_bndbox[i][:, 1:] # (B, 4)
            target_bndbox = obj_target_bndbox[i][:, 1:] # (1, 4)
            print(pred_bndbox, target_bndbox)
            
            pred_bndbox = self.xywh_to_x1y1x2y2(pred_bndbox)
            target_bndbox = self.xywh_to_x1y1x2y2(target_bndbox)
            
            print(pred_bndbox)
            print(target_bndbox)
        
            ious = box_iou(pred_bndbox, target_bndbox).squeeze(-1) # (B)
            
            print("IOUS")
            print(ious)
            
            max_iou, max_idx = ious.max(dim=0)
            
            print(max_iou, max_idx)
        
            max_iou_mask[i, max_idx] = 1
        
        print(max_iou_mask)
        
        # responsible predictors
        obj_pred_bndbox = obj_pred_bndbox[max_iou_mask].view(-1, 5) # (num_obj, 5)
        
        print(obj_pred_bndbox)
        
     
        ###
        # Bounding Box Error
        ###
        print("BOUNDING BOX ERROR")
        pred_xy = obj_pred_bndbox[:, 1:3]
        target_xy = obj_target_bndbox.squeeze(1)[:, 1:3]
        print(pred_xy, target_xy)
        
        print(target_xy)
        
        xy_loss = lambda_coord * F.mse_loss(pred_xy, target_xy, reduction="sum")
          
        print(xy_loss)
        
        print("WH")
        pred_wh = torch.sqrt(obj_pred_bndbox[:, 3:5])
        target_wh = torch.sqrt(obj_target_bndbox.squeeze(1)[:, 3:5])
        
        print(pred_wh, target_wh)
        
        wh_loss = lambda_coord * F.mse_loss(pred_wh, target_wh, reduction="sum")
        
        print(wh_loss)
        
        obj_pred_confidence = obj_pred_bndbox[:, 0]
        obj_target_confidence = obj_target_bndbox.squeeze(1)[:, 0]
        
        
        print("OBJ confidence")
        print(obj_pred_confidence, obj_target_confidence)
        
        obj_confidence_loss = F.mse_loss(obj_pred_confidence, obj_target_confidence)
        print(obj_confidence_loss)
        
        ###
        # Confidence Error
        ###
        
        
        ###
        # Classification Error
        ###
        
        

In [282]:
S = 1
B = 2
C = 2

# S = 2
# B = 2
# C = 2

In [283]:
pred = torch.zeros((S, S, 5 * B + C))
pred[0, 0] = torch.tensor([1, 0.5, 0.5, 0.5, 0.5, 0.95, 0.5, 0.5, 1/7, 1/7, 0, 0.95])
pred = pred.unsqueeze(0)
pred.shape

# pred = torch.zeros((2, S, S, 5 * B + C))
# pred[0, 0, 0] = torch.tensor([1, 0.5, 0.5, 0.5, 0.5, 1, 0.5, 0.5, 1/7, 1/7, 0, 0.95])
# pred.shape

torch.Size([1, 1, 1, 12])

In [284]:
target = torch.zeros((S, S, 5 + C))
target[0, 0] = torch.tensor([1, 0.5, 0.5, 1/7, 1/7, 0, 1])
target = target.unsqueeze(0)
target.shape

# target = torch.zeros((2, S, S, 5 + C))
# target[0, 0, 0] = torch.tensor([1, 0.5, 0.5, 1/7, 1/7, 0, 1])
# target.shape

torch.Size([1, 1, 1, 7])

In [285]:
yolo_loss = YOLOv1Loss(
    S=S,
    B=B,
    C=C
)

yolo_loss(pred, target)

PRED SHAPE
torch.Size([1, 1, 1, 12])
TARGET SHAPE
torch.Size([1, 1, 1, 7])
OBJ PRED + TARGET
tensor([[1.0000, 0.5000, 0.5000, 0.5000, 0.5000, 0.9500, 0.5000, 0.5000, 0.1429,
         0.1429, 0.0000, 0.9500]]) torch.Size([1, 12])
tensor([[1.0000, 0.5000, 0.5000, 0.1429, 0.1429, 0.0000, 1.0000]]) torch.Size([1, 7])
OBJ PRED BNDBOX
tensor([[[1.0000, 0.5000, 0.5000, 0.5000, 0.5000],
         [0.9500, 0.5000, 0.5000, 0.1429, 0.1429]]]) torch.Size([1, 2, 5])
tensor([[[1.0000, 0.5000, 0.5000, 0.1429, 0.1429]]]) torch.Size([1, 1, 5])
tensor([[[False, False, False, False, False],
         [False, False, False, False, False]]]) torch.Size([1, 2, 5])
CALCULATING IOUS
tensor([[0.5000, 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.1429, 0.1429]]) tensor([[0.5000, 0.5000, 0.1429, 0.1429]])
tensor([[0.2500, 0.2500, 0.7500, 0.7500],
        [0.4286, 0.4286, 0.5714, 0.5714]])
tensor([[0.4286, 0.4286, 0.5714, 0.5714]])
IOUS
tensor([0.0816, 1.0000])
tensor(1.) tensor(1)
tensor([[[False, False, Fals

In [107]:
a = torch.zeros((1, 2, 2))

In [108]:
a.shape == torch.Size([1, 2, 2])

True

In [109]:
a.shape

torch.Size([1, 2, 2])