In [1]:
%load_ext autoreload
%autoreload 2

In [203]:
import torch
import torch.nn as nn
import torchvision
from torchvision.transforms import ToTensor, Resize, Compose
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import sys
sys.path.append('../')
import dataset
from torchvision.ops import box_iou

In [3]:
pascal_voc_train = torchvision.datasets.VOCDetection(
    root="../data",
    year="2007",
    image_set="train",
    download=False
)

In [4]:
voc_train = dataset.PascalVOC(pascal_voc=pascal_voc_train)

TRANSFORMING PASCAL VOC


In [204]:
def iou():
    pass

In [351]:
class YOLOv1Loss(nn.Module):
    """
    YOLOv1 Loss
    """
    def __init__(self, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
        """
        S: dimension of the S x S grid
        B: number of bounding boxes predicted by network
        C: number of classes
        lambda_coord: penalty for coord loss
        lambda_noobj: penalty for confidence loss when no object is present in target
        """
        super().__init__()
        
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        
    def _iou(self, pred, target):
        pass
    
    def forward(self, pred, target):
        """
        pred: (N x S x S x (5 * B + C))
        target: (N x S x S x (5 + C))
        """
        
        print("YOLO LOSS")
        
        # check pred and target are in the correct shape
        assert len(pred) == len(target)
        N = len(pred)
        
        # get parameters of YOLO loss
        S = self.S
        B = self.B
        C = self.C
        lambda_coord = self.lambda_coord
        lambda_noobj = self.lambda_noobj
        
        assert pred.shape == torch.Size((N, S, S, 5 * B + C))
        assert target.shape == torch.Size((N, S, S, 5 + C))
        
        # flatten S x S grid into S^2
        pred = pred.view(N, S**2, -1) # (N, S, S, 5 * B + C) -> (N, S^2, 5 * B + C)
        target = target.view(N, S**2, -1) # (N, S, S, 5 + C) -> (N, S^2, 5 + C)
        
        print("flattening S x S to S^2")
        print(pred.shape, target.shape)
        
        # seperate tensor into box + classification
        print("seperating tensor into box + classification")
        pred_bndboxes = pred[:, :, 0:5 * B] # (N, S^2, 5 * B + C) -> (N, S^2, 5 * B)
        target_bndbox = target[:, :, 0:5] # (N, S^2, 5 + C) -> (N, S^2, 5)
        
        print("getting confidence")
        pred_confidences = pred_bndboxes[..., 0:-1:5] # (N, S^2, 5 * B) -> (N, S^2, B)
        target_confidence = target_bndbox[..., 0] # (N, S^2, 5) -> (N, S^2)
        
        print(pred_confidences, pred_confidences.shape)
        print(target_confidence, target_confidence.shape)
        
        print("getting bounding box")
        print(pred_bndboxes, pred_bndboxes.shape)
        
        box_indices = torch.arange(0, 5 * B) % 5 != 0 # mask for every 2nd, 3rd, 4th, and 5th element
        pred_boxes = pred_bndboxes[..., box_indices] # (N, S^2, 5 * B) -> (N, S^2, 4 * B)
        target_box = target_bndbox[:, :, 1: 5] # (N, S^2, 4)
        
        print("target bndbox")
        print(target_bndbox)
        print(pred_boxes, target_box)
        
        
        # print(pred_boxes, pred_boxes.shape)
        # print(target_box, target_box.shape)
        
        print("getting classification")
        pred_classification = pred[:, :, 5 * B: 5 * B + C] # (N, S^2, 5 * B + C) -> (N, S^2, C)
        target_classification = target[:, :, 5: 5 + C] # (N, S^2, 5 + C) -> (N, S^2, C)
        
        # print(pred_classification, pred_classification.shape)
        # print(target_classification, target_classification.shape)
        
        # calculate IOU between predicted boxes and target box
        
        
        
        

In [352]:
S = 1
B = 2
C = 2

In [353]:
pred = torch.zeros((S, S, 5 * B + C))
pred[0, 0] = torch.tensor([1, 0.5, 0.5, 0.5, 0.5, 1, 0.5, 0.5, 1/7, 1/7, 0, 0.95])
pred = pred.unsqueeze(0)
pred.shape

torch.Size([1, 1, 1, 12])

In [354]:
target = torch.zeros((S, S, 5 + C))
target[0, 0] = torch.tensor([1, 0.5, 0.5, 1/7, 1/7, 0, 1])
target = target.unsqueeze(0)
target.shape

torch.Size([1, 1, 1, 7])

In [355]:
yolo_loss = YOLOv1Loss(
    S=S,
    B=B,
    C=C
)

yolo_loss

YOLOv1Loss()

In [356]:
yolo_loss(pred, target)

YOLO LOSS
flattening S x S to S^2
torch.Size([1, 1, 12]) torch.Size([1, 1, 7])
seperating tensor into box + classification
getting confidence
tensor([[[1., 1.]]]) torch.Size([1, 1, 2])
tensor([[1.]]) torch.Size([1, 1])
getting bounding box
tensor([[[1.0000, 0.5000, 0.5000, 0.5000, 0.5000, 1.0000, 0.5000, 0.5000,
          0.1429, 0.1429]]]) torch.Size([1, 1, 10])
target bndbox
tensor([[[1.0000, 0.5000, 0.5000, 0.1429, 0.1429]]])
tensor([[[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.1429, 0.1429]]]) tensor([[[0.5000, 0.5000, 0.1429, 0.1429]]])
getting classification


In [107]:
a = torch.zeros((1, 2, 2))

In [108]:
a.shape == torch.Size([1, 2, 2])

True

In [109]:
a.shape

torch.Size([1, 2, 2])