In [9]:
from google.colab import files
 
uploaded = files.upload()
 
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 69 bytes


In [10]:
# Fetch the aladdinpersson/pascalvoc-yolo dataset archive with the Kaggle CLI.
# Needs the API token in ~/.kaggle/kaggle.json; the archive is several GB.
!kaggle datasets download -d aladdinpersson/pascalvoc-yolo

Downloading pascalvoc-yolo.zip to /content
100% 4.30G/4.31G [00:44<00:00, 134MB/s]
100% 4.31G/4.31G [00:44<00:00, 103MB/s]


In [13]:
# !mkdir pascal_voc_yolo; mv pascalvoc-yolo.zip pascal_voc_yolo; cd pascal_voc_yolo; unzip pascalvoc-yolo.zip

In [15]:
import os

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.utils.data import Dataset, DataLoader

In [16]:
class PascalVOCDataSet(Dataset):
    """Dataset yielding (image, YOLO label grid) pairs.

    Parameters:
        csv_file: path to a CSV whose column 0 holds the image file name and
            column 1 the matching label file name.
        img_dir: directory containing the image files.
        label_dir: directory containing the label files. Every non-blank line
            of a label file is ``class x y width height``.
            NOTE(review): coordinates are presumably normalised to [0, 1]
            relative to the image - confirm against the dataset.
        S: number of grid cells per image side.
        B: number of boxes predicted per cell (only affects the last dimension
            of the label grid).
        C: number of classes.
        transform: optional callable ``(image, boxes) -> (image, boxes)``.

    Each item is ``(image, label_matrix)`` where ``label_matrix`` has shape
    ``(S, S, C + 5 * B)``: indices ``[0, C)`` are the one-hot class, index
    ``C`` is the objectness flag and ``C+1 .. C+4`` hold the cell-relative
    ``x, y, width, height`` of the box.
    """

    def __init__(self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None):
        self.annotations = pd.read_csv(csv_file)
        # No trailing commas here: they would silently wrap the values in
        # 1-tuples and break os.path.join / torch.zeros further down.
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        label_path = os.path.join(self.label_dir, self.annotations.iloc[idx, 1])
        boxes = []
        with open(label_path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                class_label, x, y, width, height = (float(value) for value in fields)
                boxes.append([class_label, x, y, width, height])

        img_path = os.path.join(self.img_dir, self.annotations.iloc[idx, 0])
        image = Image.open(img_path)
        # reshape keeps a consistent (N, 5) shape even when the file has no boxes.
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 5)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)
            # Row/column of the cell containing the box centre; clamp so a
            # centre lying exactly on the far edge (x or y == 1.0) stays inside.
            i = min(int(self.S * y), self.S - 1)
            j = min(int(self.S * x), self.S - 1)
            # Centre relative to the cell, size expressed in cell units.
            x_cell, y_cell = self.S * x - j, self.S * y - i
            width_cell, height_cell = width * self.S, height * self.S

            # Only one object per cell: the first box that lands there wins.
            if label_matrix[i, j, self.C] == 0:
                label_matrix[i, j, self.C] = 1
                box_coordinates = torch.tensor([x_cell, y_cell, width_cell, height_cell])
                label_matrix[i, j, self.C + 1:self.C + 5] = box_coordinates
                label_matrix[i, j, class_label] = 1

        return image, label_matrix




<h3>Architecture Config</h3>

In [17]:
# Layer-by-layer description of the convolutional backbone, read in order by
# Yolov1._create_conv_layers. Three kinds of entries are used:
#   tuple  -> one conv block: (kernel_size, filters, stride, padding)
#   string -> ("M") a 2x2 max-pool with stride 2
#   list   -> [conv_tuple_a, conv_tuple_b, num_repeats]: the two conv blocks
#             back to back, repeated num_repeats times
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1)
]

In [18]:
class CNNBlock(nn.Module):
    """Conv2d (no bias) -> BatchNorm2d -> LeakyReLU(0.1).

    Any extra keyword arguments (kernel_size, stride, padding, ...) are
    passed straight through to ``nn.Conv2d``.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            bias=False,
            **kwargs,
        )
        self.batchnorm = nn.BatchNorm2d(num_features=out_channels)
        self.leakyrelu = nn.LeakyReLU(negative_slope=0.1)

    def forward(self, x):
        convolved = self.conv(x)
        normalised = self.batchnorm(convolved)
        return self.leakyrelu(normalised)

In [19]:
class Yolov1(nn.Module):
    """YOLOv1-style network: conv backbone from ``architecture_config`` plus an FC head.

    Parameters:
        in_channels: number of channels of the input images.
        **kwargs: forwarded to ``_create_fcs``; must provide ``split_size``,
            ``num_boxes`` and ``num_classes``.

    ``forward`` returns a tensor of shape ``(batch, S * S * (C + B * 5))``.
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        features = self.darknet(x)
        flattened = torch.flatten(features, start_dim=1)
        return self.fcs(flattened)

    def _create_conv_layers(self, architecture):
        """Translate the architecture spec into an ``nn.Sequential`` of layers."""

        def make_block(channels_in, spec):
            # spec is (kernel_size, filters, stride, padding)
            return CNNBlock(
                channels_in,
                spec[1],
                kernel_size=spec[0],
                stride=spec[2],
                padding=spec[3],
            )

        modules = []
        channels = self.in_channels

        for entry in architecture:
            kind = type(entry)

            if kind is tuple:
                # Single convolution block.
                modules.append(make_block(channels, entry))
                channels = entry[1]

            elif kind is str:
                # Any string entry stands for a 2x2 / stride-2 max-pool.
                modules.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))

            elif kind is list:
                # Pair of convolutions repeated a given number of times.
                first, second, repeats = entry[0], entry[1], entry[2]
                for _ in range(repeats):
                    modules.append(make_block(channels, first))
                    modules.append(make_block(first[1], second))
                    channels = second[1]

        return nn.Sequential(*modules)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """Build the fully connected head producing S*S*(C + B*5) outputs."""
        grid_cells = split_size * split_size
        values_per_cell = num_classes + num_boxes * 5

        # The original paper uses a wider head:
        #   nn.Linear(1024*S*S, 4096),
        #   nn.LeakyReLU(0.1),
        #   nn.Linear(4096, S*S*(B*5+C))
        head = [
            nn.Flatten(),
            nn.Linear(1024 * grid_cells, 496),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, grid_cells * values_per_cell),
        ]
        return nn.Sequential(*head)

In [20]:
def test_network(S=7, B=2, C=20):
  """Smoke test: push a random batch of two 448x448 RGB images through Yolov1 and print the output shape."""
  net = Yolov1(split_size=S, num_boxes=B, num_classes=C)
  batch = torch.randn((2, 3, 448, 448))
  output = net(batch)
  print(output.shape)

In [None]:
# Run the smoke test; with the defaults (S=7, B=2, C=20) the expected output
# shape is torch.Size([2, 1470]), i.e. 7 * 7 * 30 values per image.
test_network()

torch.Size([2, 1470])


<h3>Intersection Over Union</h3>

In [27]:
def intersection_over_union(preds, labels, box_format="midpoint"):
    """
    Calculates the intersection over union between predicted bounding boxes
    and ground-truth bounding boxes.

    Parameters:
        preds: torch.tensor of shape (..., 4) = predicted boxes
        labels: torch.tensor of shape (..., 4) = ground-truth boxes
        box_format: "midpoint" for (x_center, y_center, width, height) or
            "corners" for (x1, y1, x2, y2)
    Returns:
        Intersection over union: torch.tensor of shape (..., 1)
    Raises:
        ValueError: if box_format is neither "midpoint" nor "corners"
    """
    if box_format == "corners":
        # box1 is the predicted box
        box1_x1 = preds[..., 0:1]
        box1_y1 = preds[..., 1:2]
        box1_x2 = preds[..., 2:3]
        box1_y2 = preds[..., 3:4]
        # box2 is the ground truth
        box2_x1 = labels[..., 0:1]
        box2_y1 = labels[..., 1:2]
        box2_x2 = labels[..., 2:3]
        box2_y2 = labels[..., 3:4]

    elif box_format == "midpoint":
        # Convert centre/size to corner coordinates.
        box1_x1 = preds[..., 0:1] - preds[..., 2:3] / 2
        box1_y1 = preds[..., 1:2] - preds[..., 3:4] / 2
        box1_x2 = preds[..., 0:1] + preds[..., 2:3] / 2
        box1_y2 = preds[..., 1:2] + preds[..., 3:4] / 2

        box2_x1 = labels[..., 0:1] - labels[..., 2:3] / 2
        box2_y1 = labels[..., 1:2] - labels[..., 3:4] / 2
        box2_x2 = labels[..., 0:1] + labels[..., 2:3] / 2
        box2_y2 = labels[..., 1:2] + labels[..., 3:4] / 2

    else:
        raise ValueError(
            "box_format must be 'midpoint' or 'corners', got {!r}".format(box_format)
        )

    # Intersection rectangle: the larger of the two top-left corners and the
    # smaller of the two bottom-right corners.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # .clamp(0) handles boxes that do not intersect (negative extent -> 0).
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
    box1_area = torch.abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = torch.abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))
    union = box1_area + box2_area - intersection

    # The epsilon is ADDED so a zero-area union cannot produce a division by
    # zero (subtracting it would make the denominator negative).
    return intersection / (union + 1e-6)

<h3>Non-Max Suppression</h3>

In [31]:
def non_max_supressions(bboxes, iou_threshold, prob_threshold, box_format="corners"):
    """
    Performs non-max suppression on a list of bounding boxes.

    Parameters:
        bboxes: list of boxes, each [class, probability, c1, c2, c3, c4] where
            the four coordinates follow box_format
        iou_threshold: a remaining box of the same class as the chosen box is
            kept only if its IoU with the chosen box is below this value
        prob_threshold: boxes whose probability is not above this value are
            discarded before suppression
        box_format: "corners" or "midpoint", passed to intersection_over_union
    Returns:
        list of the boxes that survive, ordered by decreasing probability
    """
    assert isinstance(bboxes, list)

    # Drop low-confidence boxes, then visit the rest from most to least confident.
    bboxes = [box for box in bboxes if box[1] > prob_threshold]
    bboxes = sorted(bboxes, key=lambda box: box[1], reverse=True)
    bboxes_after_nms = []

    while bboxes:
        chosen_box = bboxes.pop(0)
        chosen_coords = torch.tensor(chosen_box[2:])
        # Keep boxes of a different class, and same-class boxes that do not
        # overlap the chosen box too much.
        bboxes = [
            box
            for box in bboxes
            if box[0] != chosen_box[0]
            or intersection_over_union(
                chosen_coords,
                torch.tensor(box[2:]),
                box_format=box_format,
            ) < iou_threshold
        ]
        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms

In [None]:
class YoloLoss(nn.Module):
    """Sum-of-squared-errors YOLOv1 loss.

    Parameters:
        S: grid size (cells per image side)
        B: boxes predicted per cell; the index layout used in ``forward``
            assumes B == 2
        C: number of classes

    Layout of the last dimension of ``predictions`` (after reshape):
        [0, C)        class scores
        C             confidence of box 1
        C+1 .. C+4    x, y, w, h of box 1
        C+5           confidence of box 2
        C+6 .. C+9    x, y, w, h of box 2
    ``target`` uses the same layout but only the first box slot
    (indices C .. C+4) is read.
    """

    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        self.lambda_noobj = 0.5  # weight of the confidence loss in empty cells
        self.lambda_coord = 5    # weight of the box-coordinate loss

    def forward(self, predictions, target):
        """Return the scalar loss for network output ``predictions`` against ``target``.

        predictions: any shape reshapeable to (N, S, S, C + B*5)
        target: (N, S, S, >= C + 5)
        """
        C = self.C
        # The network emits (N, S*S*(C + B*5)); restore the grid layout.
        predictions = predictions.reshape(-1, self.S, self.S, C + self.B * 5)

        conf1 = slice(C, C + 1)
        box1 = slice(C + 1, C + 5)
        conf2 = slice(C + 5, C + 6)
        box2 = slice(C + 6, C + 10)

        # There is a single target box per cell, so both predicted boxes are
        # compared with the same target slot.
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
        # best_box is 0 where box 1 has the highest IoU, 1 where box 2 does:
        # that box is "responsible" for the prediction.
        _, best_box = torch.max(ious, dim=0)
        # Object indicator, kept as (N, S, S, 1) so it broadcasts over channels.
        exists_box = target[..., conf1]

        # -------------------------------------- #
        #          FOR BOX COORDINATES           #
        # -------------------------------------- #
        box_predictions = exists_box * (
            best_box * predictions[..., box2]
            + (1 - best_box) * predictions[..., box1]
        )
        box_targets = exists_box * target[..., box1]

        # Width/height are compared in sqrt space. Predictions may be
        # negative, so take sqrt of the magnitude and restore the sign; the
        # epsilon keeps the sqrt gradient finite at zero. Built with cat
        # instead of in-place writes to keep autograd straightforward.
        pred_wh = box_predictions[..., 2:4]
        pred_wh = torch.sign(pred_wh) * torch.sqrt(torch.abs(pred_wh + 1e-6))
        box_predictions = torch.cat([box_predictions[..., 0:2], pred_wh], dim=-1)
        box_targets = torch.cat(
            [box_targets[..., 0:2], torch.sqrt(box_targets[..., 2:4])], dim=-1
        )

        # (N, S, S, 4) -> (N*S*S, 4)
        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # -------------------------------------- #
        #  FOR OBJ LOSS: if there is an object   #
        # -------------------------------------- #
        # Confidence of the responsible box.
        pred_box = (
            best_box * predictions[..., conf2]
            + (1 - best_box) * predictions[..., conf1]
        )
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        # -------------------------------------- #
        #  FOR NO OBJ LOSS: if there is no obj   #
        # -------------------------------------- #
        # Both predicted confidences are penalised in empty cells.
        no_obj = 1 - exists_box
        no_obj_loss = self.mse(
            torch.flatten(no_obj * predictions[..., conf1], start_dim=1),
            torch.flatten(no_obj * target[..., conf1], start_dim=1),
        )
        no_obj_loss += self.mse(
            torch.flatten(no_obj * predictions[..., conf2], start_dim=1),
            torch.flatten(no_obj * target[..., conf1], start_dim=1),
        )

        # -------------------------------------- #
        #             FOR CLASS LOSS             #
        # -------------------------------------- #
        # (N, S, S, C) -> (N*S*S, C)
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2),
        )

        loss = (
            self.lambda_coord * box_loss          # box coordinates
            + object_loss                         # confidence where an object exists
            + self.lambda_noobj * no_obj_loss     # confidence where no object exists
            + class_loss                          # class scores
        )

        return loss