<a href="https://colab.research.google.com/github/chang-heekim/Implementation_Deep_Learning_Paper/blob/main/YOLO(You%20Only%20Look%20Once)/YOLO_V1_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Necessary Libraries

In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import VOCDetection

from PIL import Image
import numpy as np
import xmltodict
from tqdm import tqdm

# Define Custom Dataset Class

In [None]:
class YOLO_VOC_DATASET(VOCDetection):
    """Pascal VOC detection dataset that yields YOLO v1 training pairs.

    Every item is a tuple ``(img, label)``:

    * ``img``   -- float tensor of shape ``(3, 448, 448)`` scaled to ``[0, 1]``.
    * ``label`` -- float64 tensor of shape ``(7, 7, 25)``. For the grid cell
      that contains an object's centre the 25 channels hold
      ``[x, y, w, h, objectness, one-hot class (20)]`` where ``x``/``y`` are
      the centre offsets inside the cell (in cell units) and ``w``/``h`` are
      the box size divided by the image size. All other cells stay zero.
    """

    CLASSES = ("aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow", "diningtable",
               "dog", "horse", "motorbike", "person", "pottedplant",
               "sheep", "sofa", "train", "tvmonitor")
    IMAGE_SIZE = 448
    GRID_SIZE = 7

    def __getitem__(self, index):
        """Load sample ``index`` and return ``(img, label)`` as described above.

        Raises:
            ValueError: if an object's class name is not one of ``CLASSES``.
        """
        size = self.IMAGE_SIZE

        # The PIL resize already produces the final resolution, so no second
        # tensor-level resize is needed afterwards.
        with Image.open(self.images[index]) as raw:
            img = raw.convert('RGB').resize((size, size))
        img = torch.divide(transforms.PILToTensor()(img), 255)

        # Close the annotation file deterministically instead of leaking it.
        with open(self.annotations[index]) as xml_file:
            annotation = xmltodict.parse(xml_file.read())['annotation']

        # xmltodict returns a single mapping when the image has exactly one
        # <object> and a list otherwise; normalise to a list so that one code
        # path handles both cases.
        objects = annotation.get('object', [])
        if isinstance(objects, dict):
            objects = [objects]

        width = float(annotation['size']['width'])
        height = float(annotation['size']['height'])

        label = self._encode_label(objects, width, height)
        return img, torch.tensor(label)

    def _encode_label(self, objects, width, height):
        """Encode parsed VOC ``<object>`` entries into a (7, 7, 25) array."""
        size = float(self.IMAGE_SIZE)
        grid = self.GRID_SIZE
        cell = size / grid  # 64 pixels per cell for a 448 image and 7x7 grid

        label = np.zeros((grid, grid, 5 + len(self.CLASSES)), dtype=float)

        for obj in objects:
            class_id = self.CLASSES.index(obj['name'].lower())
            box = obj['bndbox']

            # Rescale the pixel box from the original image to 448x448.
            x_min = float(box['xmin']) / width * size
            x_max = float(box['xmax']) / width * size
            y_min = float(box['ymin']) / height * size
            y_max = float(box['ymax']) / height * size

            center_x = (x_min + x_max) / 2.
            center_y = (y_min + y_max) / 2.

            # A centre lying exactly on the far image edge would index cell 7;
            # clamp it into the last valid cell.
            x_cell = min(int(center_x / cell), grid - 1)
            y_cell = min(int(center_y / cell), grid - 1)

            # When several objects share a cell the last one wins. Clear the
            # cell first so the class vector stays one-hot and consistent
            # with the stored box.
            label[y_cell, x_cell, :] = 0.0
            label[y_cell, x_cell, 0] = (center_x - x_cell * cell) / cell
            label[y_cell, x_cell, 1] = (center_y - y_cell * cell) / cell
            label[y_cell, x_cell, 2] = (x_max - x_min) / size
            label[y_cell, x_cell, 3] = (y_max - y_min) / size
            label[y_cell, x_cell, 4] = 1.0
            label[y_cell, x_cell, 5 + class_id] = 1.0

        return label

# Define YOLO v1

In [None]:
class YOLO_V1(nn.Module):
    """YOLO v1 detector: 24 convolutional layers followed by 2 linear layers.

    ``forward`` maps a ``(N, 3, 448, 448)`` batch to a ``(N, 7, 7, 30)``
    tensor of per-cell predictions.

    Every convolution is followed by ``LeakyReLU(0.1)``. The 512 -> 1024
    convolution before the fourth max-pool previously had no activation;
    adding it shifts the ``nn.Sequential`` indices (and therefore the
    ``state_dict`` keys) of all later layers in ``self.conv``.
    """

    # One entry per layer of the feature extractor, in order:
    #   (out_channels, kernel_size, stride, padding) -> Conv2d + LeakyReLU(0.1)
    #   'M'                                          -> MaxPool2d(2, 2)
    _LAYOUT = (
        # padding=4 gives a 225x225 map for a 448 input; the pooling layer
        # floors it to 112x112.
        (64, 7, 2, 4), 'M',
        (192, 3, 1, 1), 'M',
        (128, 1, 1, 0), (256, 3, 1, 1), (256, 1, 1, 0), (512, 3, 1, 1), 'M',
        (256, 1, 1, 0), (512, 3, 1, 1),
        (256, 1, 1, 0), (512, 3, 1, 1),
        (256, 1, 1, 0), (512, 3, 1, 1),
        (256, 1, 1, 0), (512, 3, 1, 1),
        (512, 1, 1, 0), (1024, 3, 1, 1), 'M',
        (512, 1, 1, 0), (1024, 3, 1, 1),
        (512, 1, 1, 0), (1024, 3, 1, 1),
        (1024, 3, 1, 1), (1024, 3, 2, 1),
        (1024, 3, 1, 1), (1024, 3, 1, 1),
    )

    def __init__(self):
        super().__init__()

        # Build a flat Sequential so layer indices stay simple integers.
        layers = []
        in_channels = 3
        for spec in self._LAYOUT:
            if spec == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            out_channels, kernel_size, stride, padding = spec
            layers.append(nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                    kernel_size=kernel_size, stride=stride, padding=padding))
            layers.append(nn.LeakyReLU(0.1, inplace=True))
            in_channels = out_channels
        self.conv = nn.Sequential(*layers)

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=7*7*1024, out_features=4096),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(in_features=4096, out_features=7*7*30)
        )

        # Small-variance Gaussian initialisation for all conv / linear weights
        # (biases keep PyTorch's default initialisation).
        for m in self.conv.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, mean=0, std=0.01)

        for m in self.fc.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0, std=0.01)

    def forward(self, x):
        """Return raw predictions of shape ``(N, 7, 7, 30)`` for input ``x``."""
        out = self.conv(x)
        out = self.fc(out)
        out = torch.reshape(out, (-1, 7, 7, 30))
        return out

# Define Multi-task Loss

In [None]:
def _smoothed_iou(boxes, gt, smooth):
    """Score ``boxes`` against ``gt`` (both ``(..., 4)`` as ``x, y, w, h``).

    Corners are ``centre -/+ 0.5 * size``. When the boxes overlap, the
    intersection is ``(overlap_w + smooth) * (overlap_h + smooth)``; the score
    is ``intersection / (union + smooth)``.
    """
    area = boxes[..., 2] * boxes[..., 3]
    gt_area = gt[..., 2] * gt[..., 3]

    lower = torch.maximum(boxes[..., :2] - 0.5 * boxes[..., 2:4],
                          gt[..., :2] - 0.5 * gt[..., 2:4])
    upper = torch.minimum(boxes[..., :2] + 0.5 * boxes[..., 2:4],
                          gt[..., :2] + 0.5 * gt[..., 2:4])
    side = upper - lower

    overlaps = (side >= 0).all(dim=-1)
    intersection = torch.where(overlaps, (side + smooth).prod(dim=-1),
                               torch.zeros_like(area))
    union = area + gt_area - intersection
    return intersection / (union + smooth)


def multitask_loss(preds, targets, smooth=1, lambda_coord=5.0, lambda_noobj=0.5):
    """YOLO v1 style sum-of-squares loss, averaged over the batch.

    The loss is computed entirely with torch operations on ``preds`` so that
    ``loss.backward()`` propagates gradients into the model. (Detaching the
    predictions, or routing them through NumPy, silently disables training.)

    Args:
        preds: tensor reshaped to ``(N, cells, 30)``; per cell
            ``[box1 (x, y, w, h), conf1, box2 (x, y, w, h), conf2, 20 classes]``.
        targets: tensor reshaped to ``(N, cells, 25)``; per cell
            ``[x, y, w, h, confidence, 20 classes]``. A cell whose four box
            values are all zero is treated as containing no object.
        smooth: additive constant used in the box-matching score.
        lambda_coord: weight of the localisation terms.
        lambda_noobj: weight of the no-object confidence term.

    Returns:
        Scalar tensor: per-sample loss summed over cells, averaged over N.

    NOTE(review): the formula itself is kept as it was, including points that
    differ from the YOLO paper and may deserve a follow-up: the box-matching
    score adds ``smooth`` to the overlap sides, only the non-selected box's
    confidence is penalised in empty cells, and a negative predicted
    width/height contributes no size error.
    """
    batch = targets.shape[0]
    preds = preds.reshape(batch, -1, 30)
    targets = targets.reshape(batch, -1, 25).to(device=preds.device, dtype=preds.dtype)

    box_1, conf_1 = preds[..., 0:4], preds[..., 4]
    box_2, conf_2 = preds[..., 5:9], preds[..., 9]
    pred_class = preds[..., 10:]

    target_box = targets[..., 0:4]
    target_conf = targets[..., 4]
    target_class = targets[..., 5:]

    # Choosing the responsible box and the object mask are discrete decisions;
    # they are constants with respect to the gradient.
    with torch.no_grad():
        use_first = (_smoothed_iou(box_1, target_box, smooth)
                     >= _smoothed_iou(box_2, target_box, smooth))
        obj_exist = (target_box != 0).any(dim=-1).to(preds.dtype)

    final_box = torch.where(use_first.unsqueeze(-1), box_1, box_2)
    final_conf = torch.where(use_first, conf_1, conf_2)
    low_conf = torch.where(use_first, conf_2, conf_1)

    # Centre error.
    loc_error_xy = ((target_box[..., :2] - final_box[..., :2]) ** 2).sum(dim=-1)

    # Size error on square roots. Negative predictions are skipped (zero
    # error); the clamp keeps sqrt's gradient finite at zero and the
    # substituted 1.0 keeps the masked branch free of NaNs in backward.
    pred_wh = final_box[..., 2:4]
    usable = pred_wh >= 0
    safe_wh = torch.where(usable, pred_wh, torch.ones_like(pred_wh)).clamp_min(1e-6)
    target_wh_sqrt = target_box[..., 2:4].clamp_min(0).sqrt()
    loc_error_wh = torch.where(usable, (target_wh_sqrt - safe_wh.sqrt()) ** 2,
                               torch.zeros_like(pred_wh)).sum(dim=-1)

    loc_error = lambda_coord * obj_exist * (loc_error_xy + loc_error_wh)

    confidence_error = obj_exist * (target_conf - final_conf) ** 2
    non_confidence_error = lambda_noobj * (1 - obj_exist) * (target_conf - low_conf) ** 2

    classification_error = obj_exist * ((target_class - pred_class) ** 2).sum(dim=-1)

    per_cell = loc_error + confidence_error + non_confidence_error + classification_error
    return per_cell.sum() / batch

# Set Device & Build Model

In [None]:
from torchsummary import summary

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the network, move it to the chosen device and print a per-layer
# summary for a single 3x448x448 input.
model = YOLO_V1()
model = model.to(device)
summary(model, (3, 448, 448))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 225, 225]           9,472
         LeakyReLU-2         [-1, 64, 225, 225]               0
         MaxPool2d-3         [-1, 64, 112, 112]               0
            Conv2d-4        [-1, 192, 112, 112]         110,784
         LeakyReLU-5        [-1, 192, 112, 112]               0
         MaxPool2d-6          [-1, 192, 56, 56]               0
            Conv2d-7          [-1, 128, 56, 56]          24,704
         LeakyReLU-8          [-1, 128, 56, 56]               0
            Conv2d-9          [-1, 256, 56, 56]         295,168
        LeakyReLU-10          [-1, 256, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          65,792
        LeakyReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 512, 56, 56]       1,180,160
        LeakyReLU-14          [-1, 512,

# Load VOC Dataset

In [None]:
data_root = './data'
train_dataset = YOLO_VOC_DATASET(data_root, image_set='train', download=True)
# Validate on the held-out 'val' split: 'trainval' also contains every
# training image, so a loss measured on it says nothing about generalisation.
# The archive was already fetched and extracted by the line above, so it is
# not downloaded/extracted a second time.
val_dataset = YOLO_VOC_DATASET(data_root, image_set='val', download=False)

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to ./data/VOCtrainval_11-May-2012.tar


  0%|          | 0/1999639040 [00:00<?, ?it/s]

Extracting ./data/VOCtrainval_11-May-2012.tar to ./data
Using downloaded and verified file: ./data/VOCtrainval_11-May-2012.tar
Extracting ./data/VOCtrainval_11-May-2012.tar to ./data


In [None]:
# Optimisation hyper-parameters.
batch_size, epochs = 64, 135
lr, momentum, weight_decay = 0.001, 0.9, 0.0005

# Training batches are shuffled and the final incomplete batch is dropped;
# validation keeps dataset order and every sample.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# SGD with momentum and weight decay over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)

# Define train & validation function

In [None]:
def training(model, loss_fn, optimizer, data_loader, epoch) :
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: network to optimise; switched to training mode here.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar
            tensor that supports ``backward()``.
        optimizer: optimiser holding ``model``'s parameters.
        data_loader: iterable of ``(inputs, labels)`` batches with ``len()``.
        epoch: current epoch number, used only for logging.

    Reads the module-level ``device``, ``epochs`` and ``batch_size``.
    """
    # Make sure dropout is active even if the model was left in eval mode by
    # an earlier evaluation step.
    model.train()

    print(f'Training [{epoch}/{epochs}]')
    for step, (inputs, labels) in enumerate(data_loader, start=1):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Progress line every 20 steps.
        if step % 20 == 0:
            print(f' [{step * batch_size}/{batch_size * len(data_loader)}] Step:{step}   Loss: {loss.item()}')

def validation(model, loss_fn, data_loader, epoch) :
    """Evaluate ``model`` on ``data_loader`` and print the mean loss.

    The model is put in eval mode (dropout off) and gradients are disabled
    for the duration of the pass; the previous train/eval mode is restored
    afterwards so a following training step is unaffected.

    Args:
        model: network to evaluate.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        data_loader: iterable of ``(inputs, labels)`` batches.
        epoch: current epoch number, used only for logging.

    Returns:
        Sample-weighted mean loss over the whole loader as a float
        (``nan`` when the loader yields no batches).

    Reads the module-level ``device`` and ``epochs``.
    """
    was_training = model.training
    model.eval()

    loss_sum = 0.0
    sample_count = 0
    try:
        with torch.no_grad():
            for inputs, labels in data_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = loss_fn(outputs, labels)

                # loss_fn is expected to return a per-batch mean; weight it by
                # the batch size so a short final batch is not over-counted.
                n_samples = inputs.shape[0]
                loss_sum += loss.item() * n_samples
                sample_count += n_samples
    finally:
        model.train(was_training)

    mean_loss = loss_sum / sample_count if sample_count else float('nan')
    print(f'Validation [{epoch}/{epochs}] Loss: {mean_loss}')
    return mean_loss

# Training

In [None]:
# Per-epoch schedule for the first parameter group:
#   epochs 1..74    linear ramp starting from 0.001 with slope 0.009 / 75
#   epochs 75..104  0.001
#   epochs 105..    0.0001
for epoch in range(1, epochs + 1):
    if epoch < 75:
        current_lr = 0.001 + 0.009 * (float(epoch)/(75.0))
    elif epoch < 105:
        current_lr = 0.001
    else:
        current_lr = 0.0001
    optimizer.param_groups[0]['lr'] = current_lr

    training(model, multitask_loss, optimizer, train_loader, epoch)
    validation(model, multitask_loss, val_loader, epoch)

Training [1/135]




 [1280/5696] Step:20   Loss: 18.89099283512913
 [2560/5696] Step:40   Loss: 14.496629728684182
 [3840/5696] Step:60   Loss: 18.948596844629254
 [5120/5696] Step:80   Loss: 16.694833914703324
Validation [1/135] Loss: 20.75534489247705
Training [2/135]
 [1280/5696] Step:20   Loss: 16.71188756645216
 [2560/5696] Step:40   Loss: 16.044616596558274
 [3840/5696] Step:60   Loss: 15.345384036222223
 [5120/5696] Step:80   Loss: 15.975549077312921
Validation [2/135] Loss: 21.18456407516982
Training [3/135]
 [1280/5696] Step:20   Loss: 16.074508639104053
 [2560/5696] Step:40   Loss: 17.848081999610137
 [3840/5696] Step:60   Loss: 17.15362911951697
 [5120/5696] Step:80   Loss: 15.9376174471791
Validation [3/135] Loss: 21.09576307200568
Training [4/135]
 [1280/5696] Step:20   Loss: 18.55313619196987
 [2560/5696] Step:40   Loss: 17.431287375374648
 [3840/5696] Step:60   Loss: 16.111867120298527
 [5120/5696] Step:80   Loss: 16.876841488952703
Validation [4/135] Loss: 21.033716038258433
Training [5/13