<a href="https://colab.research.google.com/github/YeongRoYun/BearTeam/blob/edu/edu/model/YOLOv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://github.com/csm-kr/yolo_v2_vgg16_pytorch/blob/

In [10]:
!pip install xmltodict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [49]:
from math import ceil
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import xmltodict
from PIL import Image
from torchvision.datasets import VOCDetection
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import PILToTensor, Compose

In [None]:
class YOLO_PASCAL_VOC(VOCDetection):
    def __getitem__(self, index):
        img = Image.open(self.images[index]).convert('RGB')
        
        # img = img.resize((224,224))
        img_transform = Compose([
            PILToTensor(),
            # Resize((224,224))
        ])
        img = torch.divide(img_transform(img), 255)
        
        target = xmltodict.parse(open(self.annotations[index], mode='rb'))
        
        classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
                   "bus", "car", "cat", "chair", "cow", "diningtable",
                   "dog", "horse", "motorbike", "person", "pottedplant",
                   "sheep", "sofa", "train", "tvmonitor"]
        
        label = torch.zeros(7, 7, 25, dtype = torch.float32)
       
        # Grid 만들기
        Image_Height = float(target['annotation']['size']['height'])
        Image_Width = float(target['annotation']['size']['width'])

        try:
            for i, obj in enumerate(target['annotation']['object']):
                self.parse(obj, classes, Image_Width, Image_Height, label)
    
        # Single-Object in Image
        except TypeError:
            obj = target['annotation']['object']
            self.parse(obj, classes, Image_Width, Image_Height, label)
        return img, torch.tensor(label)
    
    def parse(self, obj, classes, Image_Width, Image_Height, label):
        class_index = classes.index(obj['name'].lower())
        
        x_min = float(obj['bndbox']['xmin']) 
        y_min = float(obj['bndbox']['ymin'])
        x_max = float(obj['bndbox']['xmax']) 
        y_max = float(obj['bndbox']['ymax'])
        
        x = (x_min + x_max) / 2.0
        y = (y_min + y_max) / 2.0
        w = x_max - x_min + 1
        h = y_max - y_min + 1
        
        # 13 x 13 Grid!
        cell_w = ceil(w / 13) # 마지막 셀의 크기는 다를 수 있다.
        cell_h = ceil(h / 13)

        cell_w_last = cell_w if w % 13 == 0 else w - (12 * cell_w)
        cell_h_last = cell_h if h % 13 == 0 else h - (12 * cell_h)
        

        x_cell = int(x/cell_w) # int()는 floor와 동일하다!
        y_cell = int(y/cell_h)
        
        x_val_inCell = float((x - x_cell * cell_w))
        y_val_inCell = float((y - y_cell * cell_h))
        
        x_val_inCell = x_val_inCell / cell_w if x_cell < 12 \
                    else x_val_inCell / cell_w_last
        y_val_inCell = y_val_inCell / cell_h if y_cell < 12 \
                    else y_val_inCell / cell_w_last
        
        # Bounding Box의 width와 height를 [0,1] 사이의 값으로 정규화 한다.
        w = w / Image_Width
        h = h / Image_Height
        
        # [x, y, w, h, c] 이후에 Class one-hot encoding이 있으므로, Offset을 5 준다.
        class_index_inCell = class_index + 5
        
        # 한 Cell에서 Bounding Box는 하나만 가진다
        label[y_cell][x_cell][0] = x_val_inCell
        label[y_cell][x_cell][1] = y_val_inCell
        label[y_cell][x_cell][2] = w
        label[y_cell][x_cell][3] = h
        # Object가 있는 것이 확실하므로 Confidence = 1.0이다.
        label[y_cell][x_cell][4] = 1.0
        
        # 바운딩 박스의 중심점이 같은 셀에 위치하면, 이미지 자체를 복사해서 또다른 데이터로 만들어야 한다!
        # 전처리 단계에서 이러한 경우는 제거된다고 가정한다.
        label[y_cell][x_cell][class_index_inCell] = 1.0
        return None

In [48]:
class YOLOv2(nn.Module):
    def __init__(self, backbone, num_classes = 20):
        """
        여기에서는 임의로 backbone을 VGG16을 유지한당
        마지막 FC Layer를 포함한 온전한 VGG를 backbone으로 입력하기!
        """
        super().__init__()
        self.num_anchors = 5
        self.num_classes = num_classes
        # Backbone의 Parameters 고정! => 안할 수도 있다.
        for feature in backbone.features[:-1]:
            if type(feature) == type(nn.Conv2d(1, 1, 1)):
                feature.requires_grad = False
            else:
                continue
        self.backbone = backbone.features[:-1] 
        # Extra의 Output = [B, 512, ]
        self.extra = nn.Sequential(
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(inplace = True), # Memory 절약!
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True), 
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((2, 2)),
        )
        self.skip_module = nn.Sequential(
            # 1 x 1 conv => channel만 변화!
            nn.Conv2d(512, 64, 1, stride = 1, padding=0),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(inplace=True),
        )

        self.final = nn.Sequential(
            nn.Conv2d(768, 1024, 3, padding=1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(inplace=True),
            nn.Conv2d(1024, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(inplace=True),
            # 1 x 1 Convolution!
            nn.Conv2d(256, self.num_anchors * (5 + self.num_classes), 1)
        )
        self.init_conv2d()
        print(f"num_params : {self.count_parameters()}")

    def init_conv2d(self):
        mean = 0.0
        std = 0.01
        constant = 0.0
        for c in self.extra.children():
            if isinstance(c, nn.Conv2d):
                nn.init.normal_(c.weight, mean, std)
                nn.init.constant_(c.bias, constant)
        for c in self.skip_module.children():
            if isinstance(c, nn.Conv2d):
                nn.init.normal_(c.weight, mean, std)
                nn.init.constant_(c.bias, constant)
        for c in self.final.children():
            if isinstance(c, nn.Conv2d):
                nn.init.normal_(c.weight, mean, std)
                nn.init.constant_(c.bias, constant)
        return None

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
    
    def forward(self, x):
        # X : [B, C, H, W] 형태
        
        x = self.backbone(x) # O: B x 512 x H1 x W1 (H, W는 변한다!)
        skip_x = self.skip_module(x) # O: 64 x H1 x W1

        # skip_x 를 B x 64 x 26 x 26으로 맞춘다.
        h = skip_x.size(2)
        w = skip_x.size(3)

        stride = 1
        padding = [0, 0] # h, w
        padding[0] = 0 if h - 25 > 0 else ceil(12.5 - 0.5 * h)
        padding[1] = 0 if w - 25 > 0 else ceil(12.5 - 0.5 * w)

        kernel = [0, 0] # h, w
        kernel[0] = h - 25 if h - 25 > 0 else 2 * padding[0] + h - 25
        kernel[1] = w - 25 if w - 25 > 0 else 2 * padding[1] + w - 25

        skip_x = nn.MaxPool2d(kernel, stride, padding, )(skip_x) # B x 64 x 26 x 26!
        print(skip_x.shape)
        skip_x = skip_x.view(-1, 64, 13, 2, 13, 2)
        skip_x = skip_x.permute(0, 3, 5, 1, 2, 4).contiguous() # B x 2 x 2 x 64 x 13 x 13으로!
        skip_x = skip_x.view(-1, 256, 13, 13) # B x 256 x 13 x 13!!

        x = self.extra(x) # B x 512 x H2 x W2
        # B x 512 x 13 x 13으로 맞추기!
        h = x.size(2)
        w = x.size(3)

        stride = 1
        padding = [0, 0] # h, w
        padding[0] = 0 if h - 12 > 0 else ceil(6 - 0.5 * h)
        padding[1] = 0 if w - 12 > 0 else ceil(6 - 0.5 * w)

        kernel = [0, 0] # h, w
        kernel[0] = h - 12 if h - 12 > 0 else 2 * padding[0] + h - 12
        kernel[1] = w - 12 if w - 12 > 0 else 2 * padding[1] + w - 12

        x = nn.MaxPool2d(kernel, stride, padding, )(x) # B x 512 x 13 x 13!
        print(x.shape)
        # Concat
        x = torch.cat([x, skip_x], dim=1) # B x 768 x 13 x 13
        x = self.final(x)
        return x

In [46]:
class YOLOv2Loss(nn.Module):
    """
    YOLOv1과 동일하게 구한당!!
    단치 BBox = 5가 된것!!!
    """
    pass

In [47]:
# Test
from torchvision.models import vgg16_bn

model = YOLOv2(vgg16_bn(weights='DEFAULT'))
image = torch.randn([1, 3, 1920, 1200])
y = model(image)

num_params : 31311741
torch.Size([1, 64, 26, 26])
torch.Size([1, 512, 13, 13])
