In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.datasets import VOCDetection

import xmltodict
from PIL import Image
import numpy as np
from tqdm import tqdm

In [51]:
class YOLO_PASCAL_VOC(VOCDetection):
    """PASCAL VOC detection dataset that yields YOLO-style training pairs.

    Every item is a tuple ``(image, label)``:

    * ``image`` -- the RGB picture resized to 224x224, as a ``(3, 224, 224)``
      tensor with values scaled to ``[0, 1]``.
    * ``label`` -- a ``(7, 7, 25)`` float64 tensor.  The 224x224 image is split
      into a 7x7 grid of 32-pixel cells; the cell containing the object's
      centre stores ``[x, y, w, h, confidence, one-hot class (20)]`` where
      ``x``/``y`` are the centre offsets inside the cell (0..1) and ``w``/``h``
      are the box size relative to the whole image (0..1).  All other cells
      stay zero.

    Only the first annotated object of an image is encoded; any further
    objects in the annotation are ignored.
    """

    # Class names in the order used for the one-hot part of the label.
    CLASSES = ("aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow", "diningtable",
               "dog", "horse", "motorbike", "person", "pottedplant",
               "sheep", "sofa", "train", "tvmonitor")

    IMAGE_SIZE = 224  # side length (pixels) every image is resized to
    GRID_SIZE = 7     # number of grid cells along each image axis
    CELL_DEPTH = 25   # x, y, w, h, confidence + 20 one-hot class entries

    def __getitem__(self, index):
        """Load and preprocess one sample.

        Args:
            index: position of the sample in ``self.images`` /
                ``self.annotations``.

        Returns:
            ``(image, label)`` as described in the class docstring.

        Raises:
            ValueError: if the annotated class name is not in ``CLASSES``.
        """
        size = self.IMAGE_SIZE

        # Close the underlying file as soon as the pixels have been copied.
        with Image.open(self.images[index]) as raw_image:
            img = raw_image.convert('RGB').resize((size, size))
        img_transform = transforms.Compose([
            transforms.PILToTensor(),
            transforms.Resize((size, size))
        ])
        img = torch.divide(img_transform(img), 255)

        # Read the XML as bytes so the parser honours the encoding declared
        # in the document, and make sure the file handle is released.
        with open(self.annotations[index], 'rb') as annotation_file:
            target = xmltodict.parse(annotation_file.read())

        annotation = target['annotation']
        image_height = float(annotation['size']['height'])
        image_width = float(annotation['size']['width'])

        # Grid-structured label, initially empty.
        label = np.zeros((self.GRID_SIZE, self.GRID_SIZE, self.CELL_DEPTH),
                         dtype=float)

        # xmltodict yields a list for repeated <object> elements but a single
        # mapping when there is only one; normalise both to a list.
        objects = annotation.get('object')
        if objects is None:
            objects = []
        elif not isinstance(objects, list):
            objects = [objects]

        # Only the first object is written into the label.
        for obj in objects[:1]:
            self._encode_object(label, obj, image_width, image_height)

        return img, torch.tensor(label)

    def _encode_object(self, label, obj, image_width, image_height):
        """Write one annotated object into the grid cell holding its centre."""
        size = float(self.IMAGE_SIZE)
        cell_size = size / self.GRID_SIZE  # 32 pixels per cell
        last_cell = self.GRID_SIZE - 1

        class_index = self.CLASSES.index(obj['name'].lower())

        # Rescale the box from original-image pixels to the resized image.
        box = obj['bndbox']
        x_min = float((size / image_width) * float(box['xmin']))
        y_min = float((size / image_height) * float(box['ymin']))
        x_max = float((size / image_width) * float(box['xmax']))
        y_max = float((size / image_height) * float(box['ymax']))

        x = (x_min + x_max) / 2.0
        y = (y_min + y_max) / 2.0
        w = x_max - x_min
        h = y_max - y_min

        # Clamp so a centre lying exactly on the far image edge still maps
        # to the last cell instead of indexing past the grid.
        x_cell = min(max(int(x / cell_size), 0), last_cell)
        y_cell = min(max(int(y / cell_size), 0), last_cell)

        cell = label[y_cell, x_cell]
        cell[0] = (x - x_cell * cell_size) / cell_size  # x offset inside cell
        cell[1] = (y - y_cell * cell_size) / cell_size  # y offset inside cell
        cell[2] = w / size                              # width / image width
        cell[3] = h / size                              # height / image height
        cell[4] = 1.0                                   # object present
        cell[5 + class_index] = 1.0                     # one-hot class
            

In [52]:
class YOLO(nn.Module):
    """YOLO-style detection head stacked on a convolutional backbone.

    The backbone must emit 512-channel feature maps.  The extra convolution
    stack keeps the spatial size except for one 2x max-pool, and its flattened
    result must contain ``7 * 7 * 1024`` values, so the backbone output is
    expected to be 14x14 (presumably VGG16 features without the final
    max-pool for 224x224 inputs -- confirm against the caller).

    The head predicts ``GRID_SIZE * GRID_SIZE * (5 * NUM_BOXES + NUM_CLASSES)``
    = 1470 values per image, returned as a ``(batch, 7, 7, 30)`` tensor.
    """

    GRID_SIZE = 7     # cells per image axis
    NUM_BOXES = 2     # boxes predicted per cell, 5 values each
    NUM_CLASSES = 20  # class scores predicted per cell

    def __init__(self, VGG16):
        """Build the head.

        Args:
            VGG16: feature-extractor module used as the backbone.
        """
        super().__init__()

        grid = self.GRID_SIZE
        # Values predicted for every grid cell (30 with the defaults).
        self.cell_depth = 5 * self.NUM_BOXES + self.NUM_CLASSES
        flat_features = grid * grid * 1024

        self.backbone = VGG16

        self.conv = nn.Sequential(
            nn.BatchNorm2d(512), # in_channels
            nn.Conv2d(in_channels = 512,out_channels = 1024, kernel_size = 3, padding = 1),
            nn.LeakyReLU(),
            nn.BatchNorm2d(1024),
            nn.Conv2d(in_channels = 1024,out_channels = 1024, kernel_size = 3, padding = 1),
            nn.LeakyReLU(),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(1024),
            nn.Conv2d(in_channels = 1024,out_channels = 1024, kernel_size = 3, padding = 1),
            nn.LeakyReLU(),
            nn.BatchNorm2d(1024),
            nn.Conv2d(in_channels = 1024,out_channels = 1024, kernel_size = 3, padding = 1),
            nn.LeakyReLU(),
            nn.Flatten()
        )
        self.linear = nn.Sequential(
            nn.BatchNorm1d(flat_features),
            nn.Linear(flat_features, 4096),
            nn.LeakyReLU(),
            nn.Dropout(),
            nn.BatchNorm1d(4096),
            nn.Linear(4096, grid * grid * self.cell_depth)
        )

    def forward(self, x):
        """Run the backbone and head.

        Args:
            x: batch of input images accepted by the backbone.

        Returns:
            Tensor of shape ``(batch, GRID_SIZE, GRID_SIZE, cell_depth)``.
        """
        out = self.backbone(x)
        out = self.conv(out)
        out = self.linear(out)
        # The last dimension must match the width of the final Linear layer
        # divided by the number of cells; reshaping 1470 values into
        # 25-channel cells only "worked" for batch sizes that are multiples
        # of 5 and then silently produced the wrong batch dimension.
        out = torch.reshape(out, (-1, self.GRID_SIZE, self.GRID_SIZE, self.cell_depth))
        return out

In [48]:
def _signed_sqrt(values, eps):
    """Square root that tolerates negative inputs by keeping their sign."""
    return torch.sign(values) * torch.sqrt(torch.abs(values) + eps)


def _yolo_box_iou(boxes_a, boxes_b, grid, eps):
    """IoU of (x, y, w, h) boxes that live in the same grid cell.

    ``x``/``y`` are offsets inside the cell and ``w``/``h`` are fractions of
    the image, so the offsets are divided by ``grid`` to share one unit.
    """
    def corners(boxes):
        cx = boxes[..., 0] / grid
        cy = boxes[..., 1] / grid
        half_w = boxes[..., 2].abs() / 2.0
        half_h = boxes[..., 3].abs() / 2.0
        return cx - half_w, cy - half_h, cx + half_w, cy + half_h

    ax1, ay1, ax2, ay2 = corners(boxes_a)
    bx1, by1, bx2, by2 = corners(boxes_b)

    inter_w = (torch.min(ax2, bx2) - torch.max(ax1, bx1)).clamp(min=0)
    inter_h = (torch.min(ay2, by2) - torch.max(ay1, by1)).clamp(min=0)
    intersection = inter_w * inter_h
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - intersection
    return intersection / union.clamp(min=eps)


def yolo_multitask_loss(y_pred, y_true, lambda_coord=5.0, lambda_noobj=0.5):
    """YOLO v1 sum-squared multi-task loss, averaged over the batch.

    Args:
        y_pred: predictions of shape ``(batch, S, S, 5 * B + C)``.  Assumed
            layout per cell: ``B`` consecutive ``(x, y, w, h, confidence)``
            groups followed by ``C`` class scores -- TODO confirm against the
            model head.
        y_true: targets of shape ``(batch, S, S, 5 + C)`` laid out per cell as
            ``(x, y, w, h, confidence, one-hot class)``; ``confidence`` is 1
            in cells that contain an object and 0 elsewhere.
        lambda_coord: weight of the box-coordinate terms.
        lambda_noobj: weight of the confidence term for boxes that are not
            responsible for any object.

    Returns:
        Scalar tensor holding the mean loss per sample.  Gradients flow
        through ``y_pred`` only.

    Raises:
        ValueError: if the two tensors are not 4-D, disagree on the batch/grid
            dimensions, or have incompatible channel counts.
    """
    if y_pred.dim() != 4 or y_true.dim() != 4 or y_pred.shape[:-1] != y_true.shape[:-1]:
        raise ValueError(
            "y_pred and y_true must be 4-D with matching batch and grid dimensions")

    num_classes = y_true.shape[-1] - 5
    box_channels = y_pred.shape[-1] - num_classes
    if num_classes < 0 or box_channels < 5 or box_channels % 5 != 0:
        raise ValueError("channel counts of y_pred and y_true are incompatible")
    num_boxes = box_channels // 5

    count = len(y_true)
    if count == 0:
        # Nothing to average over; keep the result attached to the graph.
        return y_pred.sum() * 0.0

    grid = y_true.shape[-2]
    eps = 1e-6

    # Targets are constants: no gradient is needed for them.
    target = y_true.detach().to(y_pred.dtype)
    true_box = target[..., :4].unsqueeze(-2)   # (batch, S, S, 1, 4)
    obj_mask = target[..., 4]                  # (batch, S, S)
    true_classes = target[..., 5:]

    pred_boxes = y_pred[..., :box_channels].reshape(*y_pred.shape[:-1], num_boxes, 5)
    pred_xywh = pred_boxes[..., :4]            # (batch, S, S, B, 4)
    pred_conf = pred_boxes[..., 4]             # (batch, S, S, B)
    pred_classes = y_pred[..., box_channels:]

    # In every object cell the predicted box with the highest IoU is made
    # responsible for that object; the selection itself is not differentiated.
    with torch.no_grad():
        ious = _yolo_box_iou(pred_xywh, true_box, grid, eps)
        best = ious.argmax(dim=-1, keepdim=True)
        responsible = torch.zeros_like(ious).scatter_(-1, best, 1.0)
        responsible = responsible * obj_mask.unsqueeze(-1)

    center_err = ((pred_xywh[..., :2] - true_box[..., :2]) ** 2).sum(dim=-1)
    # Square roots make an error on a small box count more than on a big one.
    size_err = ((_signed_sqrt(pred_xywh[..., 2:], eps)
                 - _signed_sqrt(true_box[..., 2:], eps)) ** 2).sum(dim=-1)

    coord_loss = lambda_coord * (responsible * (center_err + size_err)).sum()
    obj_conf_loss = (responsible * (pred_conf - 1.0) ** 2).sum()
    noobj_conf_loss = lambda_noobj * ((1.0 - responsible) * pred_conf ** 2).sum()
    class_loss = (obj_mask.unsqueeze(-1) * (pred_classes - true_classes) ** 2).sum()

    batch_loss = coord_loss + obj_conf_loss + noobj_conf_loss + class_loss
    return batch_loss / count
        

0
5
5


In [22]:
lst = ['a', 'b', 'c']

# list.index() reports a missing element by raising ValueError rather than
# returning a sentinel, so the failed lookup is deliberately ignored here.
try:
    lst.index('z')
except ValueError:
    pass

-1


In [11]:
# Nested-dictionary lookup: index the outer key first, then the inner one.
obj = {'name': '윤영로', 'age': 24, 'annotations': {'type': 'helo'}}
obj['annotations']['type']

cell = [0, 1] x [0, 1]

'helo'

In [42]:
# Lower-casing makes the string comparison case-insensitive.
'A'.lower() == 'a'

True