In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models
import torchvision.transforms as transforms
import torch.optim as optim

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
from PIL import Image
import cv2

from typing import List, Dict, Tuple

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cpu device


## Pacal VOC 2007 Dataset load

In [4]:
pre_processing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

In [12]:
_dataset = datasets.VOCDetection(root='../data/VOC/2007/train', year='2007', image_set='train', download=False, transform=pre_processing)

In [13]:
def imshow(img, norm: bool=False):
    if norm:
        img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    img = np.transpose(npimg, (1, 2, 0))
    return img

In [14]:
classes = {
    'person': 0, 'bird': 1,
    'cat': 2, 'cow': 3,
    'dog': 4, 'horse': 5,
    'sheep': 6, 'aeroplane': 7,
    'bicycle': 8, 'boat': 9,
    'bus': 10, 'car': 11,
    'motorbike': 12, 'train': 13,
    'bottle': 14, 'chair': 15,
    'diningtable': 16, 'pottedplant': 17,
    'sofa': 18, 'tvmonitor': 19
    }

In [40]:
def parse_from_target(classes: Dict, data: Tuple) -> Tuple:
    image, target = data
    
    labels: List = []
    bboxes: List = []

    for i in range(len(target['annotation']['object'])):
        bbox: List = []
        _class = target['annotation']['object'][i]['name']
        #print(f'class: {_class}')
        label = classes[_class]
        #print(f'label: {label}')
        bndbox = target['annotation']['object'][i]['bndbox']
        width = target['annotation']['size']['width']
        height = target['annotation']['size']['height']
        size = (int(width), int(height))

        for key, val in bndbox.items():
            bbox.append(int(val))
        labels.append(label)
        bboxes.append(bbox)

    image = torch.tensor(image)
    labels = torch.tensor(labels)
    bboxes = torch.tensor(bboxes)
    size = torch.tensor(size)
    #print(f' labels: {labels}')
    return image, labels, bboxes, size

In [41]:
import torchvision.transforms.functional as FT

def resize(image, bboxes, image_size, dims: Tuple=(300, 300), return_percent_coords=True):
    width, height = image_size[0], image_size[1]

    # Resize image
    re_image = FT.resize(image, dims)

    # Resize bounding boxes
    old_dims = torch.FloatTensor([width, height, width, height]).unsqueeze(0)
    re_bboxes: List = []
    for bbox in bboxes:
        bbox = torch.tensor(bbox)
        re_bbox = bbox / old_dims  # percent coordinates
        re_bbox = re_bbox.tolist()[0]

        for i, rate in enumerate(re_bbox):
            if i % 2 == 0:
                re_bbox[i] = rate * dims[0]
            elif i % 2 == 1:
                re_bbox[i] = rate * dims[1]
        re_bboxes.append(re_bbox)
    re_bboxes = torch.tensor(re_bboxes)

    if not return_percent_coords:
        new_dims = torch.FloatTensor([dims[1], dims[0], dims[1], dims[0]]).unsqueeze(0)
        re_bboxes: List = []
        for bbox in bboxes:
            bbox = torch.tensor(bbox)
            re_bbox = bbox * new_dims
            re_bboxes.append(re_bbox.tolist()[0])
        re_bboxes = torch.tensor(re_bboxes)

    return re_image, re_bboxes

In [42]:
class myDataset(Dataset):
    def __init__(self, classes, _dataset, dims: Tuple, norm: bool=False):
        super(myDataset, self).__init__()
        self.classes = classes
        self._dataset = _dataset
        self.dims = dims
        self.norm = norm

    def __len__(self) -> int:
        return len(self._dataset)

    def __getitem__(self, idx: int):
        image, labels, bboxes, image_size = parse_from_target(self.classes, _dataset[idx])
        re_image, re_bboxes = resize(image, bboxes, image_size, dims=self.dims)
        if self.norm:
            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            re_image = normalize(re_image)

        return re_image, labels, re_bboxes

    def collate_fn(self, batch):
            """
            Since each image may have a different number of objects, we need a collate function (to be passed to the DataLoader).
            This describes how to combine these tensors of different sizes. We use lists.
            Note: this need not be defined in this Class, can be standalone.
            :param batch: an iterable of N sets from __getitem__()
            :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties
            """

            images: List=list()
            labels: List=list()
            bboxes: List=list()

            for b in batch:
                images.append(b[0])
                labels.append(b[1])
                bboxes.append(b[2])
            images = torch.stack(images, dim=0)
            return images, labels, bboxes


In [43]:
train_ds = myDataset(classes, _dataset, (300, 300))

In [44]:
batch_size=64
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=train_ds.collate_fn)

## Model 정의

In [45]:
class BB_model(nn.Module):
    def __init__(self):
        super(BB_model, self).__init__()
        resnet = models.resnet34(pretrained=True)
        layers = list(resnet.children())[:8]
        self.features1 = nn.Sequential(*layers[:6])
        self.features2 = nn.Sequential(*layers[6:])
        self.classifier = nn.Sequential(nn.BatchNorm1d(512), nn.Linear(512, 4))
        self.bb = nn.Sequential(nn.BatchNorm1d(512), nn.Linear(512, 4))
        
    def forward(self, x):
        x = self.features1(x)
        x = self.features2(x)
        x = F.relu(x)
        x = nn.AdaptiveAvgPool2d((1,1))(x)
        x = x.view(x.shape[0], -1)
        return self.classifier(x), self.bb(x)

In [46]:
model = BB_model().to(device)

In [47]:
def update_optimizer(optimizer, lr):
    for i, param_group in enumerate(optimizer.param_groups):
        param_group["lr"] = lr

In [48]:
learning_rate = 0.001

criterion = nn.CrossEntropyLoss()  # 손실함수 설정
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # 최적화 설정

In [49]:
def get_accuracy(y, label):
    y_idx = torch.argmax(y, dim=1)
    result = y_idx - label

    num_correct = 0
    for i in range(len(result)):
        if result[i] == 0:
            num_correct += 1

    return num_correct/y.shape[0]

In [53]:
import time

def train(dataloader, model, loss_fn, optimizer):
    num_batches = len(dataloader)
    train_loss_list, train_acc_list = [], []

    start_time = time.time()
    for batch, (images, labels, bboxes) in enumerate(dataloader):
        print(f'image: {len(images)}')
        print(f'labels: {len(labels)}')
        print(f'bboxes: {len(bboxes)}')
        model.train()
        # x: 입력, y: 정답(레이블)을 받아온 후 device에 올려줌
        images = images.to(device)
        labels = [l.to(device) for l in labels]
        bboxs = [b.to(device) for b in bboxes]
        
        # 예측 오류 계산
        pred = model(images)
        loss = loss_fn(pred, labels)  # 손실함수 계산

        # 역전파
        optimizer.zero_grad() # 학습 수행 전 미분값을 0으로 초기화(학습전 반드시 처리 필요)
        loss.backward()       # 가중치와 편향에 대한 기울기 계산
        optimizer.step()      # 가중치와 편향 업데이트

        # 학습 정확도 및 손실함수 값 기록
        train_acc = get_accuracy(pred, labels)  # 정확도 계산

        train_loss_list.append(loss.item())
        train_acc_list.append(train_acc)

        if (batch+1) % num_batches == 0:
            print(f'step: {batch+1}/{num_batches} | {time.time() - start_time:.2f} s/step | ', end='')
            print(f'train loss: {np.mean(train_loss_list):.4f} | train acc: {np.mean(train_acc_list):.4f} | ', end='')

In [54]:
num_epochs = 10
for epoch in range(num_epochs):
    print(f'Epoch: {epoch+1}/{num_epochs}')
    train(train_dl, model, criterion, optimizer)
print("Done!")   

Epoch: 1/10
  image = torch.tensor(image)
  bbox = torch.tensor(bbox)
image: 64
labels: 64
bboxes: 64
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not tuple