In [1]:
# TORCHVISION OBJECT DETECTION FINETUNING TUTORIAL을 참조하여 구현하였습니다.
# https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
import os
import time
import pickle
import numpy as np
from PIL import Image, ImageDraw
from xml.etree.ElementTree import parse

import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

from engine import train_one_epoch, evaluate
import transforms as T
import utils

In [2]:
# Classes the detector is trained on, listed in label order.
# Position in this list is the integer label; index 0 is "background".
_CLASS_NAMES = [
    "background",
    "naengmyeon", "bibimbap", "samgyetang", "yukgaejang", "galbijjim",
    "ddukguk", "sundae", "jokbal", "japchae", "bindaetteok",
    "udon", "tonkatsu", "soba", "gyudon", "ramen", "okonomiyaki",
    "jajangmyeon", "jjambbong", "tangsuyuk", "beijingduck", "mapotofu",
    "shaolongbao",
    "hamburger", "margeritapizza", "friedchicken", "curryrice",
    "fishandchips", "tomyamkung", "nasigoreng",
]

# Mapping from class name to integer label used by the dataset below.
class_dict = {name: label for label, name in enumerate(_CLASS_NAMES)}

# Dataset class
class FoodetectorDataset(torch.utils.data.Dataset):
    """Object-detection dataset backed by image files and XML annotations.

    ``root`` must contain an ``images`` and an ``annotations`` directory.
    Both listings are sorted and then paired by position, so sample ``idx``
    uses the ``idx``-th image together with the ``idx``-th annotation.
    NOTE(review): this assumes both directories hold matching files in the
    same sort order -- confirm for the actual data layout.

    Args:
        root: Directory holding the ``images`` and ``annotations`` folders.
        transforms: Optional callable applied as ``transforms(img, target)``
            and expected to return the transformed ``(img, target)`` pair.
    """

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms

        self.imgs = list(sorted(os.listdir(os.path.join(root, "images"))))
        self.annos = list(sorted(os.listdir(os.path.join(root, "annotations"))))

    @staticmethod
    def _parse_annotation(anno_path):
        """Read class labels and bounding boxes from one annotation XML file.

        Every ``<object>`` element contributes one label (its ``<name>``
        looked up in ``class_dict``) and one box taken from its ``<bndbox>``.

        Returns:
            ``(labels, boxes)``: an int64 tensor of shape ``(N,)`` and a
            float32 tensor of shape ``(N, 4)`` laid out as
            ``[xmin, ymin, xmax, ymax]``. ``N`` is 0 when the file contains
            no objects.

        Raises:
            KeyError: if an object name is missing from ``class_dict``.
        """
        labels = []
        boxes = []
        for obj in parse(anno_path).getroot().findall("object"):
            labels.append(class_dict[obj.find('name').text])
            bndbox = obj.find('bndbox')
            # float() accepts both "12" and "12.5"; the boxes are stored as
            # float32 anyway, so integer coordinates are unaffected.
            boxes.append([float(bndbox.find(tag).text)
                          for tag in ('xmin', 'ymin', 'xmax', 'ymax')])

        labels = torch.as_tensor(labels, dtype=torch.int64)
        # Force an (N, 4) layout: an empty list would otherwise become a
        # shape-(0,) tensor and break the column indexing done by callers.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        return labels, boxes

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair for sample ``idx``.

        ``target`` is a dict with the keys ``boxes``, ``labels``,
        ``image_id``, ``area`` and ``iscrowd``.
        """
        # load images and annotations
        img_path = os.path.join(self.root, "images", self.imgs[idx])
        anno_path = os.path.join(self.root, "annotations", self.annos[idx])
        img = Image.open(img_path).convert("RGB")

        # Read the annotation XML file and turn it into tensors
        labels, boxes = self._parse_annotation(anno_path)

        image_id = torch.tensor([idx])
        # height * width of every box
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((len(labels),), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of samples, i.e. the number of files under ``images``."""
        return len(self.imgs)

In [3]:
def get_object_detection_model(num_classes):
    """Create a Faster R-CNN (ResNet-50 FPN) detector with a fresh box head.

    The detector is built with ``pretrained=True`` (COCO weights, per the
    original note) and its box predictor is swapped for a new
    ``FastRCNNPredictor`` sized for ``num_classes`` outputs.

    Args:
        num_classes: Number of output classes of the new predictor.

    Returns:
        The detector with the replaced ``roi_heads.box_predictor``.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # The new head must accept the same feature size as the one it replaces.
    head_input_size = detector.roi_heads.box_predictor.cls_score.in_features

    # Replace the classifier / box regressor that will be trained.
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_input_size, num_classes)
    return detector

In [5]:
def get_transform(train):
    """Compose the preprocessing pipeline for training or evaluation.

    The pipeline always starts with ``T.ToTensor()``. When ``train`` is
    truthy, ``T.RandomHorizontalFlip(0.5)`` is appended as augmentation.

    Args:
        train: Whether the pipeline is built for the training set.

    Returns:
        A ``T.Compose`` wrapping the selected steps.
    """
    # Always convert the PIL image into a PyTorch tensor first.
    steps = [T.ToTensor()]
    if train:
        # Training only: random horizontal flip of image and ground truth.
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)

In [4]:
# 29 food classes are detected, plus 1 background class.
# Derive the count from the label map (largest label + 1) instead of
# hard-coding 29 + 1, so the predictor size cannot drift from class_dict.
num_classes = max(class_dict.values()) + 1

# Create the model instance
model = get_object_detection_model(num_classes)
# print(model)

In [6]:
# Two views over the same 'data' directory: the training view uses the
# augmenting transform, the evaluation view the plain one.
dataset = FoodetectorDataset('data', get_transform(train=True))
dataset_test = FoodetectorDataset('data', get_transform(train=False))

# Shuffle the sample indices with a fixed seed; the last 300 shuffled
# indices form the test split, everything before them the training split.
torch.manual_seed(2)
indices = torch.randperm(len(dataset)).tolist()
train_indices = indices[:-300]
test_indices = indices[-300:]
dataset = torch.utils.data.Subset(dataset, train_indices)
dataset_test = torch.utils.data.Subset(dataset_test, test_indices)

# Settings shared by the training and validation data loaders.
loader_kwargs = dict(num_workers=4, collate_fn=utils.collate_fn)

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, **loader_kwargs)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, **loader_kwargs)

# Persist the test split so it can be evaluated separately later.
with open('test_data.pkl', 'wb') as output:
    pickle.dump(dataset_test, output, pickle.HIGHEST_PROTOCOL)

In [7]:
# Select the device: CUDA when available, otherwise the CPU.
# os.environ["CUDA_VISIBLE_DEVICES"] = '1, 2, 6, 7'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device :", device)

# Put the model on the selected device.
model.to(device)

# Optimize only the parameters that require gradients.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.002,
    momentum=0.9,
    weight_decay=0.0005,
)

# Halve the learning rate every 3 epochs.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.5,
)

device : cuda


In [None]:
start = time.time()

# let's train it for 10 epochs
num_epochs = 10

# Checkpoint file names are numbered as (epoch + 1 + epoch_offset), so the
# files written by this loop are ...epoch25.model and ...epoch30.model.
# NOTE(review): presumably this run continues an earlier 20-epoch run -- confirm.
epoch_offset = 20

# Create the checkpoint directory before training starts; otherwise the first
# torch.save would fail on a missing directory only after 5 full epochs.
os.makedirs('data/models', exist_ok=True)

for epoch in range(num_epochs):
    # train for one epoch, printing every 100 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=100)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

    # save a checkpoint after every 5th epoch
    if epoch % 5 == 4:
        modelname = 'data/models/take5_epoch' + str(epoch + 1 + epoch_offset) + '.model'
        torch.save(model.state_dict(), modelname)

# total wall-clock time of the run, in seconds
print (time.time() - start)

In [10]:
# Ask PyTorch to empty its CUDA cache once training is done.
torch.cuda.empty_cache()
# Save the model manually if needed
# torch.save(model.state_dict(), "data/models/take5_epoch15.model")