<span style="font-size:150%">모델 선언</span>

In [None]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Build a Faster R-CNN detector (ResNet-50 + FPN backbone) with 10 output classes.
# weights=None and weights_backbone=None request no pretrained weights for the
# detector or for the backbone.
model = fasterrcnn_resnet50_fpn(num_classes=10, weights=None, weights_backbone=None)

<span style="font-size:150%">백본</span>

In [None]:
# Print the module structure of the whole detector.
print(model)

In [None]:
# Print only the `body` submodule of the backbone.
print(model.backbone.body)

In [None]:
# Print only the `fpn` submodule of the backbone.
print(model.backbone.fpn)

<span style="font-size:150%">RPN</span>

In [None]:
# Print the region proposal network module.
print(model.rpn)
# Print the anchor sizes and aspect ratios configured on the anchor generator.
print(model.rpn.anchor_generator.sizes)
print(model.rpn.anchor_generator.aspect_ratios)
# Print the anchor templates stored on the generator as `cell_anchors`.
print(model.rpn.anchor_generator.cell_anchors)

In [None]:
import numpy as np
import cv2
import matplotlib.pyplot as plt

# White canvas. uint8 keeps the values on the 0-255 scale that matplotlib
# expects for integer RGB data (a float canvas filled with 255 gets clipped
# to [0, 1] with a warning).
background = np.full((1000, 1000, 3), 255, dtype=np.uint8)
center = (500, 500)
colormap = [(255,255,0),(255,0,255),(255,0,0),(0,255,0),(0,0,255),]
anchors = model.rpn.anchor_generator.cell_anchors

# One colour per anchor group; wrap around the palette length so any number
# of groups can be drawn without running past the end of `colormap`.
for i, anchor in enumerate(anchors):
    color = colormap[i % len(colormap)]
    for x1, y1, x2, y2 in anchor:
        # Anchor coordinates are relative offsets; shift them to the canvas centre.
        px1 = int(center[0] + x1)
        py1 = int(center[1] + y1)
        px2 = int(center[0] + x2)
        py2 = int(center[1] + y2)
        cv2.rectangle(background, (px1, py1), (px2, py2), color, 5)

plt.axis(False)
plt.imshow(background)

In [None]:
# torchvision/models/detection/_utils.py 내 decode_single 함수 발췌

import torch
from torch import Tensor
import math

# Placeholder labels only: they name what each decode_single argument stands
# for (`boxes` = the anchors, `rel_codes` = the RPN output).
boxes = '앵커'
rel_codes = 'RPN 출력'

def decode_single(
    rel_codes: Tensor,
    boxes: Tensor,
    weights: tuple = (1.0, 1.0, 1.0, 1.0),
    bbox_xform_clip: float = math.log(1000.0 / 16),
) -> Tensor:
    """
    From a set of original boxes and encoded relative box offsets,
    get the decoded boxes.

    Args:
        rel_codes (Tensor): encoded boxes of shape (N, 4 * k), laid out as
            repeating (dx, dy, dw, dh) groups along the last dimension.
        boxes (Tensor): reference boxes of shape (N, 4) in
            (x_min, y_min, x_max, y_max) order.
        weights (tuple): divisors (wx, wy, ww, wh) applied to the
            dx, dy, dw, dh offsets before decoding. The default leaves the
            offsets unscaled.
        bbox_xform_clip (float): upper bound applied to dw and dh before
            exponentiation.

    Returns:
        Tensor: decoded boxes of shape (N, 4 * k) in
        (x_min, y_min, x_max, y_max) order, in the dtype of ``rel_codes``.
    """

    boxes = boxes.to(rel_codes.dtype)

    # Reference boxes as centre + size.
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    wx, wy, ww, wh = weights
    dx = rel_codes[:, 0::4] / wx
    dy = rel_codes[:, 1::4] / wy
    dw = rel_codes[:, 2::4] / ww
    dh = rel_codes[:, 3::4] / wh

    # Prevent sending too large values into torch.exp()
    dw = torch.clamp(dw, max=bbox_xform_clip)
    dh = torch.clamp(dh, max=bbox_xform_clip)

    # Centre shifts are expressed in units of the reference size; size changes
    # are expressed in log-space.
    pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
    pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
    pred_w = torch.exp(dw) * widths[:, None]
    pred_h = torch.exp(dh) * heights[:, None]

    # Distance from center to box's corner.
    half_h = 0.5 * pred_h
    half_w = 0.5 * pred_w

    corners = (
        pred_ctr_x - half_w,
        pred_ctr_y - half_h,
        pred_ctr_x + half_w,
        pred_ctr_y + half_h,
    )
    # Interleave the four corner tensors back into (N, 4 * k).
    return torch.stack(corners, dim=2).flatten(1)

rel_codes : [delta_x, delta_y, delta_width, delta_height] 


boxes : [x_min, y_min, x_max, y_max]

In [None]:
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt

# One anchor (centred on the origin) and one set of offsets to decode against it.
rel_codes = torch.tensor([[ 0.1, -0.1, -0.3, -0.3]])
boxes = torch.tensor([[-45., -91.,  45.,  91.]])
pred_boxes = decode_single(rel_codes, boxes)

fig, ax = plt.subplots(1, 1, figsize=(16, 8))

# Canvas size as (height, width).
size = [225,225]
background = np.ones([size[0], size[1], 3], dtype=np.int32)*255
# Boxes are (x_min, y_min, x_max, y_max), so the centring offset must be
# (width/2, height/2, width/2, height/2) to stay correct on non-square canvases.
offset = torch.tensor([size[1] / 2, size[0] / 2] * 2)
boxes = (boxes[0] + offset).int()
pred_boxes = (pred_boxes[0] + offset).int()

print(boxes)
print(pred_boxes)

# Original anchor box (red)
cv2.rectangle(background,(boxes[0].item(), boxes[1].item()),(boxes[2].item(), boxes[3].item()),(255, 0, 0), 3)
# Predicted box (blue)
cv2.rectangle(background,(pred_boxes[0].item(), pred_boxes[1].item()),(pred_boxes[2].item(), pred_boxes[3].item()),(0, 0, 255), 3)


ax.set_axis_off()
ax.imshow(background)

<span style="font-size:150%">ROI 헤드</span>

In [None]:
# Print the RoI heads module of the detector.
print(model.roi_heads)

In [None]:
import torch
import torchvision

# RoI pooler that reads the feature maps named '0' and '1' and produces a
# 7x7 output per box.
m = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=['0', '1'], output_size=(7, 7), sampling_ratio=2
)

# Two random feature maps keyed by level name.
i = {
    '0': torch.rand(1, 256, 64, 64),
    '1': torch.rand(1, 256, 32, 32),
}
# Three random boxes: draw four values per box, then add the first two
# columns to the last two so that x_max/y_max are never below x_min/y_min.
boxes = torch.rand(3, 4) * 256
boxes[:, 2:] = boxes[:, 2:] + boxes[:, :2]
image_sizes = [(512, 512)]
output = m(i, [boxes], image_sizes)

print(boxes)
print(output.shape)
# Same pooled features with everything after the box dimension flattened.
print(output.flatten(start_dim=1).shape)

### 데이터 셋


In [None]:
import os
import numpy as np
import torch
from PIL import Image


class PennFudanDataset(torch.utils.data.Dataset):
    """Penn-Fudan Database for Pedestrian Detection and Segmentation
    Download the dataset from https://www.cis.upenn.edu/~jshi/ped_html/

    Each item is an ``(img, target)`` pair. ``target`` is a dict with the keys
    ``boxes``, ``labels``, ``masks``, ``image_id``, ``area`` and ``iscrowd``.
    """
    def __init__(self, root, transforms):
        """
        Args:
            root: dataset directory containing the ``PNGImages`` and
                ``PedMasks`` sub-directories.
            transforms: callable invoked as ``transforms(img, target)`` and
                returning the transformed pair, or ``None`` to skip it.
        """
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    @staticmethod
    def _build_target(mask, idx):
        """Build the detection target dict from an instance-id mask array.

        Args:
            mask: 2-D integer array in which each instance has its own id
                and 0 is background.
            idx: index of the sample, stored as ``image_id``.

        Returns:
            dict of tensors; every per-instance tensor has a leading
            dimension equal to the number of instances (possibly zero).
        """
        # instances are encoded as different ids; drop the background id (0)
        # explicitly so a mask without any background pixel keeps all of
        # its instances
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # split the id-encoded mask into a stack of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for instance_mask in masks:
            rows, cols = np.nonzero(instance_mask)
            boxes.append(
                [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]
            )

        # convert everything into a torch.Tensor; the reshape keeps an
        # instance-free sample at shape (0, 4) so the column indexing below
        # still works
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        return {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(img, target)``."""
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        # context managers release the file handles as soon as the pixel
        # data has been read
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        with Image.open(mask_path) as mask_file:
            # convert the PIL Image into a numpy array
            mask = np.array(mask_file)

        target = self._build_target(mask, idx)

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of images found under ``PNGImages``."""
        return len(self.imgs)

In [None]:
# path setting
import sys

# Parent of the current working directory; project-local packages are
# imported relative to it.
lecture_root = os.path.dirname(os.getcwd())
# Register it only once so re-running this cell does not pile up duplicate
# sys.path entries.
if lecture_root not in sys.path:
    sys.path.append(lecture_root)

In [None]:
import utils.coco.transforms as T

# Dataset directory located under the lecture root.
data_path = os.path.join(lecture_root, 'data/PennFudanPed')
# Paired (image, target) transform pipeline.
# NOTE(review): presumably converts the PIL image to a tensor and then to
# float dtype, as the transform names suggest — confirm in utils.coco.transforms.
transform = T.Compose([T.PILToTensor(), 
                       T.ConvertImageDtype(torch.float)])


# Dataset instance with the transform pipeline applied to every sample.
trainset = PennFudanDataset(data_path,transform)


<span style="font-size:150%">손실 함수</span>

In [None]:
import torch

# Take the first sample and shape it as a batch of one: a leading batch
# dimension on the image and a one-element list of targets.
sample, target = trainset[0]
sample = sample[None]
target = [target]

# Fresh, untrained detector; calling it with targets returns a dict of losses
# whose keys are printed below.
model = fasterrcnn_resnet50_fpn(
    num_classes=10,
    weights=None,
    weights_backbone=None,
)
losses = model(sample, target)

print(list(losses))

<span style="font-size:150%">어그멘테이션</span>

In [None]:
# Horizontal flip transform created with p=1.
# NOTE(review): presumably p is the flip probability, so every sample is
# flipped — confirm in utils.coco.transforms.
h_flip = T.RandomHorizontalFlip(p=1)

# Dataset without a transform pipeline, so `sample` is the PIL image that
# the dataset loads.
trainset = PennFudanDataset(data_path, None)
sample, target = trainset[0]
# Boxes before the flip.
print(target['boxes'])

new_sample, new_target = h_flip(sample, target)
# Boxes after the flip.
# NOTE(review): if the transform edits `target` in place, `target` and
# `new_target` refer to the same dict — confirm before reusing `target`.
print(new_target['boxes'])

In [None]:
import cv2
import matplotlib.pyplot as plt

%matplotlib inline 

fig, ax = plt.subplots(1, 1, figsize=(16, 8))

# Reload the first sample through the tensor transform pipeline.
trainset = PennFudanDataset(data_path, transform)
sample, target = trainset[0]
print(target['boxes'])

# Move the first axis last, scale by 255 and cast to uint8 for drawing.
# (assumes the transformed image is a channel-first float tensor in [0, 1] — TODO confirm)
sample = (sample.permute(1, 2, 0).numpy() * 255).astype(np.uint8)

# Draw on a copy so `sample` itself stays untouched.
vis_sample = sample.copy()

# One rectangle per ground-truth box, coordinates truncated to integers.
for box in target['boxes'].int():
    x1, y1, x2, y2 = box.tolist()
    cv2.rectangle(vis_sample, (x1, y1), (x2, y2), (0, 0, 255), 3)

ax.set_axis_off()
ax.imshow(vis_sample)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 8))

h_flip = T.RandomHorizontalFlip(p=1)

# Reload the first sample through the tensor transform pipeline, then flip it.
data_path = os.path.join(lecture_root, 'data/PennFudanPed')
trainset = PennFudanDataset(data_path, transform)
sample, target = trainset[0]
new_sample, new_target = h_flip(sample, target)

print(new_target['boxes'])

# Move the first axis last, scale by 255 and cast to uint8 for drawing.
# (assumes the flipped image is a channel-first float tensor in [0, 1] — TODO confirm)
new_sample = (new_sample.permute(1, 2, 0).numpy() * 255).astype(np.uint8)

# Draw on a copy so `new_sample` itself stays untouched.
vis_sample = new_sample.copy()

# One rectangle per flipped box, coordinates truncated to integers.
for box in new_target['boxes'].int():
    x1, y1, x2, y2 = box.tolist()
    cv2.rectangle(vis_sample, (x1, y1), (x2, y2), (0, 0, 255), 3)

ax.set_axis_off()
ax.imshow(vis_sample)
