1. Region Proposal network (RPN)
2. RPN loss functions
3. Region of Interest Pooling (ROI)
4. ROI loss functions

In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import itertools

## Feautre Extraction

In [2]:
dummy_img = torch.zeros((1,3,800,800)).float()

# [y1, x1, y2, x2]
bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]])
labels = torch.LongTensor([6, 8])

# 1x1 in feature map -> 16x16 in image
sub_sample = 16

In [3]:
model = torchvision.models.vgg16(pretrained=True)
fe = list(model.features)

# req_features = fe[:30]
req_features = []
k = dummy_img.clone()
for i in fe:
    k = i(k)
    if k.size()[2] < 800 // 16:
        break
    req_features.append(i)
    out_channels = k.size()[1]

print(len(req_features))
print(out_channels)

30
512


In [4]:
faster_rcnn_feature = nn.Sequential(*req_features)
sample_output = faster_rcnn_feature(dummy_img)
print(sample_output.shape)

torch.Size([1, 512, 50, 50])


## Anchor boxes

In [38]:
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]
anchor_number = len(ratios) * len(anchor_scales)
print(f'Total anchor #: {anchor_number}')

anchor_base = np.zeros((anchor_number ,4), dtype=np.float32)
print(f'anchor base: \n {anchor_base}')
print('-----------------------------------------')

ctr_y = sub_sample / 2.
ctr_x = sub_sample / 2.

for idx, (ratio, anchor_scale) in enumerate(itertools.product(ratios, anchor_scales)):
    h = sub_sample * anchor_scale * np.sqrt(ratio)
    w = sub_sample * anchor_scale * np.sqrt(1. / ratio)
    anchor_base[idx, 0] = ctr_y - h / 2 
    anchor_base[idx, 1] = ctr_x - w / 2
    anchor_base[idx, 2] = ctr_y + h / 2
    anchor_base[idx, 3] = ctr_x + w / 2
    
print(f'anchor_box: \n {anchor_base}')

Total anchor #: 9
anchor base: 
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
-----------------------------------------
anchor_box: 
 [[ -37.254833  -82.50967    53.254833   98.50967 ]
 [ -82.50967  -173.01933    98.50967   189.01933 ]
 [-173.01933  -354.03867   189.01933   370.03867 ]
 [ -56.        -56.         72.         72.      ]
 [-120.       -120.        136.        136.      ]
 [-248.       -248.        264.        264.      ]
 [ -82.50967   -37.254833   98.50967    53.254833]
 [-173.01933   -82.50967   189.01933    98.50967 ]
 [-354.03867  -173.01933   370.03867   189.01933 ]]


In [45]:
fe_size = (800 // 16)  # 피처맵은 50 x 50
ctr_x = np.arange(16, (fe_size + 1) * 16, 16)
ctr_y = np.arange(16, (fe_size + 1) * 16, 16)

anchors = np.zeros((fe_size * fe_size * 9, 4))

ctr = np.zeros((len(ctr_x) * len(ctr_y), 2))
for idx, (x,y) in enumerate(itertools.product(ctr_x, ctr_y)):
    ctr[idx, 1] = x - 8
    ctr[idx, 0] = y - 8

print(len(ctr))
print(f'center example: \n {ctr[:3]} ....\n')

idx = 0
for c_y, c_x in ctr:
    for ratio, anchor_scale in itertools.product(ratios, anchor_scales):
        h = sub_sample * anchor_scale * np.sqrt(ratio)
        w = sub_sample * anchor_scale * np.sqrt(1. / ratio)
        anchors[idx, 0] = c_y - h / 2 
        anchors[idx, 1] = c_x - w / 2
        anchors[idx, 2] = c_y + h / 2
        anchors[idx, 3] = c_x + w / 2 
        idx += 1
        
print(f'anchor shape:{anchors.shape}')
print(anchors)

2500
center example: 
 [[ 8.  8.]
 [24.  8.]
 [40.  8.]] ....

anchor shape:(22500, 4)
[[ -37.254834    -82.50966799   53.254834     98.50966799]
 [ -82.50966799 -173.01933598   98.50966799  189.01933598]
 [-173.01933598 -354.03867197  189.01933598  370.03867197]
 ...
 [ 701.49033201  746.745166    882.50966799  837.254834  ]
 [ 610.98066402  701.49033201  973.01933598  882.50966799]
 [ 429.96132803  610.98066402 1154.03867197  973.01933598]]


------------
- IoU가 0.7 이상인 anchor는 positive, 0.3 이하인 anchor는 negative label을 부여한다

In [66]:
bbox = np.array([[20,30,400,500],[300,400,500,600]], dtype=np.float32)
labels = np.asarray([6, 8], dtype=np.int8) # 0 represents background

inside_index = np.where((anchors[:,0] >=0) &
                        (anchors[:,1] >=0) &
                        (anchors[:,2] <=800) &
                        (anchors[:,3] <=800))[0]
print(f'valid anchor counts: {inside_index.shape}')

label = np.empty((len(inside_index),), dtype=np.int32)
label.fill(-1)
print(f'anchor labels:{label.shape}')

valid_anchor_boxes = anchors[inside_index]
print(f'valid anchor shape:{valid_anchor_boxes.shape}')

valid anchor counts: (8940,)
anchor labels:(8940,)
valid anchor shape:(8940, 4)


In [67]:
def IoU(bbox, anchor):
    ya1, xa1, ya2, xa2 = anchor
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    yb1, xb1, yb2, xb2 = bbox
    box_area = (yb2 - yb1) * (xb2 - xb1)
    inter_x1 = max([xb1, xa1])
    inter_x2 = min([xb2, xa2])
    inter_y1 = max([yb1, ya1])
    inter_y2 = min([yb2, ya2])
    
    if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
        inter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
        iou = inter_area / (anchor_area + box_area - inter_area)
    else:
        iou = 0
    return iou

In [83]:
ious = np.zeros((len(valid_anchor_boxes), 2), dtype=np.float32)
print(f'bbox: \n {bbox}')

for idx, anchor in enumerate(valid_anchor_boxes):
    iou1 = IoU(bbox[0], anchor)
    iou2 = IoU(bbox[1], anchor)
    ious[idx] = [iou1, iou2]
    
print(f'ious shape: {ious.shape}')

bbox: 
 [[ 20.  30. 400. 500.]
 [300. 400. 500. 600.]]
ious shape: (8940, 2)


In [101]:
gt_argmax_ious = ious.argmax(axis = 0) # 어떤 gt object가 max iou를 갖는지 알려줌
print(gt_argmax_ious) 

gt_max_ious = ious[gt_argmax_ious, [0, 1]] # max iou 값
print(gt_max_ious)

gt_argmax_ious = np.where(ious == gt_max_ious)[0] # 가장 높은 iou 값을 같는 gt object들을 알려준다.
print(gt_argmax_ious)


[2262 5620]
[0.68130493 0.61035156]
[2262 2508 5620 5628 5636 5644 5866 5874 5882 5890 6112 6120 6128 6136
 6358 6366 6374 6382]


In [133]:
argmax_ious = ious.argmax(axis=1) # 두개의 bbox중 iou max값을 갖는 인덱스 위치(argmax_ious)

print(argmax_ious.shape)
print(argmax_ious)

max_ious = ious[np.arange(len(inside_index)), argmax_ious] # 두개의 bbox중 max iou의 값들의 집합
print(max_ious)

(8940,)
[0 0 0 ... 0 0 0]
[0.06811669 0.07083762 0.07083762 ... 0.         0.         0.        ]


## label 조건
pos_iou_threshold = 0.7 <br>
neg_iou_threshold = 0.3  

label[max_ious < neg_iou_threshold] = 0 <br>
label[gt_argmax_ious] = 1 <br>
label[max_ious > pos_iou_threshold] = 1

- anchor 박스로 iou 계산 시 너무 많은 negative sample이 나오기 때문에 pos 와 neg 균형을 1:1로 맞추어 주는 작업이 필요  
  한장의 이미지 당 random 하게 256개의 anchor를 1대1로 맞춰 학습을 진행한다.  
  만약 pos가 128보다 적으면 neg를 padding한다.

In [141]:
pos_iou_threshold = 0.7
neg_iou_threshold = 0.3

label[max_ious < neg_iou_threshold] == 0
label[gt_argmax_ious] = 1
label[max_ious > pos_iou_threshold] = 1

In [142]:
pos_ratio = 0.5
n_sample = 256  

# positive samples
n_pos = pos_ratio * n_sample
pos_index = np.where(label == 1)[0]

if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace = False)
    label[disable_index] = -1

# negative samples
n_neg = np.sum(label == 1)
neg_index = np.where(label == 0)[0]

if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace = False)
    label[disable_index] = -1

In [149]:
max_iou_bbox = bbox[argmax_ious]
print(max_iou_bbox)

[[ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 ...
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]]


In [158]:
height = valid_anchor_boxes[:, 2] - valid_anchor_boxes[:, 0]
width = valid_anchor_boxes[:, 3] - valid_anchor_boxes[:, 1]
ctr_y = valid_anchor_boxes[:, 0] + 0.5 * height
ctr_x = valid_anchor_boxes[:, 1] + 0.5 * width

base_height = max_iou_bbox[:, 2] - max_iou_bbox[:, 0]
base_width = max_iou_bbox[:, 3] - max_iou_bbox[:, 1]
base_ctr_y = max_iou_bbox[:, 0] + 0.5 * base_height
base_ctr_x = max_iou_bbox[:, 1] + 0.5 * base_width

----

<img src="https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FHaclG%2FbtqBdUkizUl%2FOzPRkcX2FPJPFmN8BKlzl1%2Fimg.png" width="600" height="400">

- 출처 https://ganghee-lee.tistory.com/37

In [185]:
eps = np.finfo(height.dtype).eps #dtype이 가질수 있는 최소값
height = np.maximum(height, eps) # 가능 최소값보다 큰것
width = np.maximum(width, eps)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(anchor_locs)

[[ 0.5855728   2.30914558  0.7415674   1.64727602]
 [ 0.49718446  2.30914558  0.7415674   1.64727602]
 [ 0.40879611  2.30914558  0.7415674   1.64727602]
 ...
 [-2.50801936 -5.29225232  0.7415674   1.64727602]
 [-2.59640771 -5.29225232  0.7415674   1.64727602]
 [-2.68479606 -5.29225232  0.7415674   1.64727602]]


In [194]:
# Final labels:
anchor_labels = np.empty((len(anchors),), dtype=label.dtype)
anchor_labels.fill(-1)
anchor_labels[inside_index] = label

# Final locations
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[inside_index, :] = anchor_locs

- anchor_locations [N, 4] — [22500, 4]
- anchor_labels [N,] — [22500]

----
## Resion Proposal Network

<img src="https://miro.medium.com/max/700/1*rQ99lLIs7xTAjTaKHHNatA.png" width="700" height="600">

- A box regrression layer
- A box classification layer

In [203]:
import torch.nn as nn 

mid_channels = 512
in_channels = 512
n_anchor = 9        # Number of anchors at each location in the feature map

conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor*4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor*2, 1, 1, 0)

In [204]:
# conv1 sliding layer
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()

# reg_layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()

# cls_layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [214]:
x= conv1(sample_output)
pred_anchor_locs = reg_layer(x)
pred_cls_scores = cls_layer(x)

print(pred_anchor_locs.shape, pred_cls_scores.shape)

torch.Size([1, 36, 50, 50]) torch.Size([1, 18, 50, 50])


In [215]:
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)

pred_cls_scores = pred_cls_scores.permute(0, 2, 3 ,1).contiguous()
print(pred_cls_scores.shape)

objectness_score = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)

pred_cls_scores = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)

torch.Size([1, 22500, 4])
torch.Size([1, 50, 50, 18])
torch.Size([1, 22500])
torch.Size([1, 22500, 2])


pred_cls_scores와 pred_anchor_locs 는 RPN network의 output 이며 가중치를 업데이트하기 위한 loss <br>
pred_cls_scores와 objectness_score 는 proposal layer에 input으로 사용, 추후 RoI 네트워크에 의해 사용되는 제안 집합 생성.

## Generating proposals to feed Fast R-CNN network

- Weather training_mode or testing mode
- nms_thresh
- n_train_pre_nms — number of bboxes before nms during training
- n_train_post_nms — number of bboxes after nms during training
- n_test_pre_nms — number of bboxes before nms during testing
- n_test_post_nms — number of bboxes after nms during testing
- min_size — minimum height of the object required to create a proposal.

In [344]:
nms_thresh = 0.7
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16


anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
objectness_score_numpy = objectness_score[0].data.numpy()

In [345]:
dy = pred_anchor_locs_numpy[:, 0]
dx = pred_anchor_locs_numpy[:, 1]
dh = pred_anchor_locs_numpy[:, 2]
dw = pred_anchor_locs_numpy[:, 3]

ctr_y = dy * anc_height + anc_ctr_y
ctr_x = dx * anc_width + anc_ctr_x
h = np.exp(dh) * anc_height
w = np.exp(dw) * anc_width

- convert [ctr_x, ctr_y, h, w] to [y1, x1, y2, x2] format

In [346]:
# region of interest 
roi = np.zeros(pred_anchor_locs_numpy.shape, dtype = np.float32)
roi[:, 0] = ctr_y - 0.5 * h
roi[:, 1] = ctr_x - 0.5 * w
roi[:, 2] = ctr_y + 0.5 * h
roi[:, 3] = ctr_x + 0.5 * w

roi

array([[ -39.861942,  -77.231255,   54.963337,  100.58985 ],
       [ -77.41494 , -182.63435 ,   97.46352 ,  182.30638 ],
       [-177.5977  , -354.01462 ,  190.20793 ,  360.53113 ],
       ...,
       [ 700.5433  ,  747.29663 ,  881.66046 ,  835.72626 ],
       [ 614.21576 ,  705.8609  ,  966.6589  ,  883.4604  ],
       [ 420.8336  ,  602.99756 , 1132.7819  ,  967.1268  ]],
      dtype=float32)

- clip the predicted boxes to the image

In [347]:
img_size = (800, 800)
roi = roi.clip(0, img_size[0]) # 직사각형인 경우 달라질 수 있음
print(roi)

[[  0.         0.        54.963337 100.58985 ]
 [  0.         0.        97.46352  182.30638 ]
 [  0.         0.       190.20793  360.53113 ]
 ...
 [700.5433   747.29663  800.       800.      ]
 [614.21576  705.8609   800.       800.      ]
 [420.8336   602.99756  800.       800.      ]]


In [348]:
# Remove predicted boxes with either height or width < threshold

hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :] 
score = objectness_score_numpy[keep]

print(roi.shape, score.shape)

order = score.ravel().argsort()[::-1]
print(order)

(22500, 4) (22500,)
[862 871 493 ...   3  22 462]


In [349]:
# 가장 score가 높은 12000개의 roi 박스만 사용
order = order[:n_train_pre_nms]
roi = roi[order, :]
score = score[order]

print(roi.shape)
print(score.shape)

(12000, 4)
(12000,)


In [350]:
y1, x1, y2, x2 = roi.T
areas = (y2 - y1 + 1) * (x2 - x1 +1)
order = score.argsort()[::-1]

keep = []

while order.size > 0:
    i = order[0]
    keep.append(i)
    
    # 计算当前score最大anchor与其他anchor的IOU
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy2 = np.maximum(y2[i], y2[order[1:]])
    xx2 = np.maximum(y2[i], y2[order[1:]])
    
    h = np.maximum(0.0, yy2-yy1)
    w = np.maximum(0.0, xx2-xx1)
    
    inter = h*w
    iou = inter / (areas[i] + areas[order[1:]] - inter)
    
    # IOU threshold
    inds = np.where(iou <= nms_thresh)[0]
    # 要注意这里inds+1, 是因为加上order[0]
    order = order[inds+1]

keep = keep[:n_train_post_nms]
roi = roi[keep]
print(roi.shape)

(494, 4)


In [351]:
len(keep)

494

(22500,)