1. Region Proposal network (RPN)
2. RPN loss functions
3. Region of Interest Pooling (ROI)
4. ROI loss functions

In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import itertools

## Feautre Extraction

In [2]:
dummy_img = torch.zeros((1,3,800,800)).float()

# [y1, x1, y2, x2]
bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]])
labels = torch.LongTensor([6, 8])

# 1x1 in feature map -> 16x16 in image
sub_sample = 16

In [3]:
model = torchvision.models.vgg16(pretrained=True)
fe = list(model.features)

# req_features = fe[:30]
req_features = []
k = dummy_img.clone()
for i in fe:
    k = i(k)
    if k.size()[2] < 800 // 16:
        break
    req_features.append(i)
    out_channels = k.size()[1]

print(len(req_features))
print(out_channels)

30
512


In [4]:
faster_rcnn_feature = nn.Sequential(*req_features)
sample_output = faster_rcnn_feature(dummy_img)
print(sample_output.shape)

torch.Size([1, 512, 50, 50])


## Anchor boxes

In [38]:
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]
anchor_number = len(ratios) * len(anchor_scales)
print(f'Total anchor #: {anchor_number}')

anchor_base = np.zeros((anchor_number ,4), dtype=np.float32)
print(f'anchor base: \n {anchor_base}')
print('-----------------------------------------')

ctr_y = sub_sample / 2.
ctr_x = sub_sample / 2.

for idx, (ratio, anchor_scale) in enumerate(itertools.product(ratios, anchor_scales)):
    h = sub_sample * anchor_scale * np.sqrt(ratio)
    w = sub_sample * anchor_scale * np.sqrt(1. / ratio)
    anchor_base[idx, 0] = ctr_y - h / 2 
    anchor_base[idx, 1] = ctr_x - w / 2
    anchor_base[idx, 2] = ctr_y + h / 2
    anchor_base[idx, 3] = ctr_x + w / 2
    
print(f'anchor_box: \n {anchor_base}')

Total anchor #: 9
anchor base: 
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
-----------------------------------------
anchor_box: 
 [[ -37.254833  -82.50967    53.254833   98.50967 ]
 [ -82.50967  -173.01933    98.50967   189.01933 ]
 [-173.01933  -354.03867   189.01933   370.03867 ]
 [ -56.        -56.         72.         72.      ]
 [-120.       -120.        136.        136.      ]
 [-248.       -248.        264.        264.      ]
 [ -82.50967   -37.254833   98.50967    53.254833]
 [-173.01933   -82.50967   189.01933    98.50967 ]
 [-354.03867  -173.01933   370.03867   189.01933 ]]


In [45]:
fe_size = (800 // 16)  # 피처맵은 50 x 50
ctr_x = np.arange(16, (fe_size + 1) * 16, 16)
ctr_y = np.arange(16, (fe_size + 1) * 16, 16)

anchors = np.zeros((fe_size * fe_size * 9, 4))

ctr = np.zeros((len(ctr_x) * len(ctr_y), 2))
for idx, (x,y) in enumerate(itertools.product(ctr_x, ctr_y)):
    ctr[idx, 1] = x - 8
    ctr[idx, 0] = y - 8

print(len(ctr))
print(f'center example: \n {ctr[:3]} ....\n')

idx = 0
for c_y, c_x in ctr:
    for ratio, anchor_scale in itertools.product(ratios, anchor_scales):
        h = sub_sample * anchor_scale * np.sqrt(ratio)
        w = sub_sample * anchor_scale * np.sqrt(1. / ratio)
        anchors[idx, 0] = c_y - h / 2 
        anchors[idx, 1] = c_x - w / 2
        anchors[idx, 2] = c_y + h / 2
        anchors[idx, 3] = c_x + w / 2 
        idx += 1
        
print(f'anchor shape:{anchors.shape}')
print(anchors)

2500
center example: 
 [[ 8.  8.]
 [24.  8.]
 [40.  8.]] ....

anchor shape:(22500, 4)
[[ -37.254834    -82.50966799   53.254834     98.50966799]
 [ -82.50966799 -173.01933598   98.50966799  189.01933598]
 [-173.01933598 -354.03867197  189.01933598  370.03867197]
 ...
 [ 701.49033201  746.745166    882.50966799  837.254834  ]
 [ 610.98066402  701.49033201  973.01933598  882.50966799]
 [ 429.96132803  610.98066402 1154.03867197  973.01933598]]


## Resion Proposal Network

In [17]:
mid_channels = 512
in_channels = 512
n_anchor = 9        # Number of anchors at each location in the feature map

conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor*4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor*2, 1, 1, 0)

In [19]:
# conv1 sliding layer
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()

# reg_layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()

# cls_layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [29]:
x= conv1(sample_output)
pred_anchor_locs = reg_layer(x)
pred_cls_scores = cls_layer(x)

print(pred_anchor_locs.shape, pred_cls_scores.shape)

torch.Size([1, 36, 50, 50]) torch.Size([1, 18, 50, 50])


In [30]:
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)

pred_cls_scores = pred_cls_scores.permute(0, 2, 3 ,1).contiguous()
print(pred_cls_scores.shape)

objectness_scores = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_scores.shape)

pred_cls_scores = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)

torch.Size([1, 22500, 4])
torch.Size([1, 50, 50, 18])
torch.Size([1, 22500])
torch.Size([1, 22500, 2])
