# 基于VGG-16的Fast RCNN网络

Fast RCNN主干网络基于VGG16，做了一定的修改：将最后一层池化层，改为RoI池化层，并在全连接层之后，增加了两个平行网络。

VGG16网络结构图如下：

<img src="./vgg16-architecture.png" style="width:300px;height:300px;">


Fast RCNN 加入了 RoI 池化层和 2 个平行网络，结构如下图：
<img src="./fastrcnn-architecture.png" style="width:300px;height:300px;">


In [1]:
import time
import torch
import torchvision.models as models 
from torch import nn,optim

import utils

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device=torch.device('cuda')
else:
    device=torch.device('cpu')

## 定义RoI pooling layer

In [2]:
import numpy as np
class RoIPool(nn.Module):
    """RoI max-pooling: crop each region of interest out of a feature map and
    pool the crop to a fixed spatial size.

    Args:
        output_size: target (height, width) of every pooled RoI; handed
            unchanged to ``nn.AdaptiveMaxPool2d``.
    """

    def __init__(self,output_size=(7,7)):
        super(RoIPool,self).__init__()
        # Adaptive pooling produces the same output size whatever the crop size.
        self.roi_Pool=nn.AdaptiveMaxPool2d(output_size)
        self.size=output_size

    def forward(self,feature_map,rois,roi_idxs):
        """Pool every RoI to ``output_size``.

        Args:
            feature_map: 4-D tensor (batch, channels, height, width).
            rois: array-like or tensor of shape (n, 4) holding
                (x1, y1, x2, y2) as fractions of the feature-map width/height.
            roi_idxs: for each RoI, the batch index of the image it belongs to.

        Returns:
            Tensor of shape (n, channels, out_h, out_w), one slice per RoI.
        """
        assert feature_map.dim()==4,'should 4d: (n.c,h,w)'
        # Accept tensors as well as numpy arrays / nested lists.
        if torch.is_tensor(rois):
            rois=rois.detach().cpu().numpy()
        rois=np.asarray(rois,dtype=float).reshape(-1,4)
        n=rois.shape[0]
        _,c,h,w=feature_map.size()

        if n==0:
            # torch.cat rejects an empty list, so return an empty batch directly.
            out_h,out_w=(self.size,self.size) if isinstance(self.size,int) else self.size
            return feature_map.new_zeros((0,c,out_h,out_w))

        # Scale the normalized coordinates to feature-map pixels. Clamp them to
        # the map and force every crop to span at least one pixel, so negative
        # indices or empty slices can never reach the pooling layer.
        x1=np.clip(np.floor(rois[:,0]*w),0,w-1).astype(int)
        y1=np.clip(np.floor(rois[:,1]*h),0,h-1).astype(int)
        x2=np.clip(np.ceil(rois[:,2]*w),x1+1,w).astype(int)
        y2=np.clip(np.ceil(rois[:,3]*h),y1+1,h).astype(int)

        # One fixed-size pooled map per RoI.
        res=[]
        for i in range(n):
            # Pick the image this RoI belongs to and keep a batch dimension of 1.
            img=feature_map[int(roi_idxs[i])].unsqueeze(0)
            # Crop (batch, channels, rows, cols) to the RoI.
            img=img[:,:,y1[i]:y2[i],x1[i]:x2[i]]
            # Adaptive max-pool the crop to the fixed output size.
            res.append(self.roi_Pool(img))
        # Stack along the batch axis: (n, channels, out_h, out_w).
        return torch.cat(res,dim=0)

w,h=images.size(2),images.size(3)
        n=rois.shape[0]
        x1,y1,x2,y2=rois[:,0],rois[:,1],rois[:,2],rois[:,3]
        x1 = np.floor(x1 * w).astype(int)
        x2 = np.ceil(x2 * w).astype(int)
        y1 = np.floor(y1 * h).astype(int)
        y2 = np.ceil(y2 * h).astype(int)
        
        res = []
        for i in range(n):
            img = images[roi_idx[i]].unsqueeze(0)
            img = img[:, :, y1[i]:y2[i], x1[i]:x2[i]]
            img = self.maxpool(img)
            res.append(img)
        res = torch.cat(res, dim=0)
        return res

## 定义Fast RCNN网络


加载torchvision.models库中的vgg16模型，

使用预训练权重：`pretrained=True`

In [3]:
# Load torchvision's VGG-16 with pretrained weights.
vgg16=models.vgg16(pretrained=True)

# Convolutional stack of VGG-16 with its last child layer dropped.
feature=nn.Sequential(*list(vgg16.features.children())[:-1])
# Bare expression so the notebook displays the resulting module.
feature

In [24]:
class FastRcnn(nn.Module):
    """Fast R-CNN built from the module-level ``vgg16`` model.

    The VGG-16 convolutional stack (minus its last layer) feeds an RoI pooling
    layer, followed by the VGG-16 classifier (minus its last layer) and two
    parallel linear heads: class scores and per-class box outputs.

    Args:
        num_classes: number of foreground classes; one extra slot is added
            for the background class.
    """

    def __init__(self,num_classes=10):
        super(FastRcnn,self).__init__()
        # Stored because forward() needs it to reshape the box head output.
        self.num_classes=num_classes

        # VGG-16 features without the final max-pool layer.
        self.feature=nn.Sequential(*list(vgg16.features.children())[:-1])

        # RoI max-pooling to a fixed 7x7 grid.
        self.roiPool=RoIPool(output_size=(7,7))

        # VGG-16 classifier without its last fully connected layer.
        self.fc=nn.Sequential(*list(vgg16.classifier.children())[:-1])

        # Two parallel heads; the +1 is the background class.
        self.clss=nn.Linear(4096,num_classes+1)
        self.bbox=nn.Linear(4096,(num_classes+1) * 4)

    def forward(self,x,rois,roi_idxs):
        """Run detection on a batch of images and their RoIs.

        Args:
            x: image batch passed to the convolutional stack.
            rois: RoI coordinates, forwarded to the RoI pooling layer.
            roi_idxs: batch index of the image each RoI belongs to.

        Returns:
            (clss, bbox): class scores of shape (n_rois, num_classes + 1) and
            box outputs of shape (n_rois, num_classes + 1, 4).
        """
        x=self.feature(x)
        x=self.roiPool(x,rois,roi_idxs)
        # Detach so no gradient flows back into the convolutional stack.
        x=x.detach()
        # Flatten each RoI separately; one row per RoI keeps batches of RoIs valid.
        x=x.view(x.size(0),-1)
        x=self.fc(x)
        # Two parallel heads.
        clss=self.clss(x)
        bbox=self.bbox(x).view(-1,self.num_classes+1,4)
        return clss,bbox

计算维度

In [4]:
# Standalone copies of the three stages, built at module level so tensor
# shapes can be checked step by step.
# VGG-16 convolutional stack with its last child layer dropped.
feature=nn.Sequential(*list(vgg16.features.children())[:-1])
        
# RoI pooling to a fixed 7x7 output.
roiPool=RoIPool(output_size=(7,7))

# VGG-16 classifier with its last child layer dropped.
fc=nn.Sequential(*list(vgg16.classifier.children())[:-1])

In [19]:
# Dummy batch of one 3x224x224 image. torch.rand yields finite values, whereas
# torch.Tensor(1, 3, 224, 224) returns uninitialized memory that may hold NaN/Inf.
_x = torch.rand(1, 3, 224, 224)
# A single RoI covering the whole image, as normalized (x1, y1, x2, y2).
_r = np.array([[0., 0., 1., 1.]])
# The RoI belongs to image 0 of the batch.
_ri = np.array([0])

# End-to-end check, kept for reference:
# fc(roiPool(feature(_x), _r, _ri).view(1, -1)).size()  ->  torch.Size([1, 4096])

In [20]:
# Step 1: run the dummy batch through the convolutional stack.
out=feature(_x)
print(out.shape)

# Step 2: RoI-pool the single RoI.
out=roiPool( out, _r, _ri)
print(out.shape)

# Step 3: flatten to one row and apply the truncated classifier.
out=fc(out.view(1,-1))
print(out.shape)


torch.Size([1, 512, 14, 14])
torch.Size([1, 512, 7, 7])


In [None]:
# Inspect the shape of `out` once flattened to a single row, i.e. the layout
# used for fc(out.view(1,-1)). The original cell ended in a dangling '.',
# which is a syntax error; `.shape` is the presumed intent (TODO confirm).
out.view(1,-1).shape

In [25]:
# Build the network with the default number of classes and print its layers.
net = FastRcnn()
print(net)

FastRcnn(
  (feature): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilati

定义多任务损失函数

In [31]:
# 定义 loc 损失函数
# 计算 预测出的边界框 和 真值边界框 之间

class SmoothL1Loss(nn.Module):
    """Summed smooth-L1 loss between predicted and target box values.

    Per element: ``0.5 * d**2`` when ``|d| < 1`` and ``|d| - 0.5`` otherwise,
    where ``d = preds - targets``.
    """

    def __init__(self):
        super(SmoothL1Loss, self).__init__()

    def forward(self, preds, targets):
        """Return the sum of the elementwise smooth-L1 penalties.

        Args:
            preds: predicted values (tensor).
            targets: target values, broadcastable against ``preds``.

        Returns:
            0-dim tensor holding the summed loss.
        """
        res = self.smoothL1(preds - targets)
        return torch.sum(res)

    def smoothL1(self, x):
        """Apply the smooth-L1 function elementwise to tensor ``x``."""
        abs_x = torch.abs(x)
        # A Python `if` on a multi-element tensor raises; torch.where picks the
        # quadratic or linear branch independently for every element.
        return torch.where(abs_x < 1, 0.5 * torch.pow(x, 2), abs_x - 0.5)

In [33]:
def multiTaskLoss(probs,bbox,labels,gt_bbox,lamb=1):
    """Multi-task loss: classification plus weighted box localization.

    Args:
        probs: class scores of shape (n, num_classes + 1), fed to
            ``nn.CrossEntropyLoss``.
        bbox: per-class box outputs of shape (n, num_classes + 1, 4).
        labels: integer class labels of shape (n,); 0 is the background class.
        gt_bbox: ground-truth box targets of shape (n, 4).
        lamb: weight of the localization term in the total loss (default 1).

    Returns:
        (loss, loss_sc, loss_loc): total loss, classification loss and
        localization loss.
    """
    cls_criterion=nn.CrossEntropyLoss()
    # Summed smooth-L1 (0.5*d^2 if |d| < 1 else |d| - 0.5), evaluated
    # elementwise so it works for whole batches.
    loc_criterion=nn.SmoothL1Loss(reduction='sum')

    # Classification loss.
    loss_sc=cls_criterion(probs,labels)

    n=labels.size(0)
    # Index that selects, for every sample, the 4 box values of its own class.
    lbl=labels.view(-1,1,1).expand(n,1,4)
    # Background samples (label 0) contribute nothing to the localization loss.
    mask=(labels!=0).float().view(-1,1).expand(n,4)

    # Localization loss on the boxes predicted for the labelled class.
    pred_bbox=bbox.gather(1,lbl).squeeze(1)
    loss_loc=loc_criterion(pred_bbox*mask,gt_bbox*mask)

    # Total loss.
    loss=loss_sc+lamb*loss_loc

    return loss,loss_sc,loss_loc

优化器

In [30]:
# Adam over every parameter of the network, learning rate 1e-4.
optimizer = optim.Adam(params=net.parameters(), lr=1e-4)


torch.nn.modules.container.Sequential