## Faster RCNN Fine-tuning with PyTorch

In [1]:
import torch
import torchvision
from PIL import Image
import torchvision.transforms as T
import torchvision.models as models
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign
from torchvision.models.detection import FasterRCNN

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Build Faster R-CNN (ResNet-50 + FPN backbone) with its default pretrained
# weights and place it on the selected device. `Module.to` returns the module
# itself, so the construction and the move can be chained.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    weights='FasterRCNN_ResNet50_FPN_Weights.DEFAULT'
).to(device)

In [4]:
def load_image_tensor(path):
    """Read the image file at `path` and return it as a tensor.

    The file is opened in a `with` block so its handle is released right after
    the pixel data has been copied into the tensor. The image is converted to
    RGB first so the result always has 3 channels, matching the 3-element
    mean/std the detection model normalizes with.
    """
    with Image.open(path) as img:
        return T.ToTensor()(img.convert('RGB'))


# Sample 1: one annotated box. Boxes are given as [x1, y1, x2, y2]
# (presumably pixel coordinates of the original image -- confirm against the
# annotation source); every box carries label 1.
first_image = load_image_tensor('image/FudanPed00066.png')
first_box = torch.tensor([[248.0, 50.0, 329.0, 351.0]])
first_label = torch.tensor([1])

# Sample 2: two annotated boxes, both with label 1.
second_image = load_image_tensor('image/PennPed00011.png')
second_box = torch.tensor([[92.0, 62.0, 236.0, 344.0], [242.0, 52.0, 301.0, 355.0]])
second_label = torch.tensor([1, 1])

print('First image size: {}'.format(first_image.size()))
print('Second image size: {}'.format(second_image.size()))

First image size: torch.Size([3, 359, 360])
Second image size: torch.Size([3, 376, 508])


In [5]:
# Run the pretrained detector on both sample images.
# The source tensors are cloned so they stay untouched for later cells.
first_input_image = first_image.clone()
second_input_image = second_image.clone()

inputs = [first_input_image.to(device), second_input_image.to(device)]

model.eval()
# Prediction needs no gradients: disabling autograd avoids building a graph
# (the predictions would otherwise carry a grad_fn) and saves memory.
with torch.no_grad():
    output = model(inputs)

print(output)

[{'boxes': tensor([[243.2258,  47.7818, 327.8421, 349.9229],
        [305.6328,  99.5300, 330.0190, 128.9751]], grad_fn=<StackBackward0>), 'labels': tensor([ 1, 34]), 'scores': tensor([0.9997, 0.0594], grad_fn=<IndexBackward0>)}, {'boxes': tensor([[ 89.9230,  59.4910, 225.3071, 342.8299],
        [244.2283,  49.8334, 304.4795, 362.8903],
        [245.9230, 127.6201, 276.5671, 197.8546],
        [252.0380,  15.8489, 369.9043, 367.3777],
        [245.7938,  99.7875, 294.3491, 198.7888],
        [243.9077, 121.8000, 276.3352, 198.4012],
        [247.6824,  51.5020, 301.1053, 203.1744],
        [245.5552,  95.3306, 295.3181, 199.0098],
        [274.8139,  96.3891, 301.2039, 188.7243],
        [123.6462,  56.9277, 191.9256, 338.0053],
        [240.7440,  44.0858, 333.5300, 235.6630],
        [267.6390, 100.0402, 299.7915, 187.5079]], grad_fn=<StackBackward0>), 'labels': tensor([ 1,  1, 27,  1, 27, 31,  1, 31, 27,  1,  1, 31]), 'scores': tensor([0.9996, 0.9931, 0.7120, 0.6038, 0.3407, 0.3300

In [6]:
# Ground truth for each image: a dict with its boxes and labels, copied and
# moved to the same device as the model.
first_target = {
    'boxes': first_box.clone().to(device),
    'labels': first_label.clone().to(device),
}
second_target = {
    'boxes': second_box.clone().to(device),
    'labels': second_label.clone().to(device),
}
targets = [first_target, second_target]

# Fresh copies of the images, one list entry per image.
first_input_image = first_image.clone()
second_input_image = second_image.clone()
inputs = [first_input_image.to(device), second_input_image.to(device)]

# In training mode the model is called with images and targets; the value of
# the last expression (the loss dictionary) is displayed as the cell output.
model.train()
model(inputs, targets)

{'loss_classifier': tensor(0.0286, grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.0299, grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.0118, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0048, grad_fn=<DivBackward0>)}

In [7]:
# Switch back to evaluation mode and print the module tree.
# `eval()` returns the module itself, so it can be passed straight to print.
print(model.eval())

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [8]:
# Apply only the model's preprocessing stage (normalize + resize + batching)
# to see what the backbone actually receives.
# Work on copies under the `*_input_image` names used by the other cells,
# instead of rebinding `first_image` / `second_image` to clones of themselves;
# the source tensors stay as loaded and the cell can be re-run safely.
first_input_image = first_image.clone()
second_input_image = second_image.clone()

inputs = [first_input_image.to(device), second_input_image.to(device)]
# No targets are passed here, so only the transformed image batch is of interest.
trans_image_list, trans_target_list = model.transform(inputs)

print('Tensor size: {}'.format(trans_image_list.tensors.size()))

Tensor size: torch.Size([2, 3, 800, 1088])


In [9]:
# Report the preprocessing settings of the pretrained model's transform,
# one entry per output line.
print(
    'transform ( GeneralizedRCNNTransform) parameters:',
    'min_size: {}'.format(model.transform.min_size),
    'max_size: {}'.format(model.transform.max_size),
    'image_mean: {}'.format(model.transform.image_mean),
    'image_std: {}'.format(model.transform.image_std),
    sep='\n',
)

transform ( GeneralizedRCNNTransform) parameters:
min_size: (800,)
max_size: 1333
image_mean: [0.485, 0.456, 0.406]
image_std: [0.229, 0.224, 0.225]


In [10]:
# Resize bounds for the fine-tuned model's input transform
# (the pretrained transform printed above uses 800 / 1333).
ft_min_size, ft_max_size = 300, 500

# Per-channel normalization statistics; same values as the pretrained
# transform's image_mean / image_std.
ft_mean, ft_std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]

In [11]:
# Feed the preprocessed batch through the backbone alone to inspect its
# feature maps. The result is only used for inspecting shapes and keys, so
# autograd is disabled to avoid keeping a graph for this large batch.
with torch.no_grad():
    backbone_out = model.backbone(trans_image_list.tensors)

In [12]:
# One line per backbone output: its name followed by the feature-map shape.
for level_name, feature_map in backbone_out.items():
    print('{}: {}'.format(level_name, feature_map.shape))

0: torch.Size([2, 256, 200, 272])
1: torch.Size([2, 256, 100, 136])
2: torch.Size([2, 256, 50, 68])
3: torch.Size([2, 256, 25, 34])
pool: torch.Size([2, 256, 13, 17])


In [13]:
# The backbone exposes its feature-map channel count as `out_channels`.
print(
    'Number of output channel of the backbone: {}'.format(
        model.backbone.out_channels
    )
)

Number of output channel of the backbone: 256


In [14]:
# Load AlexNet with its default pretrained weights; its convolutional part is
# reused below as a lightweight detection backbone.
alexnet = models.alexnet(weights='AlexNet_Weights.DEFAULT')

# Show the layer structure.
print(alexnet)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [15]:
# Use AlexNet's convolutional feature extractor as the detection backbone.
ft_backbone = alexnet.features

# The backbone is given an `out_channels` attribute for the detector to use.
# Read the value from the last convolution layer instead of hard-coding it,
# so it stays correct if the backbone is swapped (it is 256 for AlexNet).
last_conv = [layer for layer in ft_backbone if isinstance(layer, torch.nn.Conv2d)][-1]
ft_backbone.out_channels = last_conv.out_channels

In [16]:
# Display the pretrained model's region proposal network (anchor generator + head).
model.rpn

RegionProposalNetwork(
  (anchor_generator): AnchorGenerator()
  (head): RPNHead(
    (conv): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
    )
    (cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
    (bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
  )
)

In [17]:
# Show the anchor configuration used by the pretrained RPN.
print(
    'Anchor sizes: {}'.format(model.rpn.anchor_generator.sizes),
    'Aspect ratios: {}'.format(model.rpn.anchor_generator.aspect_ratios),
    sep='\n',
)

Anchor sizes: ((32,), (64,), (128,), (256,), (512,))
Aspect ratios: ((0.5, 1.0, 2.0), (0.5, 1.0, 2.0), (0.5, 1.0, 2.0), (0.5, 1.0, 2.0), (0.5, 1.0, 2.0))


In [18]:
# Anchor configuration for the custom model. Each outer tuple has a single
# entry (one group of sizes, one group of ratios), unlike the five groups the
# pretrained FPN model uses.
ft_anchor_sizes = ((32, 64, 128, 256),)
ft_anchor_aspect_ratios = ((0.5, 1.0, 2.0),)

ft_anchor_generator = AnchorGenerator(
    sizes=ft_anchor_sizes,
    aspect_ratios=ft_anchor_aspect_ratios,
)

In [19]:
# Display the pretrained model's RoI heads (RoI pooling, box head, box predictor).
model.roi_heads

RoIHeads(
  (box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
  (box_head): TwoMLPHead(
    (fc6): Linear(in_features=12544, out_features=1024, bias=True)
    (fc7): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (box_predictor): FastRCNNPredictor(
    (cls_score): Linear(in_features=1024, out_features=91, bias=True)
    (bbox_pred): Linear(in_features=1024, out_features=364, bias=True)
  )
)

In [20]:
# Report how the pretrained model pools region features.
print(
    'Box RoI Pool Parameters:',
    'featmap_names: {}'.format(model.roi_heads.box_roi_pool.featmap_names),
    'output_size: {}'.format(model.roi_heads.box_roi_pool.output_size),
    'sampling_ratio: {}'.format(model.roi_heads.box_roi_pool.sampling_ratio),
    sep='\n',
)

Box RoI Pool Parameters:
featmap_names: ['0', '1', '2', '3']
output_size: (7, 7)
sampling_ratio: 2


In [21]:
# Names of the feature maps returned by the pretrained backbone; the RoI pooler
# above selects from these by name.
backbone_out.keys()

odict_keys(['0', '1', '2', '3', 'pool'])

In [22]:
# Check what kind of object the custom backbone returns for a dummy batch.
# The probe tensor is created on whatever device the backbone currently lives
# on, so the cell still works if it is re-run after the backbone has been moved
# to the GPU. Gradients are not needed for a type check.
probe_device = next(ft_backbone.parameters()).device
with torch.no_grad():
    probe_features = ft_backbone(torch.rand((2, 3, 300, 300), device=probe_device))

type(probe_features)

torch.Tensor

In [23]:
# RoI pooling for the custom model. The custom backbone returns a single
# tensor rather than a dict of maps, so only one feature-map name is listed
# ('0' -- presumably the name the detector gives a lone tensor; confirm in the
# torchvision docs).
ft_roi_pooler = MultiScaleRoIAlign(
    featmap_names=['0'],
    output_size=4,
    sampling_ratio=1,
)

In [24]:
# Assemble the custom detector from the pieces defined above and place it on
# the selected device (`Module.to` returns the module, so the call is chained).
ft_model = FasterRCNN(
    backbone=ft_backbone,
    num_classes=2,
    min_size=ft_min_size,
    max_size=ft_max_size,
    image_mean=ft_mean,
    image_std=ft_std,
    rpn_anchor_generator=ft_anchor_generator,
    box_roi_pool=ft_roi_pooler,
).to(device)

In [25]:
# Run the custom detector on both sample images.
# The source tensors are cloned so they stay untouched.
first_input_image = first_image.clone()
second_input_image = second_image.clone()

inputs = [first_input_image.to(device), second_input_image.to(device)]

ft_model.eval()
# Prediction needs no gradients: disabling autograd avoids building a graph
# for the forward pass and saves memory.
with torch.no_grad():
    output = ft_model(inputs)

print(output)

[{'boxes': tensor([[2.6733e+02, 2.2384e+02, 3.5762e+02, 2.8285e+02],
        [2.6513e+02, 1.6276e+02, 3.5750e+02, 2.6657e+02],
        [1.9986e+02, 1.3859e+02, 3.4652e+02, 2.7779e+02],
        [2.7702e+02, 1.8531e+02, 3.5778e+02, 2.4385e+02],
        [2.7293e+02, 2.2097e+02, 2.9775e+02, 2.7886e+02],
        [3.1392e+02, 2.1970e+02, 3.3927e+02, 2.7543e+02],
        [4.4749e+01, 2.7552e+02, 2.7225e+02, 3.5855e+02],
        [2.5415e+02, 1.9821e+02, 3.2132e+02, 2.7473e+02],
        [1.3625e+02, 5.5600e+01, 2.7576e+02, 2.1266e+02],
        [2.3665e+02, 2.1160e+02, 3.5652e+02, 3.0696e+02],
        [2.1023e+02, 1.6492e+02, 2.7849e+02, 2.3933e+02],
        [2.3371e+02, 1.2491e+02, 3.5783e+02, 2.3790e+02],
        [3.0065e+02, 1.9469e+02, 3.5872e+02, 2.2561e+02],
        [3.0588e+02, 0.0000e+00, 3.5984e+02, 3.0096e+02],
        [1.6422e+02, 1.5799e+02, 2.3592e+02, 1.8227e+02],
        [2.9139e+02, 9.3688e-01, 3.5899e+02, 1.5399e+02],
        [1.1008e+02, 3.1326e+02, 2.1641e+02, 3.5856e+02],
   

In [26]:
# Ground truth for each image: a dict with its boxes and labels, copied and
# moved to the same device as the model.
first_target = {
    'boxes': first_box.clone().to(device),
    'labels': first_label.clone().to(device),
}
second_target = {
    'boxes': second_box.clone().to(device),
    'labels': second_label.clone().to(device),
}
targets = [first_target, second_target]

# Fresh copies of the images, one list entry per image.
first_input_image = first_image.clone()
second_input_image = second_image.clone()
inputs = [first_input_image.to(device), second_input_image.to(device)]

# In training mode the custom model is called with images and targets; the
# value of the last expression (the loss dictionary) is displayed as the cell
# output.
ft_model.train()
ft_model(inputs, targets)

{'loss_classifier': tensor(0.6533, grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.0425, grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.6880, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0066, grad_fn=<DivBackward0>)}