In [1]:
import cv2
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torchvision import models
import matplotlib.pyplot as plt

from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw
from torchvision.models.detection import FasterRCNN
from torchsummary import summary

In [2]:
# Select the compute device: the CPU when CUDA is unavailable, otherwise the
# GPU (reporting how many GPUs exist and the name of device 0).
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [3]:
# Load the pretrained MobileNetV2 onto the selected device and switch it to
# eval mode immediately. A freshly constructed module is in training mode, and
# in that mode every forward pass (including shape probes with dummy inputs)
# updates the BatchNorm running statistics, degrading the pretrained weights.
# Call `backbone.train()` explicitly before any fine-tuning.
backbone = models.mobilenet_v2(pretrained=True).to(device).eval()

In [4]:
# Print a layer-by-layer table of output shapes and parameter counts for a
# 3-channel 800x800 input.
summary(backbone, (3, 800, 800))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 400, 400]             864
       BatchNorm2d-2         [-1, 32, 400, 400]              64
             ReLU6-3         [-1, 32, 400, 400]               0
            Conv2d-4         [-1, 32, 400, 400]             288
       BatchNorm2d-5         [-1, 32, 400, 400]              64
             ReLU6-6         [-1, 32, 400, 400]               0
            Conv2d-7         [-1, 16, 400, 400]             512
       BatchNorm2d-8         [-1, 16, 400, 400]              32
  InvertedResidual-9         [-1, 16, 400, 400]               0
           Conv2d-10         [-1, 96, 400, 400]           1,536
      BatchNorm2d-11         [-1, 96, 400, 400]             192
            ReLU6-12         [-1, 96, 400, 400]               0
           Conv2d-13         [-1, 96, 200, 200]             864
      BatchNorm2d-14         [-1, 96, 2

In [5]:
# Unpack the backbone's feature stages into a plain list so they can be
# iterated and inspected one by one, then show how many there are and what
# they contain.
features = [*backbone.features]
print(len(features), features, sep="\n")

19
[ConvNormActivation(
  (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU6(inplace=True)
), InvertedResidual(
  (conv): Sequential(
    (0): ConvNormActivation(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
), InvertedResidual(
  (conv): Sequential(
    (0): ConvNormActivation(
      (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): ConvNormActivation(
      (0): C

In [7]:
# Trace a dummy 800x800 image through every backbone stage to find the shape
# of the final feature map. All stages are kept: the size cut-off that once
# stopped collection early is disabled, so `req_features` mirrors `features`.

dummy_img = torch.zeros((1, 3, 800, 800)).float()  # test image array
print(dummy_img.shape)

req_features = []
output = dummy_img.clone().to(device)
out_channels = output.size()  # fallback so the prints below work if `features` is empty

# This pass is only a shape probe, so skip autograd bookkeeping and run each
# stage in eval mode: in training mode BatchNorm would fold the all-zero dummy
# image into the pretrained running statistics.
with torch.no_grad():
    for feature in features:
        # Remember the mode of every submodule so it can be restored exactly.
        saved_modes = [(m, m.training) for m in feature.modules()]
        feature.eval()
        try:
            output = feature(output)
        finally:
            for m, mode in saved_modes:
                m.training = mode
        req_features.append(feature)
        # Full output size in (N, C, H, W) order; the name is kept because the
        # channel count is read from index 1 below.
        out_channels = output.size()

print(len(req_features))
print(out_channels)
print(out_channels[1])

torch.Size([1, 3, 800, 800])
19
torch.Size([1, 1280, 25, 25])
1280


In [8]:
# Convert this list into a Sequential module: the backbone stages collected in
# `req_features` are chained, in order, into a single feature extractor. The
# modules are not copied, so the extractor shares its weights with `backbone`.

faster_rcnn_feature_extractor = nn.Sequential(*req_features)

In [None]:
# Run a real image through the feature extractor and report the shape of the
# resulting feature map.
# NOTE(review): `img` is not defined in any cell visible here — it must be
# loaded (presumably as a PIL image or HxWxC array) before this cell runs.

transform = transforms.Compose([transforms.ToTensor()])
imgTensor = transform(img).unsqueeze(0).to(device)  # add a leading batch dimension
output_map = faster_rcnn_feature_extractor(imgTensor)

print(output_map.shape)