In [94]:
import torch
import torch.nn as nn
from torchvision.models.densenet import DenseNet121_Weights, densenet121
from torchvision.models.mobilenetv2 import mobilenet_v2,MobileNet_V2_Weights

## Visualising MobileNetV2 architecture

In [125]:
print(mobilenet_v2(MobileNet_V2_Weights).features)

Sequential(
  (0): Conv2dNormActivation(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU6(inplace=True)
  )
  (1): InvertedResidual(
    (conv): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (2): InvertedResidual(
    (conv): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (

## Original densenet backbone

In [96]:
backbone_dense = densenet121(weights=DenseNet121_Weights.IMAGENET1K_V1).features

In [97]:
# Pass the input through each part of the model
backbones = nn.ModuleList([
            backbone_dense[:4],
            backbone_dense.denseblock1,
            nn.Sequential(
                backbone_dense.transition1,
                backbone_dense.denseblock2,
            ),
            nn.Sequential(
                backbone_dense.transition2,
                backbone_dense.denseblock3,
            ),
            nn.Sequential(
                backbone_dense.transition3,
                backbone_dense.denseblock4,
            )
        ])
output_part_1 = backbones[0](torch.rand(1,3,224,224))  # Output of backbone[:4]
output_part_2 = backbones[1](output_part_1)  # Output of backbone.denseblock1
output_part_3 = backbones[2](output_part_2)  # Output of transition1 + denseblock2
output_part_4 = backbones[3](output_part_3)  # Output of transition2 + denseblock3
output_part_5 = backbones[4](output_part_4)  # Output of transition3 + denseblock4

# Print the sizes of the outputs
print("Output Size - Part 1:", output_part_1.size())
print("Output Size - Part 2:", output_part_2.size())
print("Output Size - Part 3:", output_part_3.size())
print("Output Size - Part 4:", output_part_4.size())
print("Output Size - Part 5:", output_part_5.size())

Output Size - Part 1: torch.Size([1, 64, 56, 56])
Output Size - Part 2: torch.Size([1, 256, 56, 56])
Output Size - Part 3: torch.Size([1, 512, 28, 28])
Output Size - Part 4: torch.Size([1, 1024, 14, 14])
Output Size - Part 5: torch.Size([1, 1024, 7, 7])


## Modification made to backbone to use MobilenetV2 instead (The one that is included into retinanet.py)

In [98]:
backbone_mobile = mobilenet_v2(MobileNet_V2_Weights).features

In [99]:
# Pass the input through each part of the model
backbones_mobile = nn.ModuleList([
            nn.Sequential(
                backbone_mobile[:3],
                nn.Conv2d(24, 64, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
                nn.BatchNorm2d(64),
                nn.ReLU6(inplace=True)
            ),
            nn.Sequential(
                nn.Conv2d(64, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
                backbone_mobile[3:8],
                nn.ConvTranspose2d(64, 256, kernel_size=4, stride=4, padding=0, bias=False)
            ),
            nn.Sequential(
                nn.Conv2d(256, 64, kernel_size=1, stride=1, padding=0),
                backbone_mobile[8:11],
                nn.MaxPool2d(kernel_size=2, stride=2),
                nn.Conv2d(64, 512, kernel_size=1, stride=1, padding=0),
            ),
            nn.Sequential(
                nn.Conv2d(512, 64, kernel_size=3, stride=2, padding=1),  # Reduce spatial dimensions to 14x14
                nn.ReLU(inplace=True),
                backbone_mobile[11:15],
                nn.ConvTranspose2d(160, 512, kernel_size=2, stride=2),  # Upsample to 14x14
                nn.ReLU(inplace=True),
                nn.Conv2d(512, 1024, kernel_size=1, stride=1, padding=0),  # 1x1 convolution to change channel size
                nn.ReLU(inplace=True)
            ),
            nn.Sequential(
                nn.Conv2d(1024, 160, kernel_size=2, stride=2),  # Reduce spatial dimensions to 7x7
                nn.ReLU(inplace=True),
                backbone_mobile[15:],
                nn.Conv2d(1280, 1024, kernel_size=1, stride=1, padding=0)
                
            ) 
        ])
output_part_1 = backbones_mobile[0](torch.rand(1,3,224,224))  
output_part_2 = backbones_mobile[1](output_part_1)  
output_part_3 = backbones_mobile[2](output_part_2)  
output_part_4 = backbones_mobile[3](output_part_3)  
output_part_5 = backbones_mobile[4](output_part_4)  

# Print the sizes of the outputs
print("Output Size - Part 1:", output_part_1.size())
print("Output Size - Part 2:", output_part_2.size())
print("Output Size - Part 3:", output_part_3.size())
print("Output Size - Part 4:", output_part_4.size())
print("Output Size - Part 5:", output_part_5.size())

Output Size - Part 1: torch.Size([1, 64, 56, 56])
Output Size - Part 2: torch.Size([1, 256, 56, 56])
Output Size - Part 3: torch.Size([1, 512, 28, 28])
Output Size - Part 4: torch.Size([1, 1024, 14, 14])
Output Size - Part 5: torch.Size([1, 1024, 7, 7])


## original architecture of mobilenetv2 without changes

In [100]:
backbones_mobile = nn.ModuleList([
            nn.Sequential(
                backbone_mobile[:3],
            ),
            nn.Sequential(
                backbone_mobile[3:8],
            ),
            nn.Sequential(
                backbone_mobile[8:11],
            ),
            nn.Sequential(
                backbone_mobile[11:15],
            ),
            nn.Sequential(
                backbone_mobile[15:],
            ) 
        ])
output_part_1 = backbones_mobile[0](torch.rand(1,3,224,224))  # Output of backbone[:4]
output_part_2 = backbones_mobile[1](output_part_1)  # Output of backbone.denseblock1
output_part_3 = backbones_mobile[2](output_part_2)  # Output of transition1 + denseblock2
output_part_4 = backbones_mobile[3](output_part_3)  # Output of transition2 + denseblock3
output_part_5 = backbones_mobile[4](output_part_4)  # Output of transition3 + denseblock4

# Print the sizes of the outputs
print("Output Size - Part 1:", output_part_1.size())
print("Output Size - Part 2:", output_part_2.size())
print("Output Size - Part 3:", output_part_3.size())
print("Output Size - Part 4:", output_part_4.size())
print("Output Size - Part 5:", output_part_5.size())

Output Size - Part 1: torch.Size([1, 24, 56, 56])
Output Size - Part 2: torch.Size([1, 64, 14, 14])
Output Size - Part 3: torch.Size([1, 64, 14, 14])
Output Size - Part 4: torch.Size([1, 160, 7, 7])
Output Size - Part 5: torch.Size([1, 1280, 7, 7])


In [122]:
import torch
import torch.nn as nn
from torchvision.models.densenet import DenseNet121_Weights, densenet121
from torchvision.models.mobilenetv2 import mobilenet_v2,MobileNet_V2_Weights
from prior_box import PriorBox

class RetinaNet(nn.Module):

    def __init__(self, config, pretrained=True):
        super().__init__()

        # Feature Pyramid Network (FPN) with four feature maps of resolutions
        # 1/4, 1/8, 1/16, 1/32 and `num_filters` filters for all feature maps.
        num_anchors = config.get('num_anchors', 6)
        num_filters_fpn = config.get('num_filters_fpn', 128)
        self.num_classes = config['num_classes']
        fmaps = [56, 56, 28, 14, 7]
        self.size = config["img_size"]
        self.priorbox = PriorBox(self.size, feature_maps=fmaps)
        self.num_anchors = num_anchors
        self.fpn = FPN(out_channels=num_anchors * (4 + self.num_classes),backbone='mobilenet_v2')

        with torch.no_grad():
            self.priors = self.priorbox.forward()
            if torch.cuda.is_available():
                self.priors = self.priors.cuda()

    def forward(self, x):
        maps = self.fpn(x)
        loc = list()
        conf = list()
        for map in maps:
            loc.append(map[:, :self.num_anchors * 4].permute(0, 2, 3, 1).contiguous())
            conf.append(map[:, self.num_anchors * 4:].permute(0, 2, 3, 1).contiguous())

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        output = (
            loc.view(loc.size(0), -1, 4),
            conf.view(conf.size(0), -1, self.num_classes),
            self.priors
        )
        return output


class FPN(nn.Module):

    def __init__(self, out_channels, backbone="densenet"):

        super().__init__()
        self.upsample = nn.UpsamplingNearest2d(scale_factor=2)
        self.backbone = backbone
        if backbone == "densenet":
            backbone = densenet121(weights=DenseNet121_Weights.IMAGENET1K_V1).features
            self.backbones = nn.ModuleList([
                backbone[:4],
                backbone.denseblock1,
                nn.Sequential(
                    backbone.transition1,
                    backbone.denseblock2,
                ),
                nn.Sequential(
                    backbone.transition2,
                    backbone.denseblock3,
                ),
                nn.Sequential(
                    backbone.transition3,
                    backbone.denseblock4,
                )
            ])
            self.enc0_channel = 64
            self.enc1_channel = 256
            self.enc2_channel = 512
            self.enc3_channel = 1024
            self.enc4_channel = 1024

        elif backbone == "mobilenet_v2":
            backbone_mobile = mobilenet_v2(MobileNet_V2_Weights).features
            self.backbones = nn.ModuleList([
                nn.Sequential(
                    backbone_mobile[:3], # out channels: 24
                ),
                nn.Sequential(
                    backbone_mobile[3:4], # out channels: 24
                ),
                nn.Sequential(
                    backbone_mobile[4:7], # out channels: 32
                ),
                nn.Sequential(
                    backbone_mobile[7:14], # out channels: 96
                ),
                nn.Sequential(
                    backbone_mobile[14:], # out channels: 1280
                )
            ])

            self.transform_enc4_to_enc3 = nn.Sequential(
                torch.nn.Conv2d(1280, 96, kernel_size=1, stride=1, padding=0),
                torch.nn.ReLU6(inplace=True)
            )

            self.enc0_channel = 24
            self.enc1_channel = 24
            self.enc2_channel = 32
            self.enc3_channel = 96
            self.enc4_channel = 1280

        else:
            raise f"{backbone} not implemented."

        self.up1 = nn.Sequential(
            nn.Conv2d(self.enc3_channel, self.enc2_channel, kernel_size=3, bias=False, padding=1),
            nn.BatchNorm2d(self.enc2_channel),
            nn.ReLU(inplace=True),
        )
        self.up2 = nn.Sequential(
            nn.Conv2d(self.enc2_channel, self.enc1_channel, kernel_size=3, bias=False, padding=1),
            nn.BatchNorm2d(self.enc1_channel),
            nn.ReLU(inplace=True),
        )
        self.up3 = nn.Sequential(
            nn.Conv2d(self.enc1_channel, self.enc0_channel, kernel_size=3, bias=False, padding=1),
            nn.BatchNorm2d(self.enc0_channel),
            nn.ReLU(inplace=True),
        )
        self.up4 = nn.Sequential(
            nn.Conv2d(self.enc0_channel + self.enc0_channel, self.enc0_channel, kernel_size=3, bias=False, padding=1),
            nn.BatchNorm2d(self.enc0_channel),
            nn.ReLU(inplace=True),
        )

        self.conv0 = nn.Sequential(
            nn.Conv2d(self.enc4_channel, out_channels, kernel_size=1),
        )
        self.conv1 = nn.Sequential(
            nn.Conv2d(self.enc2_channel, out_channels, kernel_size=1),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(self.enc1_channel, out_channels, kernel_size=1),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(self.enc0_channel, out_channels, kernel_size=1),
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(self.enc0_channel, out_channels, kernel_size=1),
        )

    def forward(self, x):
        # Bottom-up pathway, from ResNet
        enc0 = self.backbones[0](x)     # bs, channel_enc0, 56, 56
        print(enc0.size())
        enc1 = self.backbones[1](enc0)  # bs, channel_enc1, 56, 56
        print(enc1.size())
        enc2 = self.backbones[2](enc1)  # bs, channel_enc2, 28, 28
        print(enc2.size())
        enc3 = self.backbones[3](enc2)  # bs, channel_enc3, 14, 14
        print(enc3.size())
        enc4 = self.backbones[4](enc3)  # bs, channel_enc4, 7, 7
        print(enc4.size())

        up1 = self.upsample(enc4)  # bs, channel_enc4, 14, 14
        print("HI")
        print(up1.size())
        print(enc3.size())
        if up1.size(1) != enc3.size(1):
            # transform up1's channel size when channel_enc3 != channel_enc4
            up1 = self.transform_enc4_to_enc3(up1) + enc3
        else:
            up1 = up1 + enc3

        print(up1.size())
        up1 = self.up1(up1)  # bs, channel_enc2, 14, 14
        print("up1 =",up1.size())
        up2 = self.upsample(up1)  # bs, channel_enc2, 28, 28
        up2 = up2 + enc2
        up2 = self.up2(up2)  # bs, channel_enc1, 28, 28

        up3 = self.upsample(up2)  # bs, channel_enc1, 56, 56
        up3 = up3 + enc1
        up3 = self.up3(up3)  # bs, channel_enc1, 56, 56

        up4 = torch.cat([up3, enc0], 1)  # bs, channel_enc1 + channel_enc0, 56, 56
        up4 = self.up4(up4)

        map1 = self.conv0(enc4)
        map2 = self.conv1(up1)
        map3 = self.conv2(up2)
        map4 = self.conv3(up3)
        map5 = self.conv4(up4)
        # for i in [map1, map2, map3, map4, map5]:
        #     print(i.size())
        return map1, map2, map3, map4, map5


def build_retinanet(config):
    return nn.DataParallel(RetinaNet(config))

In [123]:
model = FPN(out_channels=6 * (4 + 2), backbone="mobilenet_v2")
test_image = torch.rand(1,3,224,224)
output_maps = model(test_image)
for each_map in output_maps:
    print(each_map.size())



torch.Size([1, 24, 56, 56])
torch.Size([1, 24, 56, 56])
torch.Size([1, 32, 28, 28])
torch.Size([1, 96, 14, 14])
torch.Size([1, 1280, 7, 7])
HI
torch.Size([1, 1280, 14, 14])
torch.Size([1, 96, 14, 14])
torch.Size([1, 96, 14, 14])
up1 = torch.Size([1, 32, 14, 14])
torch.Size([1, 36, 7, 7])
torch.Size([1, 36, 14, 14])
torch.Size([1, 36, 28, 28])
torch.Size([1, 36, 56, 56])
torch.Size([1, 36, 56, 56])


In [124]:
model = FPN(out_channels=6 * (4 + 2),backbone="densenet")
test_image = torch.rand(1,3,224,224)
output_maps = model(test_image)
for each_map in output_maps:
    print(each_map.size())

torch.Size([1, 64, 56, 56])
torch.Size([1, 256, 56, 56])
torch.Size([1, 512, 28, 28])
torch.Size([1, 1024, 14, 14])
torch.Size([1, 1024, 7, 7])
HI
torch.Size([1, 1024, 14, 14])
torch.Size([1, 1024, 14, 14])
torch.Size([1, 1024, 14, 14])
up1 = torch.Size([1, 512, 14, 14])
torch.Size([1, 36, 7, 7])
torch.Size([1, 36, 14, 14])
torch.Size([1, 36, 28, 28])
torch.Size([1, 36, 56, 56])
torch.Size([1, 36, 56, 56])
