# Residual Networks (ResNet)

Adding more layers to a deep learning model should intuitively improve its performance, but practical observations do not match this intuition. One issue with deeper models is vanishing/exploding gradients, which hamper convergence. This can be addressed by normalized initialization and intermediate normalization layers (BatchNorm), which enable the network to start converging under SGD with backpropagation. Even with normalization, accuracy saturates as network depth increases and then degrades rapidly. Unexpectedly, this is not due to overfitting: the deeper network also shows a higher training error.

This is referred to as the degradation problem. The authors address it with a deep residual learning framework. With skip connections, the weights of multiple non-linear layers can be driven toward zero so that a block approaches an identity mapping.

In [1]:
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

In [2]:
# run on the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
def count_parameters(model) -> None:
    """
    Prints the number of trainable parameters of `model`.

    Only parameters with requires_grad=True are counted; nothing is returned.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    print(f'Number of parameters: {trainable}')

# Define configs for different ResNet versions

Here, model_parameters['resnet50'] = ([64,128,256,512], [3,4,6,3], 4, True) represents the parameters for ResNet-50, where
- [64,128,256,512] -> channels in each intermediate block
- [3,4,6,3] -> number of repetitions of the Bottleneck in each block
- 4 -> expansion factor. Note that 64 expands to 256, 128 to 512, and so on. All bottleneck layers of a given ResNet use the same expansion factor
- True -> whether to build Bottleneck layers. True only for ResNet-50+

In [4]:
# each entry: (channels per block, repetitions per block, bottleneck expansion, use bottleneck layer)
model_parameters = {
    'resnet18': ([64, 128, 256, 512], [2, 2, 2, 2], 1, False),
    'resnet34': ([64, 128, 256, 512], [3, 4, 6, 3], 1, False),
    'resnet50': ([64, 128, 256, 512], [3, 4, 6, 3], 4, True),
    'resnet101': ([64, 128, 256, 512], [3, 4, 23, 3], 4, True),
    'resnet152': ([64, 128, 256, 512], [3, 8, 36, 3], 4, True),
}

# Define Bottleneck and Basic Blocks

Bottlenecks are the building units of the ResNet architecture. A bottleneck layer is a layer that contains fewer nodes than the previous layers. It can be used to obtain a representation of the input with reduced dimensionality.

- A bottleneck consists of (conv1x1->BN->relu) -> (conv3x3->BN->relu) -> (conv1x1->BN) -> relu
- Bottleneck is used to reduce the computation cost for ResNet-50, 101, and 152. For ResNet-18/34, BasicBlocks are used instead of bottlenecks

In [5]:
class BottleneckBlock(nn.Module):
    """
    Residual branch built as conv1x1 -> conv3x3 -> conv1x1 (used for ResNet-50+).

    The first two convolutions are each followed by BN and ReLU; the last one is
    followed by BN only. No skip connection is added inside this module.

    Args:
        in_channels: channels of the input feature map.
        mid_channels: channels produced by the first 1x1 and the 3x3 convolution.
        expansion: the last 1x1 convolution outputs mid_channels*expansion channels.
        stride: stride of the 3x3 convolution.
    """

    def __init__(self, in_channels, mid_channels, expansion, stride=1) -> None:
        super().__init__()
        out_channels = mid_channels * expansion

        # conv1x1 -> BN -> relu
        squeeze = [
            nn.Conv2d(in_channels, mid_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        ]
        # conv3x3 -> BN -> relu (the only strided convolution)
        spatial = [
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        ]
        # conv1x1 -> BN
        expand = [
            nn.Conv2d(mid_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        self.bottleneck = nn.Sequential(*squeeze, *spatial, *expand)

    def forward(self, x):
        return self.bottleneck(x)


In [6]:
class BasicBlock(nn.Module):
    """
    Residual branch built from two 3x3 convolutions (used for ResNet-18/34).

    The first convolution is followed by BN and ReLU, the second by BN only.
    No skip connection is added inside this module.

    Args:
        in_channels: channels of the input feature map.
        mid_channels: channels produced by both convolutions.
        stride: stride of the first 3x3 convolution.
    """

    def __init__(self, in_channels, mid_channels, stride=1) -> None:
        super().__init__()

        # conv3x3 -> BN -> relu (the only strided convolution)
        first = [
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
        ]
        # conv3x3 -> BN
        second = [
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
        ]
        self.basic_block = nn.Sequential(*first, *second)

    def forward(self, x):
        return self.basic_block(x)


# Identity vs Downsampled

If **x** and the feature map have the same size (including the number of channels), **x** can be added to the feature map directly. If they do not match, we apply a downsample mapping: a 1x1 convolution projects **x** to the dimensions of the feature map before the addition.

Doubling the number of filters in a ResidualBlock comes with halving the spatial dimensions of the feature map. This reduction is done with stride=2 in the conv3x3 of the ResidualBlock (and in the 1x1 projection of the shortcut) instead of with max pooling.

In [7]:
class ResidualBlock(nn.Module):
    """
    Creates a residual layer for ResNet.

    Wraps a BottleneckBlock or a BasicBlock (the residual branch) and adds the
    shortcut branch to its output, followed by a ReLU.

    Args:
        in_channels: channels of the input feature map.
        mid_channels: base width of the wrapped block; the shortcut projection
            outputs mid_channels*expansion channels.
        expansion: channel multiplier of the block output.
        is_bottleneck: build a BottleneckBlock when True, a BasicBlock otherwise.
            NOTE: BasicBlock is built without `expansion`, so is_bottleneck=False
            is only shape-consistent with expansion=1.
        stride: stride of the wrapped block and of the shortcut projection.
        dropout: Dropout2d probability applied to the residual branch before the
            addition; no dropout layer is created when it is not > 0.
    """

    def __init__(self, in_channels, mid_channels, expansion=4, is_bottleneck=True, stride=1,
                 dropout=0.1) -> None:
        super().__init__()
        # is_bottleneck=True for all ResNet-50+
        if is_bottleneck:
            self.block = BottleneckBlock(in_channels, mid_channels, expansion, stride)
        else:
            self.block = BasicBlock(in_channels, mid_channels, stride)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0.0 else None

        # if dim(x) == dim(F) -> identity shortcut.
        # The residual branch changes the spatial size whenever stride != 1, so
        # matching channel counts alone are not enough: a strided block needs the
        # projection too, otherwise `x + out` fails on mismatched height/width.
        self.identity = (in_channels == mid_channels * expansion) and stride == 1
        if not self.identity:
            self.downsample = nn.Sequential(
                # only conv -> BN and no relu
                nn.Conv2d(in_channels, mid_channels * expansion, kernel_size=1, stride=stride, padding=0,
                          bias=False),
                nn.BatchNorm2d(mid_channels * expansion),
            )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.block(x)
        if self.dropout is not None:
            out = self.dropout(out)
        # residual connection -- identity or projected map
        shortcut = x if self.identity else self.downsample(x)

        return self.relu(shortcut + out)


In [8]:
# smoke test: strided bottleneck ResidualBlock on a random 1x64x112x112 input
img = torch.randn(1, 64, 112, 112).to(device)
model = ResidualBlock(
    in_channels=64, mid_channels=64, expansion=4, is_bottleneck=True, stride=2
).to(device)
count_parameters(model)
print(model(img).shape)

Number of parameters: 75008
torch.Size([1, 256, 56, 56])


# Building the ResNet

The input RGB image is passed through a 7x7 Conv2d with stride=2, number of filters=64, and padding=3, followed by max pooling to reduce the feature map size by half. In the rest of the architecture, the feature map size is reduced by 3x3 convolutions with stride=2; max pooling is not used again.

Depending on the ResNet variant, four Sequential groups of ResidualBlocks are created, built from either BottleneckBlocks or BasicBlocks. All four groups reduce the feature map size using stride=2 except for block 1, where stride=1. This exception is because the 56x56 input remains the same throughout block 1.

After the four blocks, Average Pooling reduces the feature map to 1x1, followed by a Fully Connected Layer connecting the flattened feature map with the output classes.

In [9]:
class ResNet(nn.Module):
    """
    Builds a ResNet from a variant configuration.

    Args:
        resnet_type: indexable configuration read as
            (channels, repetitions, expansion, is_bottleneck), e.g. an entry of
            `model_parameters`.
        in_channels: channels of the input image.
        num_classes: number of outputs of the final linear layer.
        dropout: dropout probability forwarded to every ResidualBlock.
    """

    def __init__(self, resnet_type, in_channels, num_classes, dropout=0.1) -> None:
        super().__init__()
        # unpack the channel and repetition lists along with expansion factor and block kind
        channels, repetitions = resnet_type[0], resnet_type[1]
        expansion, is_bottleneck = resnet_type[2], resnet_type[3]

        # stem: conv7x7 (stride 2) -> BN -> relu -> maxpool (stride 2)
        stem = [
            nn.Conv2d(in_channels, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.conv_in = nn.Sequential(*stem)

        # block 1 keeps the resolution (stride=1); blocks 2-4 start with a stride-2 layer
        # and take as input the expanded output width of the block before them
        self.block1 = self.make_blocks(
            64, channels[0], repetitions[0], expansion, is_bottleneck, stride=1, dropout=dropout
        )
        self.block2 = self.make_blocks(
            channels[0] * expansion, channels[1], repetitions[1], expansion, is_bottleneck,
            stride=2, dropout=dropout
        )
        self.block3 = self.make_blocks(
            channels[1] * expansion, channels[2], repetitions[2], expansion, is_bottleneck,
            stride=2, dropout=dropout
        )
        self.block4 = self.make_blocks(
            channels[2] * expansion, channels[3], repetitions[3], expansion, is_bottleneck,
            stride=2, dropout=dropout
        )

        # head: global average pooling -> linear classifier
        self.average_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc_out = nn.Linear(channels[3] * expansion, num_classes)

        # He initialization for convolutions; BatchNorm starts as weight=1, bias=0
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def make_blocks(self, in_channels, mid_channels, repetitions, expansion, is_bottleneck, stride,
                    dropout):
        """
        Returns an nn.Sequential of `repetitions` ResidualBlocks.

        Only the first block receives `in_channels` and `stride`; the following
        ones take mid_channels*expansion input channels and use stride=1.
        """
        first = ResidualBlock(in_channels, mid_channels, expansion, is_bottleneck, stride, dropout)
        followers = [
            ResidualBlock(
                mid_channels * expansion, mid_channels, expansion, is_bottleneck, stride=1, dropout=dropout
            )
            for _ in range(repetitions - 1)
        ]

        return nn.Sequential(first, *followers)

    def forward(self, x):
        features = self.conv_in(x)
        for stage in (self.block1, self.block2, self.block3, self.block4):
            features = stage(features)
        pooled = self.average_pool(features)

        # softmax (if needed) is applied externally
        return self.fc_out(torch.flatten(pooled, start_dim=1))


In [10]:
# sanity check: build ResNet-50 and push a random 1x3x224x224 tensor through it
img = torch.randn(1, 3, 224, 224).to(device)
model = ResNet(
    model_parameters['resnet50'],
    in_channels=3,
    num_classes=1000,
    dropout=0.0,
).to(device)
count_parameters(model)
print(model(img).shape)
model

Number of parameters: 25557032
torch.Size([1, 1000])


ResNet(
  (conv_in): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (block1): Sequential(
    (0): ResidualBlock(
      (block): BottleneckBlock(
        (bottleneck): Sequential(
          (0): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
          (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (5): ReLU(inplace=True)
          (6): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (7): BatchNorm2d(256, eps=1e-05, momentum=0.1,

In [11]:
# Reference check: load torchvision's ResNet-50 and print its trainable-parameter
# count and its output shape for the same `img`, to compare with the model built above.
import torchvision.models as models
from torchvision.models import ResNet50_Weights

# IMAGENET1K_V1 weights are fetched into the local torch hub cache when not present
tvis_model= models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).to(device)
count_parameters(tvis_model)
# NOTE(review): the forward pass runs without .eval() / torch.no_grad(); fine for a
# shape check, but confirm before reusing tvis_model for actual inference.
print(tvis_model(img).shape)
tvis_model

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 190MB/s]


Number of parameters: 25557032
torch.Size([1, 1000])


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
# https://medium.com/@karuneshu21/resnet-paper-walkthrough-b7f3bdba55f0
# https://medium.com/@karuneshu21/how-to-resnet-in-pytorch-9acb01f36cf5