VGG Paper: [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556)

Source code for [torchvision.models.vgg](https://docs.pytorch.org/vision/0.8/_modules/torchvision/models/vgg.html)

vgg16: https://download.pytorch.org/models/vgg16-397923af.pth

vgg16_bn: https://download.pytorch.org/models/vgg16_bn-6c64b313.pth


# Import Dependencies

In [1]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [115]:
import torch
import torch.nn as nn
from torch.nn.modules import linear

# from torch.utils.data import Dataset, DataLoader
# from torchvision import transforms

from torchinfo import summary
# from torchsummary import summary

import math
from collections import OrderedDict

# import os
# import glob
# import cv2
# from PIL import Image
# import numpy as np
# import matplotlib.pyplot as plt
# from tqdm import tqdm

# VGG Models (Dynamic)

In [73]:
# class ModelClassName(nn.Module):
#   def __init__(self):
#     super(ModelClassName, self).__init__()
#     pass
#   def forward(self, x):
#     pass

In [3]:
def _vgg_stage(block, out_channels, num_convs, in_channels=None):
    """Return the layer configs for one VGG stage.

    A stage is ``num_convs`` 3x3 / stride-1 / padding-1 conv entries named
    ``block{block}-conv{i}`` followed by one 2x2 / stride-2 max-pool entry
    named ``block{block}-maxpool``. When ``in_channels`` is given it is
    recorded on the first conv entry only.
    """
    stage = []
    for position in range(1, num_convs + 1):
        conv_cfg = {"type": "conv2d", "name": f"block{block}-conv{position}"}
        if position == 1 and in_channels is not None:
            conv_cfg["in_channels"] = in_channels
        conv_cfg.update(
            {"out_channels": out_channels, "kernel_size": 3, "stride": 1, "padding": 1}
        )
        stage.append(conv_cfg)
    stage.append(
        {"type": "maxpool2d", "name": f"block{block}-maxpool", "kernel_size": 2, "stride": 2}
    )
    return stage


# VGG16 layer table: five conv stages (2, 2, 3, 3, 3 convs with 64, 128, 256,
# 512, 512 output channels), then flatten and three linear layers.
vgg16_architecture = [
    *_vgg_stage(1, 64, 2, in_channels=3),
    *_vgg_stage(2, 128, 2),
    *_vgg_stage(3, 256, 3),
    *_vgg_stage(4, 512, 3),
    *_vgg_stage(5, 512, 3),

    # Fully connected layers
    {"type": "flatten"},
    {"type": "linear", "name": "fc1", "out_features": 4096, "activation": "relu"},
    {"type": "linear", "name": "fc2", "out_features": 4096, "activation": "relu"},
    {"type": "linear", "name": "fc3", "out_features": 1000, "activation": "softmax"},
]


In [4]:
class VGGConvBlock(nn.Module):
  """Convolution/pooling stack assembled from a list of layer-config dicts.

  Each ``conv2d`` entry expands to ``Conv2d -> BatchNorm2d -> ReLU`` and each
  ``maxpool2d`` entry to a ``MaxPool2d``. The ``type`` value is matched
  case-insensitively; entries of any other type (or without a ``type``) are
  skipped, so a full architecture list may be passed in.

  Args:
    architecture: iterable of dicts. ``conv2d`` entries need ``out_channels``,
      ``kernel_size``, ``stride`` and ``padding``; ``maxpool2d`` entries need
      ``kernel_size`` and ``stride``. An optional ``name`` becomes the module
      name inside the ``nn.Sequential``; otherwise ``conv2d{n}`` /
      ``maxpool2d{n}`` is used.
    in_channels: channel count of the input tensor.

  Raises:
    ValueError: a ``conv2d`` entry declares an ``in_channels`` that differs
      from the channel count produced by the preceding layers.
    KeyError: a required key is missing from a conv/pool entry.
  """

  def __init__(self, architecture, in_channels=3):
    super(VGGConvBlock, self).__init__()
    self.in_channels = in_channels
    self.conv_layers = self.create_conv_layers(architecture)

  def forward(self, x):
    """Apply the conv/pool stack to ``x`` and return the result."""
    return self.conv_layers(x)

  def create_conv_layers(self, architecture):
    """Build the ``nn.Sequential`` described by ``architecture``."""
    layers = OrderedDict()
    in_channels = self.in_channels  # running channel count fed to the next conv

    # Counters are only used to generate fallback names for unnamed entries.
    conv_idx = 1
    pool_idx = 1

    for cfg in architecture:
      layer_type = cfg.get("type", "").lower()

      if layer_type == "conv2d":
        if (cfg.get("in_channels") is not None) and (cfg["in_channels"] != in_channels):
          raise ValueError(
            f"in_channels mismatch: expected {in_channels}, got {cfg['in_channels']}"
          )

        base_name = cfg.get("name", f"conv2d{conv_idx}")
        layers[f"{base_name}"] = nn.Conv2d(in_channels=in_channels, out_channels=cfg["out_channels"],
                                           kernel_size=cfg["kernel_size"], stride=cfg["stride"], padding=cfg["padding"])
        layers[f"{base_name}_batchnorm2d"] = nn.BatchNorm2d(num_features=cfg["out_channels"])
        layers[f"{base_name}_relu"] = nn.ReLU(inplace=True)

        in_channels = cfg["out_channels"]
        conv_idx += 1

      # Compare the normalized type here too: testing the raw cfg["type"]
      # dropped mixed-case pool entries and raised KeyError on a missing type.
      elif layer_type == "maxpool2d":
        base_name = cfg.get("name", f"maxpool2d{pool_idx}")
        layers[f"{base_name}"] = nn.MaxPool2d(kernel_size=cfg["kernel_size"], stride=cfg["stride"])
        pool_idx += 1

    return nn.Sequential(layers)


In [76]:
# model = VGGConvBlock(architecture=vgg16_architecture, in_channels=3)
# # print(model)
# summary(model, (1, 3, 224, 224)) # for torchinfo -> summary
# # summary(model, (3, 224, 224)) # for torchsummary -> summary

# # for name, param in model.named_parameters():
# #     print(name)

In [5]:
class VGG(nn.Module):
  """Config-driven VGG: a conv/pool feature extractor plus a linear head.

  The architecture list is split at the first ``flatten`` entry. Everything
  before it is handed to ``VGGConvBlock``; the ``flatten`` entry and
  everything after it become ``fc_layers``. The input width of the first
  linear layer is derived from ``in_channels`` and ``input_size`` by
  ``compute_fc_input_size``, so the model only accepts inputs of that
  spatial size.

  Args:
    architecture: list of layer-config dicts (``conv2d``, ``maxpool2d``,
      ``flatten``, ``linear``). ``type`` is matched case-insensitively.
    in_channels: channel count of the input images.
    input_size: ``(H, W)`` of the input images.

  Raises:
    ValueError: ``input_size`` is too small for the conv/pool stack (the
      feature map would shrink to zero or below).
  """

  def __init__(self, architecture, in_channels=3, input_size=(224, 224)):
    super(VGG, self).__init__()
    self.in_channels = in_channels
    self.input_size = input_size  # (H, W)

    # Kept for reference only; the model is built from the two splits below.
    self.full_architecture = architecture

    # Split at the first "flatten" entry. NOTE: if the list has no "flatten"
    # entry, every config goes to the conv part and fc_layers stays empty.
    conv_architecture = []
    fc_architecture = []
    in_fc_block = False
    for cfg in architecture:
      if cfg.get("type", "").lower() == "flatten":
        in_fc_block = True

      if in_fc_block:
        fc_architecture.append(cfg)
      else:
        conv_architecture.append(cfg)

    self.conv_blocks = VGGConvBlock(architecture=conv_architecture, in_channels=in_channels)

    # The conv configs are passed along so the first linear layer can be sized.
    self.fc_layers = self.create_fc_layers(fc_architecture, conv_architecture)

  def forward(self, x):
    """Return the head output for a batch ``x`` fed through conv and FC parts."""
    x = self.conv_blocks(x)
    # Flattening is done by the Flatten module at the front of fc_layers.
    x = self.fc_layers(x)
    return x

  def create_fc_layers(self, fc_architecture, conv_architecture_for_sizing):
    """Build the fully connected head as an ``nn.Sequential``.

    ``flatten`` entries become ``nn.Flatten``. Each ``linear`` entry becomes
    ``nn.Linear``; ``activation == "relu"`` appends ``ReLU`` and
    ``Dropout(p=0.5)``, ``activation == "softmax"`` appends ``Softmax(dim=1)``.
    NOTE(review): with a softmax head the model emits probabilities, not
    logits - confirm that matches the loss used for training.

    Args:
      fc_architecture: configs from the ``flatten`` entry onward.
      conv_architecture_for_sizing: conv/pool configs used to compute the
        input width of the first linear layer.
    """
    layers = OrderedDict()

    in_features, _ = VGG.compute_fc_input_size(
        architecture=conv_architecture_for_sizing,
        input_size=(self.in_channels, self.input_size[0], self.input_size[1])  # (C, H, W)
    )

    linear_idx = 1  # only used to name linear entries that have no "name"

    for cfg in fc_architecture:
      layer_type = cfg.get("type", "").lower()

      if layer_type == "flatten":
        layers["flatten"] = nn.Flatten()
      elif layer_type == "linear":
        base_name = cfg.get("name", f"linear{linear_idx}")
        layers[f"{base_name}"] = nn.Linear(in_features, cfg["out_features"])

        activation = cfg.get("activation", "").lower()
        if activation == "relu":
          layers[f"{base_name}_relu"] = nn.ReLU(inplace=True)
          layers[f"{base_name}_dropout"] = nn.Dropout(p=0.5)
        elif activation == "softmax":
          layers[f"{base_name}_softmax"] = nn.Softmax(dim=1)

        in_features = cfg["out_features"]  # width seen by the next linear layer
        linear_idx += 1

    return nn.Sequential(layers)

  @staticmethod
  def compute_fc_input_size(architecture, input_size):
    """Track the feature-map shape through the conv/pool configs.

    Args:
      architecture: list of layer configs; only ``conv2d`` and ``maxpool2d``
        entries affect the result, anything else is skipped.
      input_size: ``(C, H, W)`` of the tensor entering the first layer.

    Returns:
      ``(C * H * W, (C, H, W))`` for the final feature map.

    Raises:
      ValueError: a layer would reduce the height or width to zero or below,
        i.e. ``input_size`` is too small for this architecture.
    """
    C, H, W = input_size

    for cfg in architecture:
      layer_type = cfg.get("type", "").lower()

      if layer_type == "conv2d":
        C = cfg["out_channels"]
        k = cfg["kernel_size"]
        s = cfg["stride"]
        p = cfg["padding"]
      elif layer_type == "maxpool2d":
        k = cfg["kernel_size"]
        s = cfg["stride"]
        p = 0  # the pool layers are built without padding
      else:
        continue

      # Output-size formula: floor((in + 2*pad - kernel) / stride) + 1
      H = math.floor((H + 2*p - k) / s) + 1
      W = math.floor((W + 2*p - k) / s) + 1

      # Fail here with a clear message instead of sizing nn.Linear with a
      # zero or negative feature count.
      if H <= 0 or W <= 0:
        raise ValueError(
          f"input_size {tuple(input_size)} is too small: feature map is "
          f"{H}x{W} after layer '{cfg.get('name', layer_type)}'"
        )

    return C * H * W, (C, H, W)

In [6]:
# Build the config-driven VGG16 for 3-channel 224x224 images.
model = VGG(
    architecture=vgg16_architecture,
    in_channels=3,
    input_size=(224, 224),
)
# print(model)
# torchinfo's summary takes the input shape including the batch dimension.
summary(model, input_size=(1, 3, 224, 224))
# summary(model, (3, 224, 224)) # for torchsummary -> summary

Layer (type:depth-idx)                   Output Shape              Param #
VGG                                      [1, 1000]                 --
├─VGGConvBlock: 1-1                      [1, 512, 7, 7]            --
│    └─Sequential: 2-1                   [1, 512, 7, 7]            --
│    │    └─Conv2d: 3-1                  [1, 64, 224, 224]         1,792
│    │    └─BatchNorm2d: 3-2             [1, 64, 224, 224]         128
│    │    └─ReLU: 3-3                    [1, 64, 224, 224]         --
│    │    └─Conv2d: 3-4                  [1, 64, 224, 224]         36,928
│    │    └─BatchNorm2d: 3-5             [1, 64, 224, 224]         128
│    │    └─ReLU: 3-6                    [1, 64, 224, 224]         --
│    │    └─MaxPool2d: 3-7               [1, 64, 112, 112]         --
│    │    └─Conv2d: 3-8                  [1, 128, 112, 112]        73,856
│    │    └─BatchNorm2d: 3-9             [1, 128, 112, 112]        256
│    │    └─ReLU: 3-10                   [1, 128, 112, 112]        --
│

# VGG16 Model (Static)

In [68]:
class VGG16(nn.Module):
    """Hand-written VGG16 with batch normalization.

    Thirteen conv units (``layer1`` .. ``layer13``), each
    ``Conv2d(3x3, stride 1, padding 1) -> BatchNorm2d -> ReLU`` and
    optionally a trailing 2x2 / stride-2 ``MaxPool2d``, are followed by
    three fully connected stages (``fc``, ``fc1``, ``fc2``). The first
    linear layer expects ``7 * 7 * 512`` features. ``forward`` returns the
    raw output of the last linear layer.

    Args:
        num_classes: width of the final linear layer.
    """

    # (in_channels, out_channels, followed_by_maxpool) for layer1 .. layer13.
    _CONV_PLAN = (
        (3, 64, False),
        (64, 64, True),
        (64, 128, False),
        (128, 128, True),
        (128, 256, False),
        (256, 256, False),
        (256, 256, True),
        (256, 512, False),
        (512, 512, False),
        (512, 512, True),
        (512, 512, False),
        (512, 512, False),
        (512, 512, True),
    )

    def __init__(self, num_classes=10):
        super().__init__()

        # Register the units as layer1..layer13, in order, so module names
        # and parameter ordering match the layout described above.
        for position, (c_in, c_out, pooled) in enumerate(self._CONV_PLAN, start=1):
            setattr(self, f"layer{position}", self._conv_unit(c_in, c_out, pooled))

        flat_features = 7 * 7 * 512
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(flat_features, 4096),
            nn.ReLU(),
        )
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
        )
        self.fc2 = nn.Sequential(nn.Linear(4096, num_classes))

    @staticmethod
    def _conv_unit(c_in, c_out, pooled):
        """Return Conv2d -> BatchNorm2d -> ReLU, plus MaxPool2d when ``pooled``."""
        parts = [
            nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(c_out),
            nn.ReLU(),
        ]
        if pooled:
            parts.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return nn.Sequential(*parts)

    def forward(self, x):
        """Run ``x`` through all conv units, flatten per sample, apply the head."""
        out = x
        for position in range(1, len(self._CONV_PLAN) + 1):
            out = getattr(self, f"layer{position}")(out)

        # Collapse everything except the batch dimension.
        out = out.reshape(out.size(0), -1)

        for head in (self.fc, self.fc1, self.fc2):
            out = head(out)
        return out

In [69]:
# Build the hand-written VGG16 with a 1000-way output layer.
model = VGG16(num_classes=1000)
# print(model)
# torchinfo's summary takes the input shape including the batch dimension.
summary(model, input_size=(1, 3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
VGG16                                    [1, 1000]                 --
├─Sequential: 1-1                        [1, 64, 224, 224]         --
│    └─Conv2d: 2-1                       [1, 64, 224, 224]         1,792
│    └─BatchNorm2d: 2-2                  [1, 64, 224, 224]         128
│    └─ReLU: 2-3                         [1, 64, 224, 224]         --
├─Sequential: 1-2                        [1, 64, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 64, 224, 224]         36,928
│    └─BatchNorm2d: 2-5                  [1, 64, 224, 224]         128
│    └─ReLU: 2-6                         [1, 64, 224, 224]         --
│    └─MaxPool2d: 2-7                    [1, 64, 112, 112]         --
├─Sequential: 1-3                        [1, 128, 112, 112]        --
│    └─Conv2d: 2-8                       [1, 128, 112, 112]        73,856
│    └─BatchNorm2d: 2-9                  [1, 128, 112, 112]        256
│