In [1]:
# Model independent of input size

## Classification

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision.models as models

In [3]:
# Prefer the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
class ResNetVariableInput(nn.Module):
    """ResNet-50 classifier that accepts inputs of arbitrary spatial size.

    The backbone's pooling layer is set to global adaptive average pooling, so
    the tensor handed to the final linear layer always has a 1x1 spatial extent,
    whatever the height and width of the input were.

    Args:
        num_classes: Number of logits produced by the replaced ``fc`` head.
        weights: Forwarded unchanged to ``models.resnet50(weights=...)``.
            The default keeps the original behaviour (``"IMAGENET1K_V1"``);
            pass ``None`` to build the backbone without pretrained weights.
    """

    def __init__(self, num_classes=16, weights="IMAGENET1K_V1"):
        super().__init__()
        # Backbone; which checkpoint (if any) is loaded is now the caller's choice.
        self.backbone = models.resnet50(weights=weights)

        # Global adaptive pooling:
        # (N, 2048, H/32, W/32) --> (N, 2048, 1, 1)
        # The pooled output no longer depends on H and W, which is what makes
        # the network input-size agnostic.
        # NOTE(review): recent torchvision ResNets may already use an adaptive
        # pool here, in which case this assignment is a harmless no-op - confirm.
        self.backbone.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Swap the final FC layer for one with `num_classes` outputs,
        # reusing the input width of the layer being replaced.
        in_features = self.backbone.fc.in_features
        self.backbone.fc = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Run the backbone on image batch ``x`` and return its class logits."""
        return self.backbone(x)

In [11]:
# Test the model on variable input sizes ----
model = ResNetVariableInput(num_classes=16).to(device)
model.eval()  # shape check only: run the network in inference mode

# Create dummy inputs with different spatial sizes, directly on `device`
x1 = torch.randn(2, 3, 128, 128, device=device)   # small square
x2 = torch.randn(2, 3, 256, 512, device=device)   # rectangular
x3 = torch.randn(2, 3, 480, 640, device=device)   # bigger

# Forward pass; no gradients are needed just to inspect output shapes
with torch.no_grad():
    y1 = model(x1)
    y2 = model(x2)
    y3 = model(x3)

print("Output shapes:")
print(list(x1.shape), "->", list(y1.shape))  # (2, 16)
print(list(x2.shape), "->", list(y2.shape))  # (2, 16)
print(list(x3.shape), "->", list(y3.shape))  # (2, 16)

Output shapes:
[2, 3, 128, 128] -> [2, 16]
[2, 3, 256, 512] -> [2, 16]
[2, 3, 480, 640] -> [2, 16]


## Segmentation

- Most segmentation models that follow the encoder-decoder paradigm are input-size independent, because the output has the same spatial size as the input. The only constraint is that the input must be large enough to survive the downsampling part of the network.
- Some segmentation models have extra constraints. For example, DeepLabv3 expects H and W to be multiples of 8 or 16, depending on the selected stride. Otherwise, rounding during downsampling/upsampling can cause tiny misalignments.
- Models based on early architectures (e.g., AlexNet-based ones, where FC layers are repurposed for segmentation) are tied to a fixed input size. However, if these layers are replaced by 1x1 convolutions, the model can be input-size independent.

The important takeaway is that most models are input-size independent or can somehow be adapted to be.

In [14]:
class EfficientNetSegmentation(nn.Module):
    """Encoder-decoder segmentation network on an EfficientNet-B0 feature extractor.

    The encoder is the ``features`` part of a pretrained EfficientNet-B0 (the
    classifier head is dropped). The decoder is a small stack of 3x3
    convolutions and bilinear upsampling layers, followed by a 1x1 convolution
    that produces one channel per class.

    Args:
        num_classes: Number of channels in the output segmentation map.

    ``forward`` returns logits of shape ``(b, num_classes, h, w)``, where
    ``h, w`` are the spatial dimensions of the input batch.
    """

    def __init__(self, num_classes=21):
        super().__init__()
        # Load EfficientNet backbone (no classifier head);
        # it is used as the encoder.
        backbone = models.efficientnet_b0(weights="IMAGENET1K_V1")
        self.encoder = backbone.features  # (b, 1280, h/32, w/32)

        # Simple decoder: total upsampling factor is 2 * 2 * 2 * 4 = 32
        self.decoder = nn.Sequential(
            nn.Conv2d(1280, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),  # h/16, w/16

            nn.Conv2d(512, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),  # h/8, w/8

            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),  # h/4, w/4

            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),  # ~h, ~w

            nn.Conv2d(64, num_classes, kernel_size=1)  # final segmentation map
        )

    def forward(self, x):
        """Return per-pixel class logits with the same height/width as ``x``."""
        feats = self.encoder(x)    # (b, 1280, h/32, w/32)
        out = self.decoder(feats)  # (b, num_classes, 32 * h_feat, 32 * w_feat)

        # The decoder's x32 upsampling only lands exactly on (h, w) when the
        # encoder reduced the input by exactly 32 in each dimension. For other
        # sizes (e.g. h or w not a multiple of 32) the map is resized so the
        # output always aligns with the input; matching sizes are left untouched.
        if out.shape[2:] != x.shape[2:]:
            out = nn.functional.interpolate(
                out, size=x.shape[2:], mode="bilinear", align_corners=False
            )
        return out

In [16]:
# The inputs x1..x3 live on `device`, so the model has to be moved there too;
# otherwise the forward pass fails with a device mismatch on CUDA machines.
model = EfficientNetSegmentation(num_classes=16).to(device)
model.eval()  # shape check only: run the network in inference mode

# Forward pass; no gradients are needed just to inspect output shapes
with torch.no_grad():
    y1 = model(x1)
    y2 = model(x2)
    y3 = model(x3)

print("Output shapes:")
print(list(x1.shape), "->", list(y1.shape))  # (128, 128)
print(list(x2.shape), "->", list(y2.shape))  # (256, 512) - notice that the dimensions don't need to be symmetrical
print(list(x3.shape), "->", list(y3.shape))  # (480, 640)

Output shapes:
[2, 3, 128, 128] -> [2, 16, 128, 128]
[2, 3, 256, 512] -> [2, 16, 256, 512]
[2, 3, 480, 640] -> [2, 16, 480, 640]


In [26]:
class AlexNetSeg(nn.Module):
    """AlexNet-style segmentation network whose classifier uses only 1x1 kernels.

    The fully connected classifier of AlexNet is replaced by three 1x1
    convolutions, so the head can be applied to a feature map of any spatial
    size. The resulting coarse logits are bilinearly resized to the input's
    height and width.

    Args:
        num_classes: Number of channels in the output segmentation map.
    """

    def __init__(self, num_classes=21):
        super().__init__()
        self.num_classes = num_classes

        # Keep only the convolutional trunk of an AlexNet built without
        # pretrained weights; its output has 256 channels.
        self.features = models.alexnet(weights=None).features

        # Per-location classifier: each former FC layer becomes a 1x1 convolution.
        hidden = 4096
        head_layers = [
            nn.Conv2d(256, hidden, kernel_size=1),          # stands in for the first FC
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Conv2d(hidden, hidden, kernel_size=1),       # stands in for the second FC
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Conv2d(hidden, num_classes, kernel_size=1),  # segmentation logits
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return logits of shape (b, num_classes, h, w) for input (b, 3, h, w)."""
        target_hw = x.shape[2:]
        coarse_logits = self.classifier(self.features(x))  # (b, num_classes, h_out, w_out)

        # Bring the coarse map back to the input resolution.
        return F.interpolate(coarse_logits, size=target_hw, mode="bilinear", align_corners=False)

In [29]:
# The inputs x1..x3 live on `device`, so the model has to be moved there too;
# otherwise the forward pass fails with a device mismatch on CUDA machines.
model = AlexNetSeg(num_classes=21).to(device)
model.eval()  # shape check only: disables dropout

# Forward pass; no gradients are needed just to inspect output shapes
with torch.no_grad():
    y1 = model(x1)
    y2 = model(x2)
    y3 = model(x3)

print("Output shapes:")
print(list(x1.shape), "->", list(y1.shape))  # (128, 128)
print(list(x2.shape), "->", list(y2.shape))  # (256, 512) - notice that the dimensions don't need to be symmetrical
print(list(x3.shape), "->", list(y3.shape))  # (480, 640)

Output shapes:
[2, 3, 128, 128] -> [2, 21, 128, 128]
[2, 3, 256, 512] -> [2, 21, 256, 512]
[2, 3, 480, 640] -> [2, 21, 480, 640]


In [30]:
class AlexNetSeg2(nn.Module):
    """AlexNet-style network made input-size independent via AdaptiveAvgPool2d.

    Instead of turning the first FC layer into a 1x1 convolution, the feature
    map is first pooled to a fixed 6x6 grid and then consumed by a 6x6
    convolution (the convolutional equivalent of the first FC layer).

    NOTE(review): the 6x6 convolution has no padding, so the head output is
    1x1 spatially; the bilinear resize in ``forward`` then yields the same
    logits at every pixel. The shapes work for any input size, but the map
    carries no spatial detail.

    Args:
        num_classes: Number of channels in the output map.
    """

    def __init__(self, num_classes=21):
        super().__init__()
        self.num_classes = num_classes

        # Keep only the convolutional trunk of an AlexNet built without
        # pretrained weights; its output has 256 channels.
        self.features = models.alexnet(weights=None).features

        hidden = 4096
        head_layers = [
            nn.AdaptiveAvgPool2d((6, 6)),                   # any feature map -> 6x6
            nn.Conv2d(256, hidden, kernel_size=6),          # first FC as a 6x6 conv: 6x6 -> 1x1
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Conv2d(hidden, hidden, kernel_size=1),       # stands in for the second FC
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Conv2d(hidden, num_classes, kernel_size=1),  # final logits
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return logits of shape (b, num_classes, h, w) for input (b, 3, h, w)."""
        target_hw = x.shape[2:]
        pooled_logits = self.classifier(self.features(x))  # (b, num_classes, 1, 1)

        # Stretch the logits to the input resolution.
        return F.interpolate(pooled_logits, size=target_hw, mode="bilinear", align_corners=False)

In [31]:
# The inputs x1..x3 live on `device`, so the model has to be moved there too;
# otherwise the forward pass fails with a device mismatch on CUDA machines.
model = AlexNetSeg2(num_classes=21).to(device)
model.eval()  # shape check only: disables dropout

# Forward pass; no gradients are needed just to inspect output shapes
with torch.no_grad():
    y1 = model(x1)
    y2 = model(x2)
    y3 = model(x3)

print("Output shapes:")
print(list(x1.shape), "->", list(y1.shape))  # (128, 128)
print(list(x2.shape), "->", list(y2.shape))  # (256, 512) - notice that the dimensions don't need to be symmetrical
print(list(x3.shape), "->", list(y3.shape))  # (480, 640)

Output shapes:
[2, 3, 128, 128] -> [2, 21, 128, 128]
[2, 3, 256, 512] -> [2, 21, 256, 512]
[2, 3, 480, 640] -> [2, 21, 480, 640]
