<a href="https://colab.research.google.com/github/damianoimola/naolo-nao-only-look-once/blob/master/naolo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch.nn.modules.utils import _pair

Dynamic Convolutions paper: https://arxiv.org/pdf/1912.03458

Awesome Dynamic Convolutions: https://github.com/kaijieshi7/awesome-dynamic-convolution

In [None]:
class AttentionLayer(nn.Module):
    """Attention branch that scores the candidate kernels of a dynamic convolution.

    The input is average-pooled over its spatial dimensions, passed through a
    two-layer MLP and normalised with a temperature-scaled softmax, yielding
    one weight per kernel that sums to 1 along the last dimension.

    Args:
        c_dim: number of input features fed to the first linear layer
            (the channel count of the pooled input).
        hidden_dim: width of the hidden linear layer.
        nof_kernels: number of scores produced per sample.
    """

    def __init__(self, c_dim, hidden_dim, nof_kernels):
        super().__init__()

        # squeeze step: pool every channel down to 1x1, then flatten
        squeeze_ops = [nn.AdaptiveAvgPool2d(1), nn.Flatten()]
        self.global_pooling = nn.Sequential(*squeeze_ops)

        # scoring MLP: c_dim -> hidden_dim -> nof_kernels
        scoring_ops = [
            nn.Linear(c_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, nof_kernels),
        ]
        self.to_scores = nn.Sequential(*scoring_ops)

    def forward(self, x, temperature=1):
        """Return softmax attention weights over the kernels.

        Args:
            x: input feature map (a batched [B, C, H, W] tensor is assumed).
            temperature: divisor applied to the logits before the softmax;
                larger values flatten the distribution.
        """
        pooled = self.global_pooling(x)
        logits = self.to_scores(pooled)
        return F.softmax(logits / temperature, dim=-1)

In [None]:
class DynamicConvolution(nn.Module):
    """Dynamic convolution layer as written in the paper.

    Holds ``nof_kernels`` candidate convolution kernels (and optionally
    biases). For every sample an attention branch produces one weight per
    kernel; the kernels are blended with those weights and the resulting
    per-sample kernel is applied to that sample.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the convolution.
        nof_kernels: number of candidate kernels to blend.
        reduce: reduction factor for the attention hidden width
            (``max(1, in_channels // reduce)``).
        kernel_size: int or pair giving the kernel height/width.
        stride, padding, dilation, groups: forwarded to ``F.conv2d``.
        bias: if True, a bias per candidate kernel is learned and blended
            the same way as the weights.
    """

    def __init__(self, in_channels, out_channels, nof_kernels, reduce, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        # control whether in_c and out_c are divisible by groups (i.e. the connection between inputs and outputs)
        assert in_channels % groups == 0 and out_channels % groups == 0, \
            "in_channels and out_channels must both be divisible by groups"

        self.groups = groups
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.nof_kernels = nof_kernels
        self.attention = AttentionLayer(in_channels, max(1, in_channels // reduce), nof_kernels)
        self.kernel_size = _pair(kernel_size)
        # [nof_kernels, out_C, in_C/groups, kernel_height, kernel_width]; filled in initialize_parameters()
        self.kernels_weights = nn.Parameter(
            torch.empty(nof_kernels, out_channels, in_channels // self.groups, *self.kernel_size),
            requires_grad=True,
        )

        if bias:
            # [nof_kernels, out_C]
            self.kernels_bias = nn.Parameter(torch.empty(nof_kernels, out_channels), requires_grad=True)
        else:
            # register the name so `self.kernels_bias` exists and is None
            self.register_parameter('kernels_bias', None)

        self.initialize_parameters()

    def initialize_parameters(self):
        """Initialise every candidate kernel (and the biases, if present)."""
        # Kaiming uniform initialization, one candidate kernel at a time
        for i_kernel in range(self.nof_kernels):
            init.kaiming_uniform_(self.kernels_weights[i_kernel], a=math.sqrt(5))

        # uniform bias initialization (if present)
        if self.kernels_bias is not None:
            # kernels_weights[0, 0] has in_C/groups * kh * kw elements, i.e. the fan-in
            bound = 1 / math.sqrt(self.kernels_weights[0, 0].numel())
            nn.init.uniform_(self.kernels_bias, -bound, bound)

    def forward(self, x, temperature=1):
        """Apply the per-sample blended kernel to ``x``.

        Args:
            x: input of shape [batch_size, in_C, H, W].
            temperature: softmax temperature forwarded to the attention branch.

        Returns:
            Tensor of shape [batch_size, out_C, H', W'].
        """
        batch_size = x.shape[0]

        # attention computation
        # [batch_size, nof_kernels]
        alphas = self.attention(x, temperature)

        # combining kernels using the scores (i.e. dynamic kernel aggregation)
        # [batch_size, out_C, in_C/groups, kernel_height, kernel_width]
        agg_weights = torch.sum(
            torch.mul(self.kernels_weights.unsqueeze(0), alphas.view(batch_size, -1, 1, 1, 1, 1)),
            dim=1
        )

        # fold the batch into the output-channel axis for the grouped convolution
        # [batch_size*out_C, in_C/groups, kernel_height, kernel_width]
        agg_weights = agg_weights.view(-1, *agg_weights.shape[-3:])

        # the very same we did just now but for biases
        if self.kernels_bias is not None:
            agg_bias = torch.sum(torch.mul(self.kernels_bias.unsqueeze(0), alphas.view(batch_size, -1, 1)), dim=1)
            # [batch_size*out_C]
            agg_bias = agg_bias.view(-1)
        else:
            agg_bias = None

        # fold the batch into the channel axis of the INPUT, so the channel
        # count here is in_C (not out_C): each sample becomes its own set of groups
        # [1, batch_size*in_C, H, W]
        x_grouped = x.reshape(1, batch_size * self.in_channels, *x.shape[-2:])

        # grouped convolution: groups*batch_size groups keep samples independent
        # [1, batch_size*out_C, H', W']
        out = F.conv2d(x_grouped, agg_weights, agg_bias, groups=self.groups*batch_size,
                       stride=self.stride, padding=self.padding, dilation=self.dilation)

        # reshape back to standard dimensions
        # [batch_size, out_C, H', W']
        out = out.view(batch_size, -1, *out.shape[-2:])

        return out

In [None]:
# ===== HELPER FUNCTIONS (only 3x3 used) =====
def dconv3x3(in_channels, out_channels):
    """Build a bias-free 3x3 DynamicConvolution with 4 kernels, stride 1 and padding 1."""
    settings = dict(
        nof_kernels=4,
        reduce=1,
        kernel_size=3,
        stride=1,
        padding=1,
        dilation=1,
        groups=1,
        bias=False,
    )
    return DynamicConvolution(in_channels, out_channels, **settings)

def dconv5x5(in_channels, out_channels):
    """Build a bias-free 5x5 DynamicConvolution with 4 kernels and stride 1.

    Uses padding=2 (kernel_size // 2) so that, like the 3x3 helper, the
    spatial size of the input is preserved; the previous padding=1 was
    carried over from the 3x3 helper and shrank the output by 2 pixels
    per dimension.
    """
    return DynamicConvolution(in_channels, out_channels, nof_kernels=4, reduce=1, kernel_size=5, stride=1, padding=2, groups=1, bias=False, dilation=1)

def dconv7x7(in_channels, out_channels):
    """Build a bias-free 7x7 DynamicConvolution with 4 kernels and stride 1.

    Uses padding=3 (kernel_size // 2) so that, like the 3x3 helper, the
    spatial size of the input is preserved; the previous padding=1 was
    carried over from the 3x3 helper and shrank the output by 4 pixels
    per dimension.
    """
    return DynamicConvolution(in_channels, out_channels, nof_kernels=4, reduce=1, kernel_size=7, stride=1, padding=3, groups=1, bias=False, dilation=1)

In [None]:
class FastYOLO(nn.Module):
    def __init__(
        self,
        num_classes=80,
        anchors=None,
        width_multiplier=0.5,
        conv_layer=nn.Conv2d,
    ):
        """
        A fast, compact YOLO-like model.

        The backbone is five [conv 3x3 -> BatchNorm -> LeakyReLU(0.1) -> MaxPool 2x2]
        stages followed by a 1x1 prediction convolution.

        Args:
            num_classes (int): Number of object classes.
            anchors (list of tuples): Anchor box sizes (w, h). Defaults to
                three tiny-scale anchors.
            width_multiplier (float): Factor to reduce channel widths. Each
                scaled width is truncated to an int and clamped to at least 1.
            conv_layer (callable): Factory called as
                ``conv_layer(in_ch, out_ch, kernel_size=..., stride=..., padding=..., bias=...)``
                and returning an ``nn.Module`` (e.g. ``nn.Conv2d``, or a
                ``functools.partial`` around a layer that needs extra arguments).
        """
        super().__init__()
        if anchors is None:
            # Default anchors (tiny-scale)
            anchors = [(10, 13), (16, 30), (33, 23)]
        # Registered as a buffer so the anchors follow .to(device) / dtype casts
        # with the rest of the model; non-persistent so state_dict keys are unchanged.
        self.register_buffer(
            "anchors", torch.tensor(anchors, dtype=torch.float32), persistent=False
        )
        self.num_anchors = len(anchors)
        self.num_classes = num_classes

        # Base channel configuration (Tiny YOLO-like)
        base_channels = [16, 32, 64, 128, 256]
        # Clamp to 1 so a very small multiplier cannot yield a zero-channel layer
        channels = [max(1, int(c * width_multiplier)) for c in base_channels]

        # Construct sequential backbone
        layers = []
        in_ch = 3
        for out_ch in channels:
            layers.extend([
                conv_layer(in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(0.1, inplace=True),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ])
            in_ch = out_ch

        # Final conv to produce predictions
        # Each anchor predicts (5 + num_classes) values
        pred_channels = self.num_anchors * (self.num_classes + 5)
        layers.append(conv_layer(in_ch, pred_channels, kernel_size=1, stride=1, padding=0, bias=True))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """
        Forward pass through the network.

        Returns:
            preds: Tensor of shape [B, num_anchors*(5+num_classes), H/32, W/32]
            anchors: Tensor of shape [num_anchors, 2]
        """
        preds = self.model(x)
        return preds, self.anchors

if __name__ == "__main__":
    # Example usage
    from functools import partial

    # DynamicConvolution needs nof_kernels and reduce in addition to the
    # nn.Conv2d-style arguments FastYOLO passes, so bind them up front
    # (same values the dconv* helpers use).
    dynamic_conv = partial(DynamicConvolution, nof_kernels=4, reduce=1)
    model = FastYOLO(num_classes=20, width_multiplier=0.33, conv_layer=dynamic_conv)
    dummy = torch.randn(1, 3, 416, 416)
    preds, anchors = model(dummy)
    print("Predictions shape:", preds.shape)
    print("Anchors:", anchors)


In [None]:
from functools import partial

# DynamicConvolution needs nof_kernels and reduce in addition to the
# nn.Conv2d-style arguments FastYOLO passes, so bind them up front
# (same values the dconv* helpers use).
dynamic_conv = partial(DynamicConvolution, nof_kernels=4, reduce=1)
model = FastYOLO(num_classes=20, width_multiplier=0.33, conv_layer=dynamic_conv)
dummy = torch.randn(1, 3, 416, 416)
preds, anchors = model(dummy)
print("Predictions shape:", preds.shape)
print("Anchors:", anchors)