In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp detection

# Detection
> Performing bounding box detection on PASCAL VOC 2007 using Darknet

In [None]:
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """Convolution -> BatchNorm2d -> LeakyReLU(0.1) building block.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Kernel size, either an int or a per-dimension tuple.
            Padding is ``kernel_size // 2`` so that odd kernels preserve the
            spatial size (3 -> 1 and 1 -> 0, and e.g. 5 -> 2).
    """

    def __init__(self, in_channels, out_channels, kernel_size=3):
        super().__init__()
        # "Same" padding for odd kernels; previously only kernel_size == 3
        # was padded and any other size silently shrank the feature map.
        if isinstance(kernel_size, int):
            padding = kernel_size // 2
        else:
            padding = tuple(k // 2 for k in kernel_size)
        # NOTE: the conv keeps its default bias so existing state_dict keys
        # (conv.weight / conv.bias) remain loadable.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply convolution, batch normalization and LeakyReLU to ``x``."""
        return self.leaky(self.bn(self.conv(x)))

class Darknet19(nn.Module):
    """Darknet-19 style convolutional feature extractor.

    The network is a flat ``nn.Sequential`` stored in ``self.features``:
    18 ``ConvBlock`` layers interleaved with five 2x2 max-pools, taking a
    3-channel input to a 1024-channel feature map at 1/32 of the input
    resolution.
    """

    def __init__(self):
        super().__init__()

        # Layer layout in order. A tuple is (out_channels, kernel_size) for a
        # ConvBlock; the string "pool" stands for a 2x2 max-pool.
        layout = (
            # Initial layers
            (32, 3), "pool",
            (64, 3), "pool",
            # First block
            (128, 3), (64, 1), (128, 3), "pool",
            # Second block
            (256, 3), (128, 1), (256, 3), "pool",
            # Third block
            (512, 3), (256, 1), (512, 3), (256, 1), (512, 3), "pool",
            # Fourth block
            (1024, 3), (512, 1), (1024, 3), (512, 1), (1024, 3),
        )

        stack = []
        channels = 3  # RGB input
        for spec in layout:
            if spec == "pool":
                stack.append(nn.MaxPool2d(2))
                continue
            width, ksize = spec
            stack.append(ConvBlock(channels, width, kernel_size=ksize))
            channels = width

        self.features = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through every layer of ``self.features`` in order."""
        return self.features(x)


In [None]:
class YOLOv2(nn.Module):
    """YOLOv2-style detector: Darknet19 backbone, passthrough branch, 1x1 head.

    Args:
        num_classes: Number of object classes predicted per anchor.
        num_anchors: Number of anchor boxes per grid cell.

    The forward pass returns a tensor with
    ``num_anchors * (5 + num_classes)`` channels on the backbone's final
    grid (1/32 of the input resolution). Input height and width should be
    multiples of 32 so the reorganized passthrough map lines up with the
    final backbone map.
    """

    # Index into ``backbone.features`` of the last 512-channel ConvBlock
    # before the final max-pool. The passthrough branch must tap this layer:
    # ``passthrough_conv`` expects 512 input channels, whereas index 13 (used
    # previously) outputs 256 channels and made the forward pass fail.
    _PASSTHROUGH_INDEX = 16

    def __init__(self, num_classes, num_anchors=5):
        super().__init__()
        self.backbone = Darknet19()

        # Passthrough layer from earlier feature map
        self.passthrough_conv = ConvBlock(512, 64, kernel_size=1)

        # Detection head
        self.detection = nn.Sequential(
            ConvBlock(1024 + 256, 1024),  # +256 from passthrough (64*4)
            nn.Conv2d(1024, num_anchors * (5 + num_classes), kernel_size=1)
        )

    def reorg_layer(self, x):
        """Space-to-depth: move each 2x2 spatial patch into the channel axis.

        Maps ``(B, C, H, W)`` to ``(B, 4*C, H/2, W/2)``, e.g. 64x26x26 to
        256x13x13. ``H`` and ``W`` must be even, otherwise the ``view`` fails.
        """
        batch_size, channels, height, width = x.size()
        # Split H and W into (coarse index, offset within the 2x2 patch).
        x = x.view(batch_size, channels, height//2, 2, width//2, 2)
        # Bring the two patch offsets next to the channel axis.
        x = x.permute(0, 1, 3, 5, 2, 4)
        x = x.contiguous().view(batch_size, channels*4, height//2, width//2)
        return x

    def forward(self, x):
        """Compute raw detection predictions for an image batch ``x``."""
        passthrough = None
        # Walk the backbone layer by layer so the intermediate high-resolution
        # feature map can be captured on the way through.
        for i, layer in enumerate(self.backbone.features):
            x = layer(x)
            if i == self._PASSTHROUGH_INDEX:  # Save feature map for passthrough
                passthrough = self.reorg_layer(self.passthrough_conv(x))

        # Concatenate passthrough with final features (passthrough first, to
        # keep the head's input channel order unchanged).
        x = torch.cat([passthrough, x], dim=1)

        # Detection head
        return self.detection(x)
