###Experiment with YOLOv11 Head

####Replaced traditional v4 head with v11 head. but the output was not satisfactory. the model is not improving its performance.

####In the model darknet was used as its head. FPN+PAN+CBAM and YoloV11 head were used as its neck and head respectively. But the result was not satisfoctary.


#####Few of the architecture and its codes are referenced from internet and different github repos. All the references are mentioned in the written report. Codes are used only for experiment purpose only to see if we can improve the model performance. I do not claim that all the codes are written by me.

#####After running the codes it was observed that there was no improvement on metrics like precision recall and map score. The model was trained with 30 epochs.

In [94]:
import torch
import torch.nn as nn
import timm
import torch.nn.functional as F

# -----------------------------------------------------------------------------------------------------------------------------------------
# The source code was referenced from its original author. This is not a code written by me or I do not claim the ownership of the codes.
# This code is used only for experiment purpose only to see if we can imprve model performance.
# -----------------------------------------------------------------------------------------------------------------------------------------
class CBAM(nn.Module):
    def __init__(self, channels, reduction_ratio=16):
        super(CBAM, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        self.fc = nn.Sequential(
            nn.Conv2d(channels, channels // reduction_ratio, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(channels // reduction_ratio, channels, 1, bias=False)
        )

        self.spatial = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=7, padding=3, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        channel_att = torch.sigmoid(avg_out + max_out)
        x = x * channel_att

        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        spatial_att = self.spatial(torch.cat([avg_out, max_out], dim=1))
        x = x * spatial_att

        return x


class FPN(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(FPN, self).__init__()
        self.lateral_convs = nn.ModuleList([
            nn.Conv2d(c, out_channels, 1) for c in in_channels
        ])
        self.smooth_convs = nn.ModuleList([
            nn.Conv2d(out_channels, out_channels, 3, padding=1) for _ in in_channels
        ])

    def forward(self, features):
        features = features[::-1]
        x = self.lateral_convs[0](features[0])
        results = [x]

        for i in range(1, len(features)):
            lateral = self.lateral_convs[i](features[i])
            x = F.interpolate(x, scale_factor=2, mode='nearest') + lateral
            results.insert(0, self.smooth_convs[i](x))

        return results


class PANet(nn.Module):
    def __init__(self, in_channels):
        super(PANet, self).__init__()
        self.downsample_convs = nn.ModuleList([
            nn.Conv2d(in_channels, in_channels, 3, stride=2, padding=1) for _ in range(2)
        ])
        self.output_convs = nn.ModuleList([
            nn.Conv2d(in_channels, in_channels, 3, padding=1) for _ in range(3)
        ])

    def forward(self, features):
        P3, P4, P5 = features
        P4 = self.output_convs[0](P4 + self.downsample_convs[0](P3))
        P5 = self.output_convs[1](P5 + self.downsample_convs[1](P4))
        P3 = self.output_convs[2](P3)
        return [P3, P4, P5]

class DFL(nn.Module):
    def __init__(self, c=16):
        super().__init__()
        self.conv = nn.Conv2d(c, 1, 1, bias=False)
        self.conv.weight.data.fill_(1.0 / c)

    def forward(self, x):
        return self.conv(x)

class Conv(nn.Module):
    def __init__(self, in_channels, out_channels, act=nn.SiLU(), k=1, s=1, p=0, g=1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, k, s, p, groups=g, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = act

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class Head(nn.Module):
    anchors = torch.empty(0)
    strides = torch.empty(0)

    def __init__(self, nc=1, filters=(256, 256, 256)):
        super().__init__()
        self.ch = 16
        self.nc = nc
        self.nl = len(filters)
        self.no = nc + self.ch * 4
        self.stride = torch.zeros(self.nl)

        box = max(64, filters[0] // 4)
        cls = max(80, filters[0], self.nc)

        self.dfl = DFL(self.ch)
        self.box = nn.ModuleList([
            nn.Sequential(
                Conv(x, box, nn.SiLU(), k=3, p=1),
                Conv(box, box, nn.SiLU(), k=3, p=1),
                nn.Conv2d(box, 4 * self.ch, kernel_size=1)
            ) for x in filters
        ])
        self.cls = nn.ModuleList([
            nn.Sequential(
                Conv(x, x, nn.SiLU(), k=3, p=1, g=x),
                Conv(x, cls, nn.SiLU()),
                Conv(cls, cls, nn.SiLU(), k=3, p=1, g=cls),
                Conv(cls, cls, nn.SiLU()),
                nn.Conv2d(cls, self.nc, kernel_size=1)
            ) for x in filters
        ])

    def forward(self, x):
        for i, (box, cls) in enumerate(zip(self.box, self.cls)):
            x[i] = torch.cat([box(x[i]), cls(x[i])], dim=1)
        return x

def bbox_ciou(box1, box2):
    b1_x, b1_y, b1_w, b1_h = box1[:,0], box1[:,1], box1[:,2], box1[:,3]
    b2_x, b2_y, b2_w, b2_h = box2[:,0], box2[:,1], box2[:,2], box2[:,3]

    b1_x1 = b1_x - b1_w / 2
    b1_y1 = b1_y - b1_h / 2
    b1_x2 = b1_x + b1_w / 2
    b1_y2 = b1_y + b1_h / 2
    b2_x1 = b2_x - b2_w / 2
    b2_y1 = b2_y - b2_h / 2
    b2_x2 = b2_x + b2_w / 2
    b2_y2 = b2_y + b2_h / 2

    inter_x1 = torch.max(b1_x1, b2_x1)
    inter_y1 = torch.max(b1_y1, b2_y1)
    inter_x2 = torch.min(b1_x2, b2_x2)
    inter_y2 = torch.min(b1_y2, b2_y2)

    inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)
    b1_area = b1_w * b1_h
    b2_area = b2_w * b2_h
    union_area = b1_area + b2_area - inter_area
    iou = inter_area / (union_area + 1e-7)

    enclose_x1 = torch.min(b1_x1, b2_x1)
    enclose_y1 = torch.min(b1_y1, b2_y1)
    enclose_x2 = torch.max(b1_x2, b2_x2)
    enclose_y2 = torch.max(b1_y2, b2_y2)
    c2 = (enclose_x2 - enclose_x1)**2 + (enclose_y2 - enclose_y1)**2

    center_dist = (b1_x - b2_x)**2 + (b1_y - b2_y)**2
    v = (4 / (3.1415**2)) * torch.pow(torch.atan(b1_w / b1_h) - torch.atan(b2_w / b2_h), 2)
    alpha = v / (1 - iou + v + 1e-7)

    ciou = iou - (center_dist / (c2 + 1e-7)) - alpha * v
    return ciou


class YOLOLoss(nn.Module):
    def __init__(self, lambda_box=5.0, lambda_obj=1.0, lambda_cls=0.5):
        super(YOLOLoss, self).__init__()
        self.bce = nn.BCEWithLogitsLoss()
        self.lambda_box = lambda_box
        self.lambda_obj = lambda_obj
        self.lambda_cls = lambda_cls

    def forward(self, pred_box, target_box, pred_obj, target_obj, pred_cls, target_cls):
        loss_box = 1 - bbox_ciou(pred_box, target_box).mean()
        loss_obj = self.bce(pred_obj, target_obj)
        loss_cls = self.bce(pred_cls, target_cls)

        total_loss = (self.lambda_box * loss_box +
                      self.lambda_obj * loss_obj +
                      self.lambda_cls * loss_cls)

        return total_loss




In [32]:
import torch
import torch.nn as nn
import timm

class CSPDarknetBackbone(nn.Module):
    def __init__(self):
        super().__init__()
        #------------------------------------------------------
        # Using CSPDarknet53 pretrained backbone
        # The original idea of using transformer did not work
        # But the model did not perform as expected
        #------------------------------------------------------
        self.model = timm.create_model('cspdarknet53', pretrained=True, features_only=True)

    def forward(self, x):
        return self.model(x)


In [74]:
#-----------------------------------------------------------
# This code has beed referenced from analyticsvidya
# All the reference has been mentioned in the written report
#
#------------------------------------------------------------
class Head(nn.Module):
    anchors = torch.empty(0)
    strides = torch.empty(0)

    def __init__(self, nc=1, filters=(256, 256, 256)):
        super().__init__()
        self.ch = 1
        self.nc = nc
        self.nl = len(filters)
        self.no = nc + 5

        box = max(64, filters[0] // 4)
        cls = max(80, filters[0], self.nc)

        self.dfl = DFL(self.ch)
        self.box = nn.ModuleList([
            nn.Sequential(
                Conv(x, box, nn.SiLU(), k=3, p=1),
                Conv(box, box, nn.SiLU(), k=3, p=1),
                nn.Conv2d(box, 4, kernel_size=1)
            ) for x in filters
        ])
        self.obj_cls = nn.ModuleList([
            nn.Sequential(
                Conv(x, x, nn.SiLU(), k=3, p=1),
                nn.Conv2d(x, 1 + self.nc, kernel_size=1)
            ) for x in filters
        ])

    def forward(self, x):
        outputs = []
        for i, (box, obj_cls) in enumerate(zip(self.box, self.obj_cls)):
            bbox = box(x[i])
            obj_cls_pred = obj_cls(x[i])
            outputs.append(torch.cat([bbox, obj_cls_pred], dim=1))
        return outputs


In [37]:
#-------------------------------------------------------------------------------------------------
#This is a modified architecture of FPN module to adjust the output according to the architecture
#-------------------------------------------------------------------------------------------------

class FPN(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(FPN, self).__init__()
        self.lateral_convs = nn.ModuleList([
            nn.Conv2d(c, out_channels, 1) for c in in_channels[::-1]
        ])
        self.smooth_convs = nn.ModuleList([
            nn.Conv2d(out_channels, out_channels, 3, padding=1) for _ in in_channels
        ])

    def forward(self, features):

        features = features[::-1]
        x = self.lateral_convs[0](features[0])
        results = [self.smooth_convs[0](x)]

        for i in range(1, len(features)):
            lateral = self.lateral_convs[i](features[i])
            x = F.interpolate(x, size=lateral.shape[-2:], mode='nearest') + lateral
            results.insert(0, self.smooth_convs[i](x))

        return results


In [96]:
#-----------------------------------------------------------------------
# Model Architecture for testing purpose
#-----------------------------------------------------------------------


class Yolov4HybridCNN(nn.Module):
    def __init__(self, num_classes=1):
        super(Yolov4HybridCNN, self).__init__()


        self.backbone = timm.create_model('cspdarknet53', pretrained=True, features_only=True)


        backbone_channels = self.backbone.feature_info.channels()[-3:]


        self.cbam_blocks = nn.ModuleList([CBAM(c) for c in backbone_channels])


        self.fpn_out_channels = 256
        self.fpn = FPN(backbone_channels, self.fpn_out_channels)


        self.panet = PANet(self.fpn_out_channels)


        self.head = Head(nc=num_classes, filters=(self.fpn_out_channels,) * 3)

    def forward(self, x):
        features = self.backbone(x)[-3:]


        for idx, f in enumerate(features):
            print(f"Selected Feature[{idx}] shape: {f.shape}")


        features = [cbam(f) for cbam, f in zip(self.cbam_blocks, features)]


        fpn_features = self.fpn(features)


        panet_features = self.panet(fpn_features)


        outputs = self.head(panet_features)

        return outputs


In [76]:
model = Yolov4HybridCNN(num_classes=1)
x = torch.randn(1, 3, 224, 224)
outputs = model(x)

print("\nCorrected Final output shapes:")
for idx, out in enumerate(outputs):
    print(f"Output[{idx}] shape: {out.shape}")


Selected Feature[0] shape: torch.Size([1, 256, 28, 28])
Selected Feature[1] shape: torch.Size([1, 512, 14, 14])
Selected Feature[2] shape: torch.Size([1, 1024, 7, 7])

Corrected Final output shapes:
Output[0] shape: torch.Size([1, 6, 28, 28])
Output[1] shape: torch.Size([1, 6, 14, 14])
Output[2] shape: torch.Size([1, 6, 7, 7])


In [40]:
from google.colab import drive;
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
import os

# Define folder path
folder_path = '/content/drive/MyDrive/Number_Plate'

# List files in the folder
files = os.listdir(folder_path)
print("Files in the folder:")
print(files)


Files in the folder:
['.DS_Store', 'dataset']


In [59]:
import os
import cv2
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class YoloDataset(Dataset):
    def __init__(self, image_dir, label_dir, img_size=224, S=7, num_classes=1, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png'))])
        self.img_size = img_size
        self.S = S  # grid size (SxS)
        self.num_classes = num_classes

        self.transform = transform or transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_filename = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_filename)
        label_path = os.path.join(self.label_dir, os.path.splitext(img_filename)[0] + '.txt')

        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(img)
        img = self.transform(img)

        target = torch.zeros((self.S, self.S, 5 + self.num_classes))
        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                for line in f.readlines():
                    class_id, x_center, y_center, width, height = map(float, line.strip().split())

                    i, j = int(self.S * y_center), int(self.S * x_center)
                    x_cell, y_cell = self.S * x_center - j, self.S * y_center - i
                    width_cell, height_cell = width * self.S, height * self.S

                    if target[i, j, 4] == 0:  # if cell is empty
                        target[i, j, :4] = torch.tensor([x_cell, y_cell, width_cell, height_cell])
                        target[i, j, 4] = 1  # objectness score
                        target[i, j, 5 + int(class_id)] = 1  # class one-hot

        return img, target



# Example usage:
train_dataset = YoloDataset(
    "/content/drive/MyDrive/Number_Plate/dataset/train/images",
    "/content/drive/MyDrive/Number_Plate/dataset/train/labels"
)
val_dataset = YoloDataset(
    "/content/drive/MyDrive/Number_Plate/dataset/val/images",
    "/content/drive/MyDrive/Number_Plate/dataset/val/labels"
)



In [60]:
def collate_fn(batch):
    images, targets = zip(*batch)
    images = torch.stack(images)
    targets = torch.stack(targets)
    return images, targets


In [61]:
train_loader = DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn
)


In [63]:
train_loader = DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn
)


In [77]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for images, targets in dataloader:
        images = torch.stack(images).to(device)
        targets = torch.stack(targets).to(device)

        optimizer.zero_grad()

        with torch.amp.autocast('cuda'):
            preds = model(images)



            pred_box = preds[..., :4]
            pred_obj = preds[..., 4:5]
            pred_cls = preds[..., 5:]

            target_box = targets[..., :4]
            target_obj = targets[..., 4:5]
            target_cls = targets[..., 5:]

            loss = criterion(pred_box, target_box, pred_obj, target_obj, pred_cls, target_cls)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)


In [88]:
def decode_predictions(preds, conf_threshold=0.5):
    preds = preds.permute(0, 2, 3, 1).contiguous()
    pred_box = preds[..., :4]
    pred_obj = torch.sigmoid(preds[..., 4:5])
    pred_cls = torch.sigmoid(preds[..., 5:])

    pred_score = pred_obj * pred_cls

    boxes, scores, labels = [], [], []

    B, H, W, _ = pred_score.shape

    for b in range(B):
        for i in range(H):
            for j in range(W):
                score = pred_score[b, i, j, 0].item()
                if score > conf_threshold:
                    x, y, w, h = pred_box[b, i, j]
                    cx = (j + x.item()) / W
                    cy = (i + y.item()) / H
                    ww = w.item() / W
                    hh = h.item() / H
                    boxes.append([cx, cy, ww, hh])
                    scores.append(score)
                    labels.append(0)

    return boxes, scores, labels


In [82]:
def compute_metrics(pred_boxes, gt_boxes, iou_threshold=0.5):
    def iou(box1, box2):

        b1_x1 = box1[0] - box1[2] / 2
        b1_y1 = box1[1] - box1[3] / 2
        b1_x2 = box1[0] + box1[2] / 2
        b1_y2 = box1[1] + box1[3] / 2

        b2_x1 = box2[0] - box2[2] / 2
        b2_y1 = box2[1] - box2[3] / 2
        b2_x2 = box2[0] + box2[2] / 2
        b2_y2 = box2[1] + box2[3] / 2

        inter_x1 = max(b1_x1, b2_x1)
        inter_y1 = max(b1_y1, b2_y1)
        inter_x2 = min(b1_x2, b2_x2)
        inter_y2 = min(b1_y2, b2_y2)

        inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
        box1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
        box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
        union = box1_area + box2_area - inter_area
        return inter_area / (union + 1e-6)

    TP, FP, FN = 0, 0, 0
    matched = set()

    for pred in pred_boxes:
        found = False
        for i, gt in enumerate(gt_boxes):
            if i in matched:
                continue
            if iou(pred, gt) >= iou_threshold:
                TP += 1
                matched.add(i)
                found = True
                break
        if not found:
            FP += 1
    FN = len(gt_boxes) - TP

    precision = TP / (TP + FP + 1e-6)
    recall = TP / (TP + FN + 1e-6)

    return precision, recall


In [83]:
def validate_one_epoch(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_precisions = []
    all_recalls = []

    with torch.no_grad():
        for images, targets in dataloader:
            images = torch.stack(images).to(device)
            targets = torch.stack(targets).to(device)

            with torch.amp.autocast('cuda'):
                preds = model(images)

                preds = preds[2].permute(0, 2, 3, 1).contiguous()

                pred_box = preds[..., :4]
                pred_obj = preds[..., 4:5]
                pred_cls = preds[..., 5:]

                target_box = targets[..., :4]
                target_obj = targets[..., 4:5]
                target_cls = targets[..., 5:]

                loss = criterion(pred_box, target_box, pred_obj, target_obj, pred_cls, target_cls)

            total_loss += loss.item()


            pred_boxes, _, _ = decode_predictions(preds)
            gt_boxes = target_box[target_obj.squeeze(-1) == 1].tolist()

            precision, recall = compute_metrics(pred_boxes, gt_boxes)
            all_precisions.append(precision)
            all_recalls.append(recall)

    avg_precision = sum(all_precisions) / len(all_precisions)
    avg_recall = sum(all_recalls) / len(all_recalls)

    return total_loss / len(dataloader), avg_precision, avg_recall, avg_precision

In [84]:
def run_training(model, train_loader, val_loader, optimizer, scheduler, criterion, device, num_epochs=30):
    best_map = 0.0
    for epoch in range(1, num_epochs + 1):
        print(f"\nEpoch {epoch}/{num_epochs}")
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, precision, recall, map50 = validate_one_epoch(model, val_loader, criterion, device)
        scheduler.step()

        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Precision: {precision:.2f} | Recall: {recall:.2f} | mAP@0.5: {map50:.2f}")

        if map50 > best_map:
            best_map = map50
            torch.save(model.state_dict(), "best_yolo_hybrid.pth")
            print("Best model saved!")


In [89]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


train_dataset = YoloDataset(
    image_dir="/content/drive/MyDrive/Number_Plate/dataset/train/images",
    label_dir="/content/drive/MyDrive/Number_Plate/dataset/train/labels",
    img_size=224
)

val_dataset = YoloDataset(
    image_dir="/content/drive/MyDrive/Number_Plate/dataset/val/images",
    label_dir="/content/drive/MyDrive/Number_Plate/dataset/val/labels",
    img_size=224
)


train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))


model = Yolov4HybridCNN(num_classes=1).to(device)
criterion = YOLOLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)


run_training(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    device=device,
    num_epochs=30
)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Selected Feature[2] shape: torch.Size([4, 1024, 7, 7])
Selected Feature[0] shape: torch.Size([4, 256, 28, 28])
Selected Feature[1] shape: torch.Size([4, 512, 14, 14])
Selected Feature[2] shape: torch.Size([4, 1024, 7, 7])
Selected Feature[0] shape: torch.Size([4, 256, 28, 28])
Selected Feature[1] shape: torch.Size([4, 512, 14, 14])
Selected Feature[2] shape: torch.Size([4, 1024, 7, 7])
Selected Feature[0] shape: torch.Size([4, 256, 28, 28])
Selected Feature[1] shape: torch.Size([4, 512, 14, 14])
Selected Feature[2] shape: torch.Size([4, 1024, 7, 7])
Selected Feature[0] shape: torch.Size([4, 256, 28, 28])
Selected Feature[1] shape: torch.Size([4, 512, 14, 14])
Selected Feature[2] shape: torch.Size([4, 1024, 7, 7])
Selected Feature[0] shape: torch.Size([4, 256, 28, 28])
Selected Feature[1] shape: torch.Size([4, 512, 14, 14])
Selected Feature[2] shape: torch.Size([4, 1024, 7, 7])
Selected Feature[0] shape: torch.Size([4, 256