# DrishT: SSDLite-MobileNetV3 Text Detection Training

**Model**: SSDLite320 + MobileNetV3-Large (~3.4M params)  
**Dataset**: 7,344 train / 915 val / 915 test images (COCO JSON)  
**Categories**: text, license_plate, traffic_sign, autorickshaw, tempo, truck, bus  
**GPU**: Kaggle T4 (free, 30h/week)  

## Setup
1. Add dataset `drisht-detection` to this notebook (the code expects it at `/kaggle/input/drisht-detection`)
2. Enable GPU: Settings → Accelerator → GPU T4 x2 (the training code below runs on a single GPU)
3. Run all cells

In [None]:
import os, sys, json, time, shutil
from pathlib import Path
from collections import defaultdict

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast, GradScaler
import torchvision
from torchvision.models.detection import ssdlite320_mobilenet_v3_large
from torchvision.models import MobileNet_V3_Large_Weights
import torchvision.transforms.functional as TF
import torchvision.transforms as T
from PIL import Image
from tqdm.auto import tqdm

# Report the runtime environment and select the compute device.
_cuda_ready = torch.cuda.is_available()

print(f'PyTorch: {torch.__version__}')
print(f'CUDA: {_cuda_ready}')
if _cuda_ready:
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

## Configuration

In [None]:
# --- Dataset location: change DATA_ROOT if your Kaggle dataset is named differently ---
DATA_ROOT = Path('/kaggle/input/drisht-detection')
OUTPUT_DIR = Path('/kaggle/working/detection_output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Each split directory holds an `annotations.json` file and an `images` folder.
train_dir = DATA_ROOT / 'train'
val_dir = DATA_ROOT / 'val'
TRAIN_JSON = train_dir / 'annotations.json'
TRAIN_IMAGES = train_dir / 'images'
VAL_JSON = val_dir / 'annotations.json'
VAL_IMAGES = val_dir / 'images'

# Model settings
NUM_CLASSES = 8  # 7 object categories plus the background class
INPUT_SIZE = 320

# Training hyperparameters
BATCH_SIZE = 16
NUM_WORKERS = 4   # DataLoader worker processes (author's note: Kaggle T4 has 4 vCPUs)
EPOCHS = 80
LR = 0.01
MOMENTUM = 0.9
WEIGHT_DECAY = 4e-5
LR_MIN = 1e-6
FREEZE_BACKBONE_EPOCHS = 5
PATIENCE = 12
USE_AMP = True

# Class id -> display name; id 0 is the background class.
CATEGORIES = dict(enumerate([
    'background', 'text', 'license_plate', 'traffic_sign',
    'autorickshaw', 'tempo', 'truck', 'bus',
]))

# Fail early when an annotation file is absent, then report image counts.
missing = [path for path in (TRAIN_JSON, VAL_JSON) if not path.exists()]
assert not missing, f'Missing: {missing[0]}'
print(f'Train images: {sum(1 for _ in TRAIN_IMAGES.iterdir())}')
print(f'Val images: {sum(1 for _ in VAL_IMAGES.iterdir())}')

## Dataset

In [None]:
import random


class COCODetectionDataset(Dataset):
    """COCO JSON format dataset for torchvision detection models.

    Each item is ``(image, target)`` where ``image`` is a float tensor produced
    by ``TF.to_tensor`` after resizing to ``input_size`` x ``input_size`` and
    ``target`` is a dict with ``boxes`` (``[N, 4]`` float32, ``x1, y1, x2, y2``
    in resized-image pixels) and ``labels`` (``[N]`` int64 category ids).

    Args:
        json_path: Path to a COCO-style JSON file with ``images`` and
            ``annotations`` lists.
        img_dir: Directory containing the files named by ``file_name``.
        augment: When true, apply a random horizontal flip and a random
            colour jitter, each with probability 0.5.
        input_size: Side length of the square the image is resized to.
    """

    def __init__(self, json_path, img_dir, augment=False, input_size=320):
        with open(json_path, 'r') as f:
            coco = json.load(f)
        self.img_dir = Path(img_dir)
        self.input_size = input_size
        self.augment = augment
        self.images = {img['id']: img for img in coco['images']}
        self.img_to_anns = defaultdict(list)
        for ann in coco['annotations']:
            self.img_to_anns[ann['image_id']].append(ann)
        # Only images that have at least one annotation are served.
        self.img_ids = [iid for iid in self.images if self.img_to_anns.get(iid)]

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.img_ids)

    @staticmethod
    def _fit_boxes(boxes, labels, scale_x, scale_y, size):
        """Scale boxes to the resized image, clip them, and drop empty ones.

        Clipping to ``[0, size]`` can collapse a box that lies on or outside
        the image border to zero width or height. Such boxes (and their
        labels) are removed, because torchvision detection models reject
        targets that do not have positive width and height.

        Returns:
            ``(boxes, labels)`` with the same dtypes as the inputs; the input
            tensors are not modified.
        """
        if len(boxes) == 0:
            return boxes, labels
        boxes = boxes.clone()
        boxes[:, [0, 2]] = (boxes[:, [0, 2]] * scale_x).clamp(0, size)
        boxes[:, [1, 3]] = (boxes[:, [1, 3]] * scale_y).clamp(0, size)
        keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
        return boxes[keep], labels[keep]

    def __getitem__(self, idx):
        """Load, optionally augment, and resize the sample at position ``idx``."""
        img_id = self.img_ids[idx]
        img_info = self.images[img_id]
        # convert() returns a fully loaded copy, so the file handle can be
        # released immediately instead of waiting for garbage collection.
        with Image.open(self.img_dir / img_info['file_name']) as raw:
            image = raw.convert('RGB')
        orig_w, orig_h = image.size

        # COCO boxes are [x, y, width, height]; convert to corner format and
        # skip annotations with a non-positive extent.
        boxes, labels = [], []
        for ann in self.img_to_anns[img_id]:
            x, y, w, h = ann['bbox']
            if w <= 0 or h <= 0:
                continue
            boxes.append([x, y, x + w, y + h])
            labels.append(ann['category_id'])

        if boxes:
            boxes = torch.tensor(boxes, dtype=torch.float32)
            labels = torch.tensor(labels, dtype=torch.int64)
        else:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)

        if self.augment:
            if random.random() < 0.5:
                image = TF.hflip(image)
                if len(boxes) > 0:
                    # Mirror x-coordinates; swapping columns keeps x1 < x2.
                    boxes[:, [0, 2]] = orig_w - boxes[:, [2, 0]]
            if random.random() < 0.5:
                image = T.ColorJitter(0.3, 0.3, 0.3, 0.1)(image)

        # The resize ignores aspect ratio, so x and y scale independently.
        scale_x = self.input_size / orig_w
        scale_y = self.input_size / orig_h
        image = image.resize((self.input_size, self.input_size), Image.BILINEAR)
        boxes, labels = self._fit_boxes(boxes, labels, scale_x, scale_y, self.input_size)

        image = TF.to_tensor(image)
        return image, {'boxes': boxes, 'labels': labels}


def collate_fn(batch):
    """Regroup a batch of ``(image, target)`` pairs into two parallel lists.

    Images are kept as a list instead of being stacked into one tensor.

    Returns:
        A tuple ``(images, targets)`` of lists, in batch order.
    """
    images, targets = map(list, zip(*batch))
    return images, targets


train_ds = COCODetectionDataset(TRAIN_JSON, TRAIN_IMAGES, augment=True, input_size=INPUT_SIZE)
val_ds = COCODetectionDataset(VAL_JSON, VAL_IMAGES, augment=False, input_size=INPUT_SIZE)

# Options shared by both loaders. Pinned host memory only helps when batches
# are copied to a CUDA device, so it is skipped on CPU-only runs.
PREFETCH_FACTOR = 3
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    collate_fn=collate_fn,
    pin_memory=torch.cuda.is_available(),
)
# persistent_workers avoids respawning workers every epoch and prefetch_factor
# pre-stages batches. DataLoader raises ValueError for either option when
# num_workers == 0, so they are only passed when worker processes exist.
use_workers = NUM_WORKERS > 0
if use_workers:
    loader_kwargs.update(persistent_workers=True, prefetch_factor=PREFETCH_FACTOR)

train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

print(f'Train: {len(train_ds)} images, {len(train_loader)} batches')
print(f'Val: {len(val_ds)} images, {len(val_loader)} batches')
print(f'DataLoader: workers={NUM_WORKERS}, persistent={use_workers}, '
      f'prefetch={PREFETCH_FACTOR if use_workers else None}')

## Model

In [None]:
# Build the SSDLite320 detector on a MobileNetV3-Large backbone initialised
# from ImageNet weights, then move it to the selected device.
model = ssdlite320_mobilenet_v3_large(
    weights_backbone=MobileNet_V3_Large_Weights.IMAGENET1K_V1,
    num_classes=NUM_CLASSES,
).to(DEVICE)

# Single pass over the parameters to collect count and memory statistics.
total = trainable = size_bytes = 0
for param in model.parameters():
    count = param.numel()
    total += count
    size_bytes += count * param.element_size()
    if param.requires_grad:
        trainable += count

print(f'Total params: {total:,}')
print(f'Trainable:    {trainable:,}')
print(f'Size:         {size_bytes / 1024**2:.1f} MB')

## Training Utilities

In [None]:
def freeze_backbone(model, freeze=True):
    """Turn gradient tracking off (``freeze=True``) or on for ``model.backbone``.

    Only the ``requires_grad`` flag of the backbone parameters is changed;
    parameters outside ``model.backbone`` are left untouched.
    """
    model.backbone.requires_grad_(not freeze)


def _box_iou_single(box1, box2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Both arguments are indexable tensors; the result is a plain Python
    number. The union is floored at ``1e-6`` so zero-area boxes do not cause
    a division by zero.
    """
    ax1, ay1, ax2, ay2 = box1[:4].tolist()
    bx1, by1, bx2, by2 = box2[:4].tolist()

    # Overlap rectangle; a negative extent means the boxes do not intersect.
    left, top = max(ax1, bx1), max(ay1, by1)
    right, bottom = min(ax2, bx2), min(ay2, by2)
    overlap = max(0, right - left) * max(0, bottom - top)

    # Side lengths are subtracted in the tensors' own dtype before conversion.
    w1, h1 = (box1[2:4] - box1[0:2]).tolist()
    w2, h2 = (box2[2:4] - box2[0:2]).tolist()
    union = w1 * h1 + w2 * h2 - overlap
    return overlap / max(union, 1e-6)


@torch.no_grad()
def compute_map(model, loader, device, iou_threshold=0.5, max_batches=50):
    """Estimate mean average precision with 11-point interpolated AP per class.

    Within each image, predictions are visited in the order the model returns
    them. Each one is matched to the not-yet-matched ground-truth box of the
    same class with the highest IoU, and counts as a true positive when that
    IoU is at least ``iou_threshold``. AP is averaged over the classes that
    have at least one ground-truth box in the evaluated batches; detections of
    other classes do not contribute.

    Args:
        model: Detection model; it is switched to eval mode and left there.
        loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the images are moved to for inference.
        iou_threshold: Minimum IoU for a true positive.
        max_batches: Evaluate at most this many batches; ``None`` evaluates
            the whole loader.

    Returns:
        ``(mAP, aps)`` where ``aps`` maps class id to its AP. ``mAP`` is 0 when
        no ground truth was seen.
    """
    model.eval()
    all_dets = defaultdict(list)  # class id -> [(score, is_true_positive), ...]
    all_n_gt = defaultdict(int)   # class id -> number of ground-truth boxes

    for batch_idx, (images, targets) in enumerate(tqdm(loader, desc='mAP')):
        if max_batches is not None and batch_idx >= max_batches:
            break
        preds = model([img.to(device) for img in images])

        for pred, gt in zip(preds, targets):
            # Transfer each tensor to the host once per image; reading single
            # elements from device tensors would synchronise on every access.
            pred_boxes = pred['boxes'].cpu()
            pred_labels = pred['labels'].tolist()
            pred_scores = pred['scores'].tolist()
            gt_boxes = gt['boxes'].cpu()

            gt_by_class = defaultdict(list)
            for gi, lbl in enumerate(gt['labels'].tolist()):
                all_n_gt[lbl] += 1
                gt_by_class[lbl].append(gi)

            matched = set()
            for box, cls, score in zip(pred_boxes, pred_labels, pred_scores):
                best_iou, best_idx = 0.0, -1
                for gi in gt_by_class.get(cls, ()):
                    if gi in matched:
                        continue
                    iou = _box_iou_single(box, gt_boxes[gi])
                    if iou > best_iou:
                        best_iou, best_idx = iou, gi
                is_tp = best_idx >= 0 and best_iou >= iou_threshold
                if is_tp:
                    matched.add(best_idx)
                all_dets[cls].append((score, is_tp))

    recall_levels = [i / 10 for i in range(11)]
    aps = {}
    for cls in sorted(all_n_gt):
        n_gt = all_n_gt[cls]
        if n_gt == 0:
            continue
        # Walk detections from most to least confident, accumulating the
        # precision/recall curve.
        dets = sorted(all_dets.get(cls, []), key=lambda x: -x[0])
        tp, fp = 0, 0
        prec, rec = [], []
        for _, is_tp in dets:
            if is_tp:
                tp += 1
            else:
                fp += 1
            prec.append(tp / (tp + fp))
            rec.append(tp / n_gt)
        # 11-point interpolation: best precision at recall >= t, or 0 when
        # that recall level is never reached.
        aps[cls] = sum(
            max((p for p, r in zip(prec, rec) if r >= t), default=0)
            for t in recall_levels
        ) / 11

    return sum(aps.values()) / max(len(aps), 1), aps


def train_one_epoch(model, loader, optimizer, scaler, device, epoch):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Detection model that returns a dict of losses when called with
            images and targets in train mode.
        loader: Iterable yielding ``(images, targets)`` batches.
        optimizer: Optimizer stepped once per batch.
        scaler: Gradient scaler for mixed precision; when falsy, the step runs
            in full precision.
        device: Device the batch is moved to.
        epoch: Epoch number, used only for the progress-bar label.

    Returns:
        Sum of the per-batch losses divided by the number of batches (the
        divisor is floored at 1, so an empty loader yields 0.0).
    """
    model.train()
    running, steps = 0.0, 0
    progress = tqdm(loader, desc=f'Epoch {epoch}')
    for images, targets in progress:
        images = [im.to(device) for im in images]
        targets = [{key: val.to(device) for key, val in tgt.items()} for tgt in targets]
        optimizer.zero_grad()

        if not scaler:
            # Full-precision step.
            loss = sum(model(images, targets).values())
            loss.backward()
            optimizer.step()
        else:
            # Mixed-precision step: forward under autocast, then backward and
            # optimizer step through the scaler.
            with autocast('cuda'):
                loss = sum(model(images, targets).values())
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        loss_value = loss.item()
        running += loss_value
        steps += 1
        progress.set_postfix(loss=f'{loss_value:.4f}')
    return running / max(steps, 1)


@torch.no_grad()
def val_loss(model, loader, device):
    """Return the mean detection loss over ``loader`` without updating the model.

    The model is called in train mode because that is the mode in which it
    returns losses. Train mode would also make normalisation layers update
    their running statistics from the validation batches (this happens even
    under ``no_grad``) and would keep dropout active. Those layers are
    therefore switched to eval behaviour for the duration of the call. The
    model's previous train/eval mode is restored before returning.

    Args:
        model: Detection model returning a dict of losses for
            ``model(images, targets)`` in train mode.
        loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the batch is moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches (the
        divisor is floored at 1, so an empty loader yields 0.0).
    """
    was_training = model.training
    model.train()  # Need train mode for loss
    stateful_layers = (
        torch.nn.modules.batchnorm._BatchNorm,
        torch.nn.modules.dropout._DropoutNd,
    )
    for module in model.modules():
        if isinstance(module, stateful_layers):
            module.eval()

    total, n = 0.0, 0
    try:
        for images, targets in tqdm(loader, desc='Val'):
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss = sum(model(images, targets).values())
            total += loss.item()
            n += 1
    finally:
        # train(mode) applies recursively, so the layers switched to eval
        # above are restored as well.
        model.train(was_training)
    return total / max(n, 1)

# Confirmation message printed once the utility definitions in this cell have run.
print('Utilities defined.')

## Train

In [None]:
# Begin with the backbone frozen so only the remaining layers are trained.
freeze_backbone(model, freeze=True)

# Two parameter groups: non-backbone parameters at LR, backbone at LR * 0.1.
# Both lists keep only parameters with requires_grad set, so the backbone
# group is empty while the backbone is frozen.
backbone_params = [w for w in model.backbone.parameters() if w.requires_grad]
head_params = [
    w for name, w in model.named_parameters()
    if w.requires_grad and not name.startswith('backbone')
]
param_groups = [
    {'params': head_params, 'lr': LR},
    {'params': backbone_params, 'lr': LR * 0.1},
]
optimizer = optim.SGD(param_groups, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=LR_MIN)

# Mixed precision is used only when requested and a CUDA device is selected.
amp_enabled = USE_AMP and DEVICE.type == 'cuda'
scaler = GradScaler('cuda') if amp_enabled else None

# Bookkeeping for checkpointing, early stopping and the training curves.
best_val, best_map = float('inf'), 0.0
patience_ctr = 0
history = []

print(f'Training for {EPOCHS} epochs on {DEVICE}')
print(f'AMP: {scaler is not None}, Backbone frozen for first {FREEZE_BACKBONE_EPOCHS} epochs')
print()

In [None]:
# Main training loop: train, validate, periodically compute mAP, checkpoint,
# and stop early when the validation loss has not improved for PATIENCE epochs.
for epoch in range(1, EPOCHS + 1):
    t0 = time.time()

    # Unfreeze backbone: rebuild the optimizer so it contains the backbone
    # parameters, continuing from the current LR, and start a fresh cosine
    # schedule that spans the remaining epochs.
    if epoch == FREEZE_BACKBONE_EPOCHS + 1:
        freeze_backbone(model, freeze=False)
        backbone_params = list(model.backbone.parameters())
        head_params = [p for n, p in model.named_parameters() if not n.startswith('backbone')]
        current_lr = scheduler.get_last_lr()[0]
        optimizer = optim.SGD([
            {'params': head_params, 'lr': current_lr},
            {'params': backbone_params, 'lr': current_lr * 0.1},
        ], momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS - epoch + 1, eta_min=LR_MIN)
        print(f'  >> Backbone unfrozen at epoch {epoch}')

    # Read the LR before scheduler.step() so the value logged for this epoch
    # is the one that was actually used to train it.
    lr = optimizer.param_groups[0]['lr']

    tloss = train_one_epoch(model, train_loader, optimizer, scaler, DEVICE, epoch)
    vloss = val_loss(model, val_loader, DEVICE)

    # mAP every 5 epochs (and on the final epoch); other epochs record 0.0
    mAP = 0.0
    if epoch % 5 == 0 or epoch == EPOCHS:
        mAP, per_cls = compute_map(model, val_loader, DEVICE)
        ap_str = ' | '.join(f'{CATEGORIES.get(c,c)}: {ap:.3f}' for c, ap in sorted(per_cls.items()))
        print(f'  mAP@0.5: {mAP:.4f}  [{ap_str}]')

    scheduler.step()
    elapsed = time.time() - t0

    print(f'Epoch {epoch:3d} | Train: {tloss:.4f} | Val: {vloss:.4f} | mAP: {mAP:.4f} | LR: {lr:.6f} | {elapsed:.1f}s')
    history.append({'epoch': epoch, 'train_loss': tloss, 'val_loss': vloss, 'mAP': mAP, 'lr': lr})

    # Save best (lowest validation loss); this also resets the patience counter
    if vloss < best_val:
        best_val = vloss
        patience_ctr = 0
        # max() includes this epoch's mAP, which is only folded into best_map
        # further down, so the checkpoint does not store a stale value.
        torch.save({'model': model.state_dict(), 'epoch': epoch, 'best_val': best_val,
                    'best_map': max(best_map, mAP)},
                   OUTPUT_DIR / 'best.pth')
        print(f'  -> Saved best model (val_loss={vloss:.4f})')
    else:
        patience_ctr += 1

    # Save best mAP separately; only epochs with an mAP evaluation can win
    if mAP > best_map:
        best_map = mAP
        torch.save({'model': model.state_dict(), 'epoch': epoch, 'best_map': best_map},
                   OUTPUT_DIR / 'best_map.pth')
        print(f'  -> Saved best mAP model ({mAP:.4f})')

    # Checkpoint every 10 epochs
    if epoch % 10 == 0:
        torch.save({'model': model.state_dict(), 'epoch': epoch},
                   OUTPUT_DIR / f'epoch_{epoch}.pth')

    if patience_ctr >= PATIENCE:
        print(f'\nEarly stopping at epoch {epoch}')
        break

print(f'\nDone! Best val_loss: {best_val:.4f}, Best mAP: {best_map:.4f}')

## Training Curves

In [None]:
import matplotlib.pyplot as plt

# Collect the series to plot from the per-epoch history records.
epochs = [h['epoch'] for h in history]
train_curve = [h['train_loss'] for h in history]
val_curve = [h['val_loss'] for h in history]

# Only epochs with a positive mAP are plotted.
map_points = [(h['epoch'], h['mAP']) for h in history if h['mAP'] > 0]
map_epochs = [point[0] for point in map_points]
map_vals = [point[1] for point in map_points]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: training and validation loss.
ax1.plot(epochs, train_curve, label='Train')
ax1.plot(epochs, val_curve, label='Val')
ax1.set_title('Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()

# Right panel: mAP@0.5 at the evaluated epochs.
ax2.plot(map_epochs, map_vals, 'go-')
ax2.set_title('mAP')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('mAP@0.5')

fig.tight_layout()
fig.savefig(OUTPUT_DIR / 'training_curves.png', dpi=150)
plt.show()

## Export ONNX

In [None]:
# Load best model and export to ONNX for mobile deployment.
# best_map.pth is only written when an mAP evaluation exceeded 0.0, so fall
# back to the best-validation-loss checkpoint when it does not exist.
ckpt_path = OUTPUT_DIR / 'best_map.pth'
if not ckpt_path.exists():
    ckpt_path = OUTPUT_DIR / 'best.pth'
    print(f'best_map.pth not found, exporting {ckpt_path.name} instead')
best_ckpt = torch.load(ckpt_path, map_location='cpu')

# Build the network with the same arguments as the training cell so the
# architecture matches the saved state dict; every weight is then overwritten
# by the checkpoint.
model_export = ssdlite320_mobilenet_v3_large(
    num_classes=NUM_CLASSES,
    weights_backbone=MobileNet_V3_Large_Weights.IMAGENET1K_V1,
)
model_export.load_state_dict(best_ckpt['model'])
model_export.eval()

# Trace with a dummy image at the configured training resolution.
onnx_path = OUTPUT_DIR / 'ssdlite_detection.onnx'
dummy = [torch.randn(3, INPUT_SIZE, INPUT_SIZE)]
torch.onnx.export(
    model_export, dummy, str(onnx_path),
    opset_version=17,
    input_names=['image'],
    output_names=['boxes', 'labels', 'scores'],
)
onnx_size = onnx_path.stat().st_size / 1024**2
print(f'ONNX exported: {onnx_size:.1f} MB')

## Save History & Download
Download the output files from `/kaggle/working/detection_output/`

In [None]:
import json
# Write the per-epoch metrics next to the checkpoints as JSON.
history_path = OUTPUT_DIR / 'history.json'
with open(history_path, 'w') as fh:
    json.dump(history, fh, indent=2)

# List every file in the output directory with its size in MB.
print('Output files:')
for artifact in sorted(OUTPUT_DIR.iterdir()):
    size_mb = artifact.stat().st_size / 1024**2
    print(f'  {artifact.name}: {size_mb:.1f} MB')