# Yolo on Fisheye
We have two methods to solve this fisheye detection problem: 

Method I:
1. Dewarp fisheye images
2. Run a yolo detector on it.
3. Map predicted boxes back to fisheye coordinates

Method II: 
1. Augment fisheye images a little bit for improving detections
2. Train yolo detector directly on fisheye frames
3. Map predicted boxes back to fisheye coordinates

## Method II

In [1]:
import torch
import os
from PIL import Image
import torchvision
from torch.utils.data import DataLoader, Dataset
from pycocotools.coco import COCO
import albumentations as A
from albumentations.pytorch import ToTensorV2
import glob
import numpy as np

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# class FishEyeData(Dataset):
#     def __init__(self, img_dir, ann_file, transforms=None):
#         self.coco = COCO(ann_file)
#         self.img_dir = img_dir
#         self.ids = list(self.coco.imgs.keys())
#         self.transforms = transforms

#     def __getitem__(self, idx):
#         img_id = self.ids[idx]
#         img_info = self.coco.loadImgs(img_id)[0]
#         path = os.path.join(self.img_dir, img_info['file_name'])
#         image = Image.open(path)

#         ann_ids = self.coco.getAnnIds(img_id)
#         anns = self.coco.loadAnns(ann_ids)
#         bboxes, labels = [], []
#         for ann in anns:
#             x, y, w, h = ann['bbox']
#             bboxes.append([x, y, x + w, y + h])
#             labels.append(ann['category_id'])

#         if self.transforms:
#             augmented = self.transforms(image=image, bboxes=bboxes, category_ids=labels)
#             image = augmented['image']
#             bboxes = torch.tensor(augmented['bboxes'], dtype=torch.float32)
#             labels = torch.tensor(augmented['category_ids'], dtype=torch.int64)
#         else:
#             image = ToTensorV2()(image=image)['image']
#             bboxes = torch.tensor(bboxes, dtype=torch.float32)
#             labels = torch.tensor(labels, dtype=torch.int64)

#         target = {'boxes': bboxes, 'labels': labels}
#         return image, target

#     def __len__(self):
#         return len(self.ids)


In [4]:
class FishEyeData(Dataset):
    def __init__(self, img_dir, label_dir, transforms=None):
        self.img_paths = sorted(glob.glob(os.path.join(img_dir, '*.png')))
        self.label_dir = label_dir
        self.transforms = transforms

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        img = Image.open(img_path)
        img = np.array(img)
        h, w = img.shape[:2]
        # Load YOLO labels: class x_center y_center width height (normalized)
        base = os.path.splitext(os.path.basename(img_path))[0]
        lbl_path = os.path.join(self.label_dir, base + '.txt')
        bboxes, labels = [], []
        if os.path.exists(lbl_path):
            with open(lbl_path, 'r') as f:
                for line in f:
                    cls, x_c, y_c, bw, bh = map(float, line.split())
                    # convert to pixel coords
                    x_c *= w; y_c *= h; bw *= w; bh *= h
                    x1 = x_c - bw/2; y1 = y_c - bh/2
                    x2 = x_c + bw/2; y2 = y_c + bh/2
                    bboxes.append([x1, y1, x2, y2])
                    labels.append(int(cls))
        # Apply transforms (Albumentations expects 'bboxes' in pascal_voc format)
        if self.transforms:
            augmented = self.transforms(image=img, bboxes=bboxes, category_ids=labels)
            img = augmented['image']
            bboxes = torch.tensor(augmented['bboxes'], dtype=torch.float32)
            labels = torch.tensor(augmented['category_ids'], dtype=torch.int64)
        else:
            img = ToTensorV2()(image=img)['image']
            bboxes = torch.tensor(bboxes, dtype=torch.float32)
            labels = torch.tensor(labels, dtype=torch.int64)
        target = { 'boxes': bboxes, 'labels': labels }
        return img, target


In [5]:
train_transforms = A.Compose([
    A.OpticalDistortion(distort_limit=0.3, shift_limit=0.0, p=0.7),
    A.Rotate(limit=15, p=0.5),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05, p=0.5),
    A.HorizontalFlip(p=0.5),
    A.Resize(640, 640),
    ToTensorV2()
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids']))

val_transforms = A.Compose([
    A.Resize(640, 640),
    ToTensorV2()
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids']))


  A.OpticalDistortion(distort_limit=0.3, shift_limit=0.0, p=0.7),


In [6]:
train_img_folder = '/media/abhitemb/DATA/Users/abhitemb/Documents/FishEye/TrainVal/Fisheye8K_all_including_train&test/train/images'
train_ann_file   = '/media/abhitemb/DATA/Users/abhitemb/Documents/FishEye/TrainVal/Fisheye8K_all_including_train&test/train/images'
val_img_folder   = '/absolute/path/to/FishEye8K/images/val'
val_ann_file     = '/absolute/path/to/FishEye8K/annotations/instances_val.json'

# 4. DataLoaders
train_dataset = FishEyeData(train_img_folder, train_ann_file, transforms=train_transforms)
# val_dataset   = FishEyeData(val_img_folder, val_ann_file, transforms=val_transforms)
train_loader  = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
# val_loader    = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

In [7]:
len(train_dataset)

2438

In [8]:
num_classes = 6  # 5 classes + background
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [9]:
# 6. Optimizer and scheduler
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)


In [10]:
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for images, targets in train_loader:
        print(len(images), targets)

        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        loss = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    lr_scheduler.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")


4 ({'boxes': tensor([]), 'labels': tensor([], dtype=torch.int64)}, {'boxes': tensor([]), 'labels': tensor([], dtype=torch.int64)}, {'boxes': tensor([]), 'labels': tensor([], dtype=torch.int64)}, {'boxes': tensor([]), 'labels': tensor([], dtype=torch.int64)})


AssertionError: Expected target boxes to be a tensor of shape [N, 4], got torch.Size([0]).

In [None]:
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), 'models/fasterrcnn_fisheye_aug.pth')

# 9. Inference example
model.eval()
example = val_dataset[0][0].unsqueeze(0).to(device)
with torch.no_grad():
    output = model(example)
print(output)