# Building an Object Detection Algorithm from Scratch
In this notebook, we will implement a simplified object detection algorithm from scratch using PyTorch. We will create a model, define a dataset, and train the model.

## Step 1: Set Up the Environment
We start by importing the necessary libraries.

In [1]:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torchvision.transforms import functional as F
from PIL import Image


## Step 2: Define the Model
We will define a simple object detection model with a backbone CNN for feature extraction and a prediction head.

In [2]:

class SimpleObjectDetector(nn.Module):
    def __init__(self, num_classes=2, grid_size=7, bbox_per_cell=2):
        super(SimpleObjectDetector, self).__init__()
        self.num_classes = num_classes
        self.grid_size = grid_size
        self.bbox_per_cell = bbox_per_cell

        # Backbone CNN
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Prediction head
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(32 * (grid_size // 2) * (grid_size // 2), 512),
            nn.ReLU(),
            nn.Linear(512, grid_size * grid_size * (bbox_per_cell * 5 + num_classes))
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x.view(-1, self.grid_size, self.grid_size, self.bbox_per_cell * 5 + self.num_classes)


## Step 3: Define the Dataset
We will create a custom dataset class that returns images and corresponding bounding boxes and labels.

In [3]:

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, images, annotations, transform=None):
        self.images = images
        self.annotations = annotations
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        bbox = self.annotations[idx]['boxes']
        labels = self.annotations[idx]['labels']

        if self.transform:
            image = self.transform(image)

        return image, bbox, labels


## Step 4: Define the Loss Function
We need to calculate losses for both the bounding boxes and the class predictions.

In [8]:
def compute_loss(predictions, targets, grid_size, num_classes, bbox_per_cell):
    # Placeholder for loss computation
    return torch.tensor(0.0)

## Step 5: Train the Model
We will initialize the dataset, model, optimizer, and dataloader, and define a training loop.

In [9]:
images = [torch.rand(3, 224, 224) for _ in range(10)]  # Random images
annotations = [{'boxes': torch.tensor([[50, 50, 100, 100]]), 'labels': torch.tensor([1])} for _ in range(10)]

# Dataset and DataLoader
dataset = CustomDataset(images, annotations)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)

# Model, optimizer, and loss
model = SimpleObjectDetector(num_classes=2)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
for img, bbox, labels in dataloader:
    print(labels)
    break

tensor([[1],
        [1]])


In [20]:
for epoch in range(10):
    for batch_idx, (images, bboxes, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        predictions = model(images)
        loss = compute_loss(predictions, bboxes, grid_size=7, num_classes=2, bbox_per_cell=2)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}")


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [6]:
import torch.nn as nn
import torch

class SimpleObjectDetector(nn.Module):
    def __init__(self, num_classes):
        super(SimpleObjectDetector, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.flatten_size = 64 * 56 * 56  # Adjust based on image size and conv layers
        self.fc = nn.Linear(self.flatten_size, num_classes)

    def forward(self, x):
        x = self.conv(x)  # Apply convolutions
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)  # Apply fully connected layer
        return x


## Step 6: Visualize Predictions
Finally, we visualize the predictions made by the model.

In [None]:

def visualize_predictions(image, predictions):
    image = image.permute(1, 2, 0).numpy()
    plt.imshow(image)
    plt.axis('off')
    plt.show()

# Example usage
image = torch.rand(3, 224, 224)  # Random image
predictions = model(image.unsqueeze(0))
visualize_predictions(image, predictions)


# YOLO

YOLO <a href='https://arxiv.org/pdf/1506.02640'>paper</a> by Facebook 

In [None]:
bike_img = 'dirtbike.jpg'
piano_img = 'piano.jpg'

In [None]:
from ultralytics import YOLO
import cv2
import matplotlib.pyplot as plt

In [None]:
model = YOLO('yolov8n.pt')  # You can use other models like 'yolov8s.pt', etc.

In [None]:
image = cv2.imread(bike_img)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB for visualization

In [None]:
# Perform object detection
%time results = model(image)

In [None]:
# Access the first result (since results is a list)
result = results[0]

# Visualize predictions (bounding boxes, labels)
annotated_image = result.plot()  # `plot()` returns an annotated image

In [None]:
# Display the annotated image
plt.imshow(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

In [None]:
# Optional: Print detection details
for box, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
    print(f"Class: {model.names[int(cls)]}, Confidence: {conf:.2f}, Box: {box.tolist()}")

In [None]:
def plot_boxed_img(img_path: str):
    image = cv2.imread(img_path)
#     image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = model(image)[0]
    annotated_image = results.plot()  
    plt.imshow(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

In [None]:
plot_boxed_img(piano_img)

## How does Yolo work?

YOLO divides the image into grids (squares). For each grid it predicts the probability of any given class, for example 20 classes. If the probability is not above some threshold, nothing is predicted as true

Once it DID predict something as true, if the object is not in the grid's center, it won't predict anything, although there IS something there. Once it DOES find an object that its center is in the grid, it predicts the following things:

1. The center offset, relative to the cell
2. The bounding box size (width and height) of the object
3. The class

YOLO implementation written by ChatGPT

In [None]:
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Residual Block with two convolutional layers."""
    def __init__(self, in_channels, filters):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, filters, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(filters)
        self.conv2 = nn.Conv2d(filters, in_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(in_channels)
        self.relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        residual = x
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return self.relu(x + residual)

class DarknetBlock(nn.Module):
    """Darknet block with N residual blocks."""
    def __init__(self, in_channels, filters, num_blocks):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Conv2d(in_channels, filters, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(filters),
            nn.LeakyReLU(0.1),
            *[ResidualBlock(filters, filters // 2) for _ in range(num_blocks)]
        )

    def forward(self, x):
        return self.layers(x)

class Darknet53(nn.Module):
    """Backbone: Darknet-53."""
    def __init__(self):
        super().__init__()
        self.initial_conv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.1),
        )
        self.block1 = DarknetBlock(32, 64, 1)
        self.block2 = DarknetBlock(64, 128, 2)
        self.block3 = DarknetBlock(128, 256, 8)
        self.block4 = DarknetBlock(256, 512, 8)
        self.block5 = DarknetBlock(512, 1024, 4)

    def forward(self, x):
        outputs = []
        x = self.initial_conv(x)
        x = self.block1(x)
        x = self.block2(x)
        outputs.append(x)  # First scale
        x = self.block3(x)
        outputs.append(x)  # Second scale
        x = self.block4(x)
        outputs.append(x)  # Third scale
        x = self.block5(x)
        return outputs

class YOLOHead(nn.Module):
    """YOLO Head for bounding box and class prediction."""
    def __init__(self, in_channels, num_anchors, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, in_channels // 2, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(in_channels // 2)
        self.conv2 = nn.Conv2d(in_channels // 2, num_anchors * (5 + num_classes), kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        x = nn.LeakyReLU(0.1)(self.bn1(self.conv1(x)))
        x = self.conv2(x)
        return x

class YOLOv3(nn.Module):
    """YOLOv3 Model."""
    def __init__(self, num_classes=80, num_anchors=3):
        super().__init__()
        self.backbone = Darknet53()
        self.head1 = YOLOHead(1024, num_anchors, num_classes)
        self.head2 = YOLOHead(512, num_anchors, num_classes)
        self.head3 = YOLOHead(256, num_anchors, num_classes)

    def forward(self, x):
        backbone_outputs = self.backbone(x)
        head1_out = self.head1(backbone_outputs[-1])  # Largest scale
        head2_out = self.head2(backbone_outputs[-2])  # Medium scale
        head3_out = self.head3(backbone_outputs[-3])  # Smallest scale
        return head1_out, head2_out, head3_out

In [None]:
# Example usage
model = YOLOv3(num_classes=80, num_anchors=3)
dummy_input = torch.randn(1, 3, 416, 416)
output = model(dummy_input)
for out in output:
    print(out.shape)  # Output shapes for 3 scales

YOLO uses a skip connection layer, which basically means moving a layer's output straight to another layer input. For example if I have 5 layers, I'll take the output of layer 3 and put it straight into layer 5 without passing through layer 4

A special kind of skip connection is the residual connection layer, which basically means adding the non processed input to the processed input, and together they make an output.

For example I have 5 layers. Layer 2 gets input, does something and outputs it. In a skip connection layer, the 3rd layer will get both the processed input and (summed) the raw input to layer 2.

It helps especially with vanishing gradients