# Import and Download dataset

In [1]:
import kagglehub

# Fetch the dataset through kagglehub and remember the local directory it
# reports, then echo that location.
data_dir = kagglehub.dataset_download('andrewmvd/dog-and-cat-detection')
print(f'Path to dataset files: {data_dir}')

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\Admin\.cache\kagglehub\datasets\andrewmvd\dog-and-cat-detection\versions\1


In [2]:
import torch.backends.cudnn as cudnn
# Global cuDNN switches, set once before any model is built.
cudnn.benchmark = False  # do not let cuDNN auto-tune convolution algorithms
cudnn.deterministic = True  # ask cuDNN to use deterministic algorithms only
# NOTE(review): this turns cuDNN off altogether, which presumably makes the two
# flags above irrelevant and slows GPU convolutions — confirm it is intended.
cudnn.enabled = False

In [3]:
import os
import torch
import time
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import xml.etree.ElementTree as ET

from PIL import Image
from tqdm.auto import tqdm
from torchvision import transforms, models
from torch.utils.data import DataLoader, Dataset, Subset
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torchvision.models.resnet import ResNet18_Weights, ResNet50_Weights
from torch.cuda.amp import GradScaler, autocast

In [4]:
import sys
# Extend the import search path with the directory two levels up (relative
# paths here resolve against the working directory) so that the project-local
# `utils` package imported on the next line can be found.
sys.path.append('../..')
from utils.memory_tracker import MemoryTracker, safe_to_device

In [5]:
class ImageDataset(Dataset):
    """Dataset of horizontally merged image pairs with two annotated objects.

    Only images whose XML annotation contains exactly one ``object`` element
    are kept.  Sample ``idx`` pastes kept image ``idx`` and the following kept
    image (wrapping around at the end) side by side, and returns the merged
    image together with both bounding boxes expressed in coordinates
    normalized to the merged image.

    Args:
        annotations_dir: Directory holding one ``<image stem>.xml`` per image.
        image_dir: Directory holding the image files.
        transform: Optional callable applied to the merged PIL image.  When
            omitted, the image is converted with ``transforms.ToTensor()``.
    """

    # Label indices returned by ``parse_annotation``; any other name maps to -1.
    _CLASS_TO_IDX = {'cat': 0, 'dog': 1}

    def __init__(
        self,
        annotations_dir,
        image_dir,
        transform=None,
    ):
        self.annotations_dir = annotations_dir
        self.image_dir = image_dir
        self.transform = transform
        self.image_files = self.filter_images_with_multiple_objects()

    def filter_images_with_multiple_objects(self):
        """Return the file names in ``image_dir`` that have exactly one object.

        Names are visited in sorted order: ``os.listdir`` returns entries in
        arbitrary order, and the dataset indices (and any seeded index split
        built on them) are only reproducible when the order is fixed.
        """
        valid_image_files = []
        for img_name in sorted(os.listdir(self.image_dir)):
            if not os.path.isfile(os.path.join(self.image_dir, img_name)):
                continue
            annotation_name = os.path.splitext(img_name)[0] + '.xml'
            annotation_path = os.path.join(self.annotations_dir, annotation_name)

            num_objects = self.count_objects_in_annotation(annotation_path)
            if num_objects == 1:
                valid_image_files.append(img_name)
            elif num_objects > 1:
                print(f'Image {img_name} has multiple objects and will be excluded from the dataset')
            else:
                # Zero objects, a missing file, or unreadable XML.
                print(f'Image {img_name} has no usable annotation and will be excluded from the dataset')
        return valid_image_files

    def count_objects_in_annotation(self, annotation_path):
        """Return the number of ``object`` elements in an annotation file.

        A missing or malformed file counts as zero objects so that one bad
        annotation does not abort construction of the whole dataset.
        """
        try:
            root = ET.parse(annotation_path).getroot()
        except (FileNotFoundError, ET.ParseError):
            return 0
        return len(root.findall('object'))

    def parse_annotation(self, annotation_path):
        """Read the first object of an annotation file.

        Returns:
            ``(label_num, bbox)`` where ``label_num`` is 0 for ``cat``, 1 for
            ``dog`` and -1 for any other name, and ``bbox`` is a float32
            tensor ``[xmin, ymin, xmax, ymax]`` divided by the image width and
            height stored in the file.

        Raises:
            ValueError: If the annotation contains no ``object`` element.
        """
        root = ET.parse(annotation_path).getroot()

        img_width = int(root.find('size/width').text)
        img_height = int(root.find('size/height').text)

        obj = root.find('object')
        if obj is None:
            raise ValueError(f'No object found in annotation {annotation_path}')

        label = obj.find('name').text
        xmin = int(obj.find('bndbox/xmin').text)
        ymin = int(obj.find('bndbox/ymin').text)
        xmax = int(obj.find('bndbox/xmax').text)
        ymax = int(obj.find('bndbox/ymax').text)

        bbox = [
            xmin / img_width,
            ymin / img_height,
            xmax / img_width,
            ymax / img_height,
        ]

        label_num = self._CLASS_TO_IDX.get(label, -1)
        return label_num, torch.tensor(bbox, dtype=torch.float32)

    def __len__(self):
        return len(self.image_files)

    def _load_sample(self, file_name):
        """Open one image as RGB and parse its annotation.

        Returns ``(image, label_num, bbox)``.
        """
        annotation_name = os.path.splitext(file_name)[0] + ".xml"
        label, bbox = self.parse_annotation(
            os.path.join(self.annotations_dir, annotation_name)
        )
        image = Image.open(os.path.join(self.image_dir, file_name)).convert("RGB")
        return image, label, bbox

    def __getitem__(self, idx):
        """Return ``(merged_image, annotations)`` for sample ``idx``.

        ``annotations`` is a ``(2, 5)`` float32 tensor with one row
        ``[xmin, ymin, xmax, ymax, label]`` per object; the box values are
        normalized by the merged image's width and height.
        """
        # The partner image is the next kept file, wrapping to the first one.
        idx2 = (idx + 1) % len(self.image_files)
        img1, label1, bbox1 = self._load_sample(self.image_files[idx])
        img2, label2, bbox2 = self._load_sample(self.image_files[idx2])

        # Horizontal merge: both images are pasted at the top edge, img2 to
        # the right of img1.
        merged_w = img1.width + img2.width
        merged_h = max(img1.height, img2.height)
        merged_image = Image.new("RGB", (merged_w, merged_h))
        merged_image.paste(img1, (0, 0))
        merged_image.paste(img2, (img1.width, 0))

        merged_size = torch.tensor(
            [merged_w, merged_h, merged_w, merged_h], dtype=torch.float32
        )
        size1 = torch.tensor([img1.width, img1.height] * 2, dtype=torch.float32)
        size2 = torch.tensor([img2.width, img2.height] * 2, dtype=torch.float32)
        # Only x coordinates of the second image move (by img1's width).
        shift2 = torch.tensor([img1.width, 0, img1.width, 0], dtype=torch.float32)

        # Back to pixels in the source image, then normalize by the merged size.
        new_bbox1 = bbox1 * size1 / merged_size
        new_bbox2 = (bbox2 * size2 + shift2) / merged_size

        # Convert merged image to tensor
        if self.transform:
            merged_image = self.transform(merged_image)
        else:
            merged_image = transforms.ToTensor()(merged_image)

        annotations = torch.stack([
            torch.cat((new_bbox1, torch.tensor([float(label1)]))),
            torch.cat((new_bbox2, torch.tensor([float(label2)]))),
        ])

        return merged_image, annotations

In [6]:
# Locations of the annotation XML files and the images inside the download.
annotations_dir = os.path.join(data_dir, 'annotations')
image_dir = os.path.join(data_dir, 'images')

# Table with one row per regular file found in the image directory.
image_files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(image_dir, name)),
        os.listdir(image_dir),
    )
)
df = pd.DataFrame({'image_name': image_files})

# 80/20 split of the file table with a fixed seed.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
df

Unnamed: 0,image_name
0,Cats_Test0.png
1,Cats_Test1.png
2,Cats_Test10.png
3,Cats_Test100.png
4,Cats_Test1000.png
...,...
3681,Cats_Test995.png
3682,Cats_Test996.png
3683,Cats_Test997.png
3684,Cats_Test998.png


# Prepare dataset

In [7]:
# Resize every merged image to 224x224, convert it to a tensor and normalize
# each channel with the mean/std values listed below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

base_dataset = ImageDataset(annotations_dir, image_dir, transform)
# Get the total dataset size
dataset_size = len(base_dataset)

# Calculate train and validation sizes (80/20)
val_size = int(0.2 * dataset_size)
train_size = dataset_size - val_size

# Generate a seeded permutation of the sample indices
indices = np.arange(dataset_size)
np.random.seed(42)  # Ensure reproducibility
np.random.shuffle(indices)

# Split indices for train and validation sets
train_indices, val_indices = indices[:train_size], indices[train_size:]
# Debug aid (overfit on a single sample):
# train_indices, val_indices = indices[:1], indices[:1]

# NOTE(review): ImageDataset pairs sample idx with image idx + 1 of the full
# file list, so a training sample can contain an image that also appears in a
# validation sample — confirm whether that overlap is acceptable.
train_dataset = Subset(base_dataset, train_indices)
val_dataset = Subset(base_dataset, val_indices)

# On Windows, DataLoader workers are separate processes that must re-import
# the dataset class.  A class defined in a notebook cell cannot be imported
# there, so loading stays in the main process on that platform.
num_workers = 0 if os.name == "nt" else 4
loader_kwargs = dict(
    batch_size=8,
    num_workers=num_workers,
    # persistent_workers is rejected by DataLoader when there are no workers.
    persistent_workers=num_workers > 0,
    # Pinned host memory only helps host-to-GPU copies.
    pin_memory=torch.cuda.is_available(),
)

train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    drop_last=True,
    **loader_kwargs,
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    drop_last=False,
    **loader_kwargs,
)

num_classes = 2
class_to_idx = {"cat": 0, "dog": 1}

Image Cats_Test736.png has multiple objects and will be excluded from the dataset


# Model

In [8]:
class SimpleYOLO(nn.Module):
    """ResNet-50 feature extractor followed by a single linear detection head.

    For every image the head emits a flat vector of
    ``2 * 2 * (4 + num_classes)`` values: four box values plus
    ``num_classes`` class scores for each cell of a 2x2 grid.
    """

    def __init__(self, num_classes):
        super().__init__()
        resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)

        # Keep every child module of the ResNet except the last two, so the
        # backbone ends with a spatial feature map.
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])
        self.num_classes = num_classes

        # Detection head: 2048 pooled features -> 2x2 grid of predictions.
        self.fcs = nn.Linear(2048, 2 * 2 * (4 + self.num_classes))

    def forward(self, x):
        """Map images ``(batch_size, C, H, W)`` to ``(batch_size, 2*2*(4+num_classes))``."""
        feature_map = self.backbone(x)
        # Global average pooling -> (batch_size, 2048, 1, 1)
        pooled = F.adaptive_avg_pool2d(feature_map, (1, 1))
        # Drop the two singleton spatial dimensions -> (batch_size, 2048)
        flat = pooled.view(pooled.size(0), -1)
        return self.fcs(flat)

In [9]:
def calculate_loss(output, targets, device, num_classes):
    """Compute the grid-based detection loss averaged over the batch.

    For every object the grid cell containing the box centre is made
    responsible for it and contributes a cross-entropy term on its class
    scores plus an MSE term on its four box values.  Every cell of an image
    that is responsible for no object contributes an MSE term pulling its
    four box values towards zero.

    Args:
        output: Predictions of shape ``(batch, grid_h, grid_w, 4 + num_classes)``;
            the first four values of a cell are the box, the rest class scores.
        targets: Per image, a sequence of rows
            ``[xmin, ymin, xmax, ymax, label]`` with box values normalized to
            the image size.
        device: Device on which the target tensors are created.
        num_classes: Number of classes; labels must lie in ``[0, num_classes)``.

    Returns:
        The summed loss divided by the batch size.

    Raises:
        ValueError: If a target label is outside ``[0, num_classes)``.
    """
    mse_loss = nn.MSELoss()
    ce_loss = nn.CrossEntropyLoss()

    batch_size, grid_h, grid_w = output.shape[:3]
    total_loss = 0

    for i in range(batch_size):  # Iterate through each image in the batch
        occupied_cells = set()

        for target in targets[i]:  # Iterate through objects in the image
            # Cell that contains the box centre.  The index is clamped so a
            # centre on or beyond the image border still maps to a valid cell
            # instead of indexing out of range (or wrapping when negative).
            center_x = float(target[0] + target[2]) / 2
            center_y = float(target[1] + target[3]) / 2
            grid_x = min(max(int(center_x * grid_w), 0), grid_w - 1)
            grid_y = min(max(int(center_y * grid_h), 0), grid_h - 1)
            occupied_cells.add((grid_y, grid_x))

            label = int(target[4])
            if not 0 <= label < num_classes:
                raise ValueError(
                    f"Target label {label} is outside [0, {num_classes})"
                )

            # 1. Classification loss for the responsible grid cell
            class_scores = output[i, grid_y, grid_x, 4:].unsqueeze(0)
            class_target = torch.tensor([label], device=device)
            classification_loss = ce_loss(class_scores, class_target)

            # 2. Regression loss for the responsible grid cell
            bbox_target = target[:4].to(device)
            regression_loss = mse_loss(output[i, grid_y, grid_x, :4], bbox_target)

            total_loss += classification_loss + regression_loss

        # 3. No-object loss: cells without an object should predict a zero box
        no_obj_loss = 0
        for other_grid_y in range(grid_h):
            for other_grid_x in range(grid_w):
                if (other_grid_y, other_grid_x) not in occupied_cells:
                    no_obj_loss += mse_loss(
                        output[i, other_grid_y, other_grid_x, :4],
                        torch.zeros(4, device=device),
                    )

        total_loss += no_obj_loss

    return total_loss / batch_size  # Average loss over the batch

In [10]:
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two ``[xmin, ymin, xmax, ymax]`` boxes.

    Args:
        box1: Sequence or 1-D tensor whose first four values are the box corners.
        box2: Same layout as ``box1``.

    Returns:
        The IoU as a Python float in ``[0, 1]``.  Boxes with ``xmax < xmin``
        or ``ymax < ymin`` are treated as having zero area, and ``0.0`` is
        returned when the union has no area, so the function never divides
        by zero.
    """
    ax1, ay1, ax2, ay2 = (float(v) for v in box1[:4])
    bx1, by1, bx2, by2 = (float(v) for v in box2[:4])

    # Overlap along each axis; zero when the boxes do not overlap.
    inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter_area = inter_w * inter_h

    # Clamp side lengths so an inverted box cannot yield a negative area.
    box1_area = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1)
    box2_area = max(0.0, bx2 - bx1) * max(0.0, by2 - by1)

    union_area = box1_area + box2_area - inter_area
    if union_area <= 0.0:
        return 0.0

    return inter_area / union_area

In [11]:
from typing import Tuple, Dict, List

def _count_correct_classes(output, targets):
    """Count objects whose responsible grid cell predicts the right class.

    ``output`` has shape ``(batch, grid_h, grid_w, 4 + num_classes)`` and
    ``targets`` holds rows ``[xmin, ymin, xmax, ymax, label]`` per image.
    Returns ``(num_correct, num_objects)``.
    """
    grid_h, grid_w = output.shape[1:3]
    correct = objects = 0
    for image_output, image_targets in zip(output, targets):
        for target in image_targets:
            # Same cell assignment as the loss: the cell containing the box
            # centre, clamped to the grid.
            center_x = float(target[0] + target[2]) / 2
            center_y = float(target[1] + target[3]) / 2
            grid_x = min(max(int(center_x * grid_w), 0), grid_w - 1)
            grid_y = min(max(int(center_y * grid_h), 0), grid_h - 1)

            predicted = int(image_output[grid_y, grid_x, 4:].argmax())
            correct += int(predicted == int(target[4]))
            objects += 1
    return correct, objects


def train(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    dataloader: torch.utils.data.DataLoader,
    num_classes: int,
    scaler: torch.amp.GradScaler,
    device: torch.device,
    desc: str = 'Training',
    position: int = 1,
) -> Tuple[float, float]:
    """Run one training epoch.

    Args:
        model: Network whose output can be viewed as
            ``(batch, 2, 2, 4 + num_classes)``.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        dataloader: Yields ``(images, targets)`` batches.
        num_classes: Number of object classes.
        scaler: Gradient scaler used for the mixed-precision backward pass.
        device: Device the images are moved to and the loss is computed on.
        desc: Progress-bar description.
        position: Progress-bar position.

    Returns:
        ``(epoch_acc, epoch_loss)``: the fraction of objects whose responsible
        grid cell predicted the correct class, and the mean loss per sample.
        Batches that raise ``RuntimeError`` are reported and skipped.
    """
    model.train()
    device = torch.device(device)
    # float16 autocast is only enabled on CUDA.
    use_amp = device.type == 'cuda'
    total_loss = 0.0
    total_acc = total_objects = total_count = 0

    try:
        with tqdm(
            dataloader,
            desc=desc,
            unit='batch',
            total=len(dataloader),
            position=position,
            leave=True,
        ) as pbar:
            for batch_idx, (images, targets) in enumerate(pbar):
                try:
                    optimizer.zero_grad()
                    images = safe_to_device(images, device)
                    batch_size = images.shape[0]

                    with torch.autocast(
                        device_type=device.type,
                        dtype=torch.float16,
                        enabled=use_amp,
                    ):
                        output = model(images)
                        # Reshape to (batch_size, grid_y, grid_x, 4 + num_classes)
                        output = output.view(batch_size, 2, 2, 4 + num_classes)

                        loss = calculate_loss(output, targets, device, num_classes)

                    # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
                    # Backward passes under autocast are not recommended.
                    scaler.scale(loss).backward()

                    # scaler.step() first unscales the gradients of the optimizer's assigned params.
                    # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
                    # otherwise, optimizer.step() is skipped.
                    scaler.step(optimizer)

                    # Updates the scale for next iteration.
                    scaler.update()

                    # The loss is a per-batch mean; weight it by the batch
                    # size so the epoch value is a true per-sample mean.
                    total_loss += float(loss.detach().item()) * batch_size
                    total_count += batch_size

                    correct, objects = _count_correct_classes(output.detach(), targets)
                    total_acc += correct
                    total_objects += objects

                    del images, targets, output
                except RuntimeError as e:
                    print(f"\nError in training batch {batch_idx}: {str(e)}")
                    optimizer.zero_grad()
                    continue

                allocated, reserved = MemoryTracker.get_memory_stats()
                pbar.set_postfix({
                    'Loss': f'{total_loss/max(1, total_count):.4f}',
                    'Acc': f'{100.*total_acc/max(1, total_objects):.4f}%',
                    'Allocated GPU': f'{allocated:.2f}MB',
                    'Reserved GPU': f'{reserved:.2f}MB'
                })
    except Exception as e:
        print(f"\nTraining error: {str(e)}")
        MemoryTracker.clear_memory(model)
        raise

    epoch_loss = total_loss / max(1, total_count)
    epoch_acc = total_acc / max(1, total_objects)

    return epoch_acc, epoch_loss

def eval(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    device: torch.device,
    num_classes: int = 2,
    desc: str = 'Validating',
    position: int = 1,
    is_leaving = False
) -> Tuple[float, float]:
    """Evaluate the model on a dataloader and return ``(mAP, val_loss)``.

    NOTE: the function name shadows the built-in ``eval``; it is kept because
    other code in this file calls it by this name.

    Args:
        model: Network whose output can be viewed as
            ``(batch, 2, 2, 4 + num_classes)``.
        dataloader: Yields ``(images, targets)`` batches; ``targets`` is
            assumed to be a tensor of shape ``(batch, num_objects, 5)`` with
            rows ``[xmin, ymin, xmax, ymax, label]``.
        device: Device the images are moved to.
        num_classes: Number of object classes.
        desc: Progress-bar description.
        position: Progress-bar position.
        is_leaving: Whether the progress bar stays visible when finished.

    Returns:
        The mean average precision as a float, and the loss averaged over the
        batches that were evaluated successfully.  Batches that raise
        ``RuntimeError`` are reported and skipped.
    """
    model.eval()
    running_loss = 0.0
    num_batches = 0
    all_preds = []
    all_targets = []

    try:
        with tqdm(
            dataloader,
            desc=desc,
            unit='sample',
            unit_scale=dataloader.batch_size,
            position=position,
            leave=is_leaving,
        ) as pbar:
            with torch.no_grad():
                for batch_idx, (images, targets) in enumerate(pbar):
                    try:
                        images = safe_to_device(images, device)

                        output = model(images)
                        # Reshape output to (batch_size, grid_y, grid_x, 4 + num_classes)
                        output = output.view(images.shape[0], 2, 2, 4 + num_classes)
                        grid_h, grid_w = output.shape[1:3]

                        total_loss = calculate_loss(output, targets, device, num_classes)
                        running_loss += total_loss.item()
                        num_batches += 1

                        # Build exactly one prediction dict and one target
                        # dict per image so the two lists stay aligned.
                        for image_idx in range(images.shape[0]):
                            image_targets = targets[image_idx]

                            # For every object pick the grid cell whose box
                            # overlaps it most; each cell is submitted once.
                            # NOTE(review): the cell is chosen using the
                            # ground-truth box, which presumably biases mAP
                            # upward — confirm this protocol is intended.
                            chosen_cells = []
                            for target in image_targets:
                                best_iou = -1
                                best_cell = None

                                for grid_y in range(grid_h):
                                    for grid_x in range(grid_w):
                                        bbox_pred = output[image_idx, grid_y, grid_x, :4]

                                        # Skip cells that predict an (almost) all-zero box
                                        if torch.all(torch.round(bbox_pred, decimals=2) == 0):
                                            continue

                                        iou = calculate_iou(bbox_pred, target[:4])

                                        if iou > best_iou:
                                            best_iou = iou
                                            best_cell = (grid_y, grid_x)

                                if best_cell is not None and best_cell not in chosen_cells:
                                    chosen_cells.append(best_cell)

                            if chosen_cells:
                                cells = torch.stack([
                                    output[image_idx, grid_y, grid_x]
                                    for grid_y, grid_x in chosen_cells
                                ]).detach().float().cpu()
                                class_probs = torch.softmax(cells[:, 4:], dim=1)
                                scores, labels = class_probs.max(dim=1)
                                boxes = cells[:, :4]
                            else:
                                # No usable prediction: empty tensors keep the
                                # image in the metric as a pure miss.
                                boxes = torch.zeros((0, 4))
                                scores = torch.zeros((0,))
                                labels = torch.zeros((0,), dtype=torch.long)

                            all_preds.append({
                                "boxes": boxes,
                                "scores": scores,
                                "labels": labels,
                            })
                            all_targets.append({
                                "boxes": image_targets[:, :4].detach().float().cpu(),
                                "labels": image_targets[:, 4].detach().long().cpu(),
                            })

                        allocated, reserved = MemoryTracker.get_memory_stats()
                        pbar.set_postfix({
                            'Loss': f'{running_loss/max(1, num_batches):.4f}',
                            'Allocated GPU': f'{allocated:.2f}MB',
                            'Reserved GPU': f'{reserved:.2f}MB'
                        })

                    except RuntimeError as e:
                        print(f"\nError in validation batch {batch_idx}: {str(e)}")
                        continue

    except Exception as e:
        print(f"\nValidation error: {str(e)}")
        MemoryTracker.clear_memory(model)
        raise

    # Average over the batches that actually contributed to running_loss.
    val_loss = running_loss / max(1, num_batches)

    # Calculate mAP
    metric = MeanAveragePrecision()
    metric.update(all_preds, all_targets)
    mAP = float(metric.compute()["map"])

    return mAP, val_loss

In [12]:
def fit(
    model: torch.nn.Module,
    num_classes: int,
    optimizer: torch.optim.Optimizer,
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    device: torch.device,
    epochs: int,
) -> Dict[str, List[float]]:
    """Train and validate for ``epochs`` epochs and collect per-epoch statistics.

    Each epoch calls ``train`` and then ``eval``, timing the pair and reading
    the memory statistics afterwards.  An epoch that raises is reported, the
    memory is cleared and the loop moves on; an error outside the epoch body
    is reported and ends the loop.  Whatever was recorded is returned.

    Returns:
        Dict with the lists ``train_acc``, ``train_loss``, ``val_acc``,
        ``val_loss``, ``epoch_times``, ``gpu_allocated`` and ``gpu_reserved``;
        each list has one entry per epoch that completed.
    """
    metric_names = (
        'train_acc', 'train_loss',
        'val_acc', 'val_loss',
        'epoch_times', 'gpu_allocated', 'gpu_reserved',
    )
    history = {name: [] for name in metric_names}

    try:
        scaler = GradScaler()

        with tqdm(range(epochs), desc="Epochs", position=0, leave=True) as epoch_pbar:
            for epoch in epoch_pbar:
                try:
                    started_at = time.time()

                    train_acc, train_loss = train(
                        model,
                        optimizer,
                        train_loader,
                        num_classes,
                        scaler,
                        device,
                        desc=f"Epoch {epoch+1}/{epochs} [Train]",
                        position=0,
                    )
                    val_acc, val_loss = eval(
                        model,
                        val_loader,
                        device,
                        num_classes,
                        desc=f"Epoch {epoch+1}/{epochs} [Val]",
                        position=0,
                        is_leaving=True,
                    )

                    elapsed = time.time() - started_at
                    allocated, reserved = MemoryTracker.get_memory_stats()

                    # Values are listed in the same order as metric_names.
                    epoch_values = (
                        train_acc, train_loss,
                        val_acc, val_loss,
                        elapsed, allocated, reserved,
                    )
                    for name, value in zip(metric_names, epoch_values):
                        history[name].append(value)

                except Exception as e:
                    print(f"\nError in epoch {epoch + 1}: {str(e)}")
                    MemoryTracker.clear_memory(model)
                    continue

    except Exception as e:
        print(f"\nTraining loop error: {str(e)}")
        MemoryTracker.clear_memory(model)

    return history

In [None]:
num_epochs = 10

# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the detector and move it to the chosen device before creating the
# optimizer over its parameters.
model = safe_to_device(SimpleYOLO(num_classes), device)
optimizer = optim.Adam(model.parameters(), lr=2e-3)

print(device)
print(model)

history = fit(
    model, num_classes, optimizer, train_loader, val_loader, device, num_epochs
)
    



cpu
SimpleYOLO(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Con

Epoch 1/10 [Train]:   0%|          | 0/368 [00:00<?, ?batch/s]

In [None]:
# `fit` skips epochs that raise, so the list can be empty; report 0.0 in that
# case instead of letting max() fail on an empty sequence.
max_memory = max(history['gpu_allocated'], default=0.0)
print(f"Peak GPU memory usage: {max_memory:.2f} MB")

# Visualize prediction

In [None]:
def visualize_data_samples(dataset, num_rows=3, num_cols=3, start_idx=0, class_to_idx=None):
    """
    Visualize samples from dataset with bounding boxes and labels

    Args:
        dataset: Dataset to visualize; each item is ``(image, annotations)``
            where every annotation row is ``[xmin, ymin, xmax, ymax, label]``
            with box values normalized to the image size
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid
        start_idx: Starting index in dataset
        class_to_idx: Dictionary mapping class names to indices.  When given,
            each box is captioned with the class name (or the raw label index
            if the mapping has no entry for it)

    Grid positions that fall past the end of the dataset are left empty.
    """
    # Inverse of Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    denorm = transforms.Normalize(
        mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225],
        std=[1/0.229, 1/0.224, 1/0.225]
    )
    to_pil = transforms.ToPILImage()

    # Invert the mapping once so label indices can be shown as class names.
    idx_to_class = (
        {index: name for name, index in class_to_idx.items()}
        if class_to_idx
        else {}
    )

    # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 15), squeeze=False)
    axes = axes.flatten()

    for offset, ax in enumerate(axes):
        sample_idx = start_idx + offset
        ax.axis("off")
        if sample_idx >= len(dataset):
            continue

        merged_image, annotations = dataset[sample_idx]

        # Denormalize the image before converting to PIL
        if isinstance(merged_image, torch.Tensor):
            # Clamp values to [0,1] range after undoing the normalization
            merged_image = torch.clamp(denorm(merged_image), 0, 1)
            image = to_pil(merged_image)
        else:
            image = merged_image

        ax.imshow(image)
        width, height = image.size

        # Draw each bounding box on the image
        for ann in annotations:
            label = int(ann[4].item())

            # Scale bounding box coordinates to image size
            x_min = float(ann[0]) * width
            y_min = float(ann[1]) * height
            x_max = float(ann[2]) * width
            y_max = float(ann[3]) * height

            # Create a rectangle patch
            rect = patches.Rectangle(
                (x_min, y_min),
                x_max - x_min,
                y_max - y_min,
                linewidth=2,
                edgecolor="g",
                facecolor="none",
            )
            ax.add_patch(rect)

            # Add label text if class_to_idx is provided
            if class_to_idx:
                label_name = idx_to_class.get(label, str(label))
                ax.text(
                    x_min,
                    y_min - 5,
                    label_name,
                    color="r",
                    fontsize=10,
                    bbox=dict(facecolor="white", alpha=0.7),
                )

        ax.set_title(f"Sample {sample_idx}")

    plt.tight_layout()
    plt.show()

# Plot the first 3x3 grid (validation samples 0-8) with their annotated boxes
visualize_data_samples(val_dataset, start_idx=0, class_to_idx=class_to_idx)