In [2]:
# prompt: implement a resnet convolutional network with pytorch for detecting boudning boxes and defining classes on images

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms

# Define the ResNet model
class ResNetBoundingBoxDetector(nn.Module):
    def __init__(self, num_classes, grid_size=7, num_anchors=1):
        super(ResNetBoundingBoxDetector, self).__init__()
        self.resnet = models.resnet18(pretrained=True)  # Use a pretrained ResNet
        num_features = self.resnet.fc.in_features

        # Replace the ResNet's FC layer with custom outputs
        self.resnet.fc = nn.Identity()

        # Bounding box regression: 4 coords (x, y, w, h) per anchor per grid cell
        self.fc_bbox = nn.Linear(num_features, grid_size * grid_size * num_anchors * 4)

        # Class prediction: num_classes per anchor per grid cell
        self.fc_class = nn.Linear(num_features, grid_size * grid_size * num_anchors * num_classes)

        self.sigmoid = nn.Sigmoid()  # To normalize bounding box coordinates (0-1)

        self.grid_size = grid_size
        self.num_anchors = num_anchors
        self.num_classes = num_classes

    def forward(self, x):
        # Feature extraction
        x = self.resnet(x)

        # Bounding box coordinates
        bbox_coords = self.sigmoid(self.fc_bbox(x))
        bbox_coords = bbox_coords.view(-1, self.grid_size, self.grid_size, self.num_anchors, 4)

        # Class scores
        class_scores = self.fc_class(x)
        class_scores = class_scores.view(-1, self.grid_size, self.grid_size, self.num_anchors, self.num_classes)

        return bbox_coords, class_scores

# Example usage:
num_classes = 10  # Number of classes
num_boxes = 1 # Number of bounding boxes per image

model = ResNetBoundingBoxDetector(36)


# Example input (replace with your actual image data)
dummy_input = torch.randn(1, 3, 224, 224)

# Perform a forward pass
bbox_coords, class_scores = model(dummy_input)


print("Bounding Box Coordinates:", bbox_coords)
print("Class Scores:", class_scores)

# Define loss functions and optimizer
# ... (add your loss function and optimizer here)
# Example:
# criterion_bbox = nn.MSELoss() # or another suitable loss function
# criterion_class = nn.CrossEntropyLoss() # assuming class_scores are not probabilities
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
# ... (add your training loop here)

# Evaluation
# ... (add your evaluation metrics)

Bounding Box Coordinates: tensor([[[[[0.4040, 0.4500, 0.5655, 0.4082]],

          [[0.6181, 0.7294, 0.6103, 0.5124]],

          [[0.6302, 0.4698, 0.5376, 0.4066]],

          [[0.7269, 0.3208, 0.5704, 0.5425]],

          [[0.3405, 0.6155, 0.5017, 0.2718]],

          [[0.6383, 0.5059, 0.5659, 0.3284]],

          [[0.3924, 0.4908, 0.4499, 0.4931]]],


         [[[0.5650, 0.7490, 0.6087, 0.2458]],

          [[0.6082, 0.4122, 0.2490, 0.5270]],

          [[0.4823, 0.4570, 0.6441, 0.4650]],

          [[0.5130, 0.3953, 0.6709, 0.4198]],

          [[0.4355, 0.3837, 0.7253, 0.2568]],

          [[0.6370, 0.2580, 0.6187, 0.3856]],

          [[0.5814, 0.6404, 0.5980, 0.7349]]],


         [[[0.6619, 0.3949, 0.5487, 0.4469]],

          [[0.4373, 0.5052, 0.3910, 0.3630]],

          [[0.5558, 0.4774, 0.4815, 0.6201]],

          [[0.4652, 0.4201, 0.4570, 0.5788]],

          [[0.2039, 0.6195, 0.7260, 0.4164]],

          [[0.4998, 0.3866, 0.1973, 0.5877]],

          [[0.6599, 0.4525, 0.

In [3]:
import os
import torch
from torch.utils.data import Dataset
import cv2
import numpy as np

class YOLODataset(Dataset):
    def __init__(self, img_dir, annot_dir, transform=None):
        self.img_dir = img_dir
        self.annot_dir = annot_dir
        self.transform = transform
        self.img_files = [f for f in os.listdir(img_dir) if f.endswith('.jpg')]

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_files[idx])
        annot_path = os.path.join(self.annot_dir, self.img_files[idx].replace('.jpg', '.txt'))

        # Load image
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0  # Normalize to [0, 1]
        height, width, _ = img.shape

        # Load annotations
        boxes = []
        labels = []
        with open(annot_path, 'r') as f:
            for line in f.readlines():
                cls, x_center, y_center, box_width, box_height = map(float, line.strip().split())
                x_min = (x_center - box_width / 2) * width
                y_min = (y_center - box_height / 2) * height
                x_max = (x_center + box_width / 2) * width
                y_max = (y_center + box_height / 2) * height
                boxes.append([x_min, y_min, x_max, y_max])
                labels.append(int(cls))

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)

        # Apply transformations (if any)
        if self.transform:
            augmented = self.transform(image=img, bboxes=boxes, class_labels=labels)
            img = augmented['image']
            boxes = torch.tensor(augmented['bboxes'], dtype=torch.float32)
            labels = torch.tensor(augmented['class_labels'], dtype=torch.int64)

        # Return the image and the target dictionary
        return torch.tensor(img).permute(2, 0, 1), {'boxes': boxes, 'labels': labels}


In [4]:
import torch
import torch.nn as nn
import torchvision.models as models

def create_model(num_classes, grid_size=7, num_anchors=1):
    """
    Creates a ResNet-based model for object detection with bounding box and class predictions.

    Args:
        num_classes (int): Number of classes for classification.
        grid_size (int): Size of the output grid (e.g., 7x7).
        num_anchors (int): Number of anchors per grid cell.

    Returns:
        nn.Module: The initialized ResNet-based detector model.
    """
    class ResNetBoundingBoxDetector(nn.Module):
        def __init__(self, num_classes, grid_size, num_anchors):
            super(ResNetBoundingBoxDetector, self).__init__()
            self.resnet = models.resnet18(pretrained=True)  # Use a pretrained ResNet
            num_features = self.resnet.fc.in_features

            # Replace the ResNet's FC layer with custom outputs
            self.resnet.fc = nn.Identity()

            # Bounding box regression: 4 coords (x, y, w, h) per anchor per grid cell
            self.fc_bbox = nn.Linear(num_features, grid_size * grid_size * num_anchors * 4)

            # Class prediction: num_classes per anchor per grid cell
            self.fc_class = nn.Linear(num_features, grid_size * grid_size * num_anchors * num_classes)

            self.sigmoid = nn.Sigmoid()  # To normalize bounding box coordinates (0-1)

            self.grid_size = grid_size
            self.num_anchors = num_anchors
            self.num_classes = num_classes

        def forward(self, x):
            # Feature extraction
            x = self.resnet(x)

            # Bounding box coordinates
            bbox_coords = self.sigmoid(self.fc_bbox(x))
            bbox_coords = bbox_coords.view(-1, self.grid_size, self.grid_size, self.num_anchors, 4)

            # Class scores
            class_scores = self.fc_class(x)
            class_scores = class_scores.view(-1, self.grid_size, self.grid_size, self.num_anchors, self.num_classes)

            return bbox_coords, class_scores

    # Return an instance of the detector
    return ResNetBoundingBoxDetector(num_classes=num_classes, grid_size=grid_size, num_anchors=num_anchors)


In [7]:
from torch.utils.data import DataLoader

# Paths
img_dir = "dataset/train/images"
annot_dir = "dataset/train/labels"

# Dataset and DataLoader
train_dataset = YOLODataset(img_dir, annot_dir)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

# Model setup
num_classes = 37
model = create_model(num_classes)
model = model.to('cuda')

# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

# Training loop
model.train()
for epoch in range(10):  # Train for 10 epochs
    epoch_loss = 0
    for images, targets in train_loader:
        images = list(image.to('cuda') for image in images)
        images = torch.stack(images).to('cuda')
        targets = [{k: v.to('cuda') for k, v in t.items()} for t in targets]

        # Forward pass
        loss_dict = model(images)
        losses = sum(loss for loss in loss_dict.values())

        # Backward pass
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss += losses.item()

    print(f"Epoch [{epoch+1}/10], Loss: {epoch_loss:.4f}")


RuntimeError: Input type (torch.cuda.DoubleTensor) and weight type (torch.cuda.FloatTensor) should be the same