In [1]:
import os
import sys
import numpy as np
from datetime import datetime

import copy
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

from torchvision.models import resnet50, ResNet50_Weights


from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import VOCDetection
import torchvision.transforms as transforms
from torchvision.models import alexnet, resnet50
from torchvision.transforms import functional as F
from xml.etree import ElementTree as ET
import pandas as pd
from PIL import Image
from sklearn.preprocessing import LabelEncoder

# models from torchvision
import torchvision.models as models
from torchvision.ops import RoIPool

sys.path.append('../../')  
from Object_Detection.RCNN import selective_search
from torchmetrics import Accuracy
from torchinfo import summary

from torch.utils.tensorboard import SummaryWriter


# Get Fast RCNN from torchvision


In [2]:
def parse_annotations(annotation):
    objects = annotation['annotation']['object']
    boxes = []
    classes = []
    for obj in objects:
        xmin = float(obj['bndbox']['xmin'])
        ymin = float(obj['bndbox']['ymin'])
        xmax = float(obj['bndbox']['xmax'])
        ymax = float(obj['bndbox']['ymax'])
        boxes.append([xmin, ymin, xmax, ymax])
        classes.append(obj['name'])
    return torch.tensor(boxes), classes  # Convert boxes to tensors, keep classes as list or map them to integers


In [3]:
# Define the transformation
transform = transforms.Compose([
    transforms.Resize((800, 800)),  # Resize images to a common size
    transforms.ToTensor(),  # Convert images to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class CustomVOCDataset(VOCDetection):
    def __getitem__(self, index):
        img, target = super(CustomVOCDataset, self).__getitem__(index)
        boxes, classes = parse_annotations(target)
        return img, boxes, classes

# Update the dataset instance with the custom class
dataset = CustomVOCDataset(root='./data/VOCdevkit/VOC2012', year='2012', image_set='train', download=True, transform=transform)
data_loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=0)


Using downloaded and verified file: ./data/VOCdevkit/VOC2012\VOCtrainval_11-May-2012.tar
Extracting ./data/VOCdevkit/VOC2012\VOCtrainval_11-May-2012.tar to ./data/VOCdevkit/VOC2012


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet50
from torchvision.ops import RoIPool
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Define the backbone network
weights = ResNet50_Weights.IMAGENET1K_V1
base_model = resnet50(weights=weights)
base_model = nn.Sequential(*list(base_model.children())[:-2])

# Define the RoI Pooling and classification head
roi_pool = RoIPool(output_size=(7, 7), spatial_scale=1.0 / 16)

num_classes = 21  # PASCAL VOC classes + background
classifier = nn.Sequential(
    nn.Linear(2048 * 7 * 7, 4096),  # Adjusted based on the RoI Pooling output
    nn.ReLU(inplace=True),
    nn.Linear(4096, 4096),
    nn.ReLU(inplace=True),
    nn.Linear(4096, num_classes + 4)  # class scores and bbox regressor outputs
)

gs = selective_search.get_selective_search()

# Label encoder to convert string labels to integers
label_encoder = LabelEncoder()
label_encoder.fit(['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 
                   'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'background'])


# Hyperparameters
num_epochs = 10
learning_rate = 0.001
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loss function and optimizer
criterion_cls = nn.CrossEntropyLoss()
criterion_reg = nn.SmoothL1Loss()
optimizer = optim.SGD(classifier.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0005)

# Training loop
classifier.to(device)
base_model.to(device)

for epoch in range(num_epochs):
    classifier.train()
    base_model.train()

    running_loss = 0.0
    for i, (images, annotations, targets) in enumerate(data_loader):

        # Generate region proposals using selective search
        image_array = images.permute(0, 2, 3, 1).numpy()
        image_array = np.array(image_array)[0]
        selective_search.config(gs, image_array, strategy='q')
        rects = selective_search.get_rects(gs)        
        print("Number of proposals:", len(rects))

        # Convert rects to the format expected by RoI Pooling
        rects = [[r[0], r[1], r[0] + r[2], r[1] + r[3]] for r in rects]
        rects = torch.tensor(rects).float().to(device)


        images = images.to(device)
        annotations = [annot.to(device) for annot in annotations]  # Assuming boxes is a list of tensors
        
        # Encode the string labels to integers
        label_targets = [torch.tensor(label_encoder.transform(target), dtype=torch.long).to(device) for target in targets]

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        features = base_model(images)
        print("Feature map shape:", features.shape)  # Debugging feature map shape
        
        # Apply RoI pooling
        roi_pooled_features = roi_pool(features, [rects])
        print("RoI pooled feature shape:", roi_pooled_features.shape)  # Debugging RoI pooled feature shape
        
        # Flatten RoI pooled features
        roi_pooled_features = roi_pooled_features.view(roi_pooled_features.size(0), -1)
        print("Flattened RoI pooled feature shape:", roi_pooled_features.shape)  # Debugging flattened feature shape

        # Forward pass through the classifier
        outputs = classifier(roi_pooled_features)

        # Split outputs into class scores and bounding box regressions
        cls_scores = outputs[:, :num_classes]
        bbox_regressions = outputs[:, num_classes:]

        # Compute losses for each RoI
        cls_loss = 0
        reg_loss = 0

        for j in range(len(label_targets)):
            cls_loss += criterion_cls(cls_scores[j].unsqueeze(0), label_targets[j])
            
            target_bbox = rects[j].view(1, -1)  # Ensure target is (1, 4)
            pred_bbox = bbox_regressions[j].view(1, -1)  # Ensure prediction is (1, 4)
            reg_loss += criterion_reg(pred_bbox, target_bbox)

        loss = cls_loss + reg_loss
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i == 1:  # Just for debugging
            print("Run the loop once")
            break

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(data_loader)}")


Number of proposals: 2914
Feature map shape: torch.Size([1, 2048, 25, 25])
RoI pooled feature shape: torch.Size([2914, 2048, 7, 7])
Flattened RoI pooled feature shape: torch.Size([2914, 100352])
Number of proposals: 4192
Feature map shape: torch.Size([1, 2048, 25, 25])
RoI pooled feature shape: torch.Size([4192, 2048, 7, 7])
Flattened RoI pooled feature shape: torch.Size([4192, 100352])
Run the loop once
Epoch [1/10], Loss: 0.35300503074028117
Number of proposals: 5314
Feature map shape: torch.Size([1, 2048, 25, 25])
RoI pooled feature shape: torch.Size([5314, 2048, 7, 7])
Flattened RoI pooled feature shape: torch.Size([5314, 100352])
Number of proposals: 746
Feature map shape: torch.Size([1, 2048, 25, 25])
RoI pooled feature shape: torch.Size([746, 2048, 7, 7])
Flattened RoI pooled feature shape: torch.Size([746, 100352])
Run the loop once
Epoch [2/10], Loss: 0.9378403534688211
Number of proposals: 3681
Feature map shape: torch.Size([1, 2048, 25, 25])
RoI pooled feature shape: torch.S