In [104]:
%matplotlib inline
import os
import sys
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

import torchvision
from torchvision import models

import torchvision.transforms as transforms
from torch.utils.data import ConcatDataset
import torch.optim as optim
from torchmetrics import Accuracy
from torchinfo import summary
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import VOCDetection

sys.path.append('../')  
from Object_Detection import selective_search
from Object_Detection.bbox import calculate_iou
from PIL import Image

import matplotlib.pyplot as plt
import numpy as np

# Settings

# Print tensors with 3 digits of precision so displayed values stay compact.
torch.set_printoptions(precision=3)

In [105]:
# PASCAL VOC 2012 detection data, 'train' split, stored under ./data.
# No transform is attached: samples come back as (PIL image, annotation dict)
# and are preprocessed per region later in the notebook.
voc_dataset = VOCDetection(
    root='./data',
    year='2012',
    image_set='train',
    download=True,
)

Using downloaded and verified file: ./data\VOCtrainval_11-May-2012.tar
Extracting ./data\VOCtrainval_11-May-2012.tar to ./data


# Load model and its finetuned weights (PASCAL VOC)

In [None]:
# Step 1: Build a ResNet50 without pretrained weights.
# `weights=None` is the supported way to ask for random initialisation; passing
# a bool (`weights=False`) is the deprecated pre-0.13 style and only works via a
# compatibility shim that emits a warning.
resnet50 = models.resnet50(weights=None)

# Swap the classifier head for a 20-output layer so the layer shapes match the
# PASCAL VOC fine-tuned checkpoint that is loaded next.
resnet50.fc = torch.nn.Linear(resnet50.fc.in_features, 20)

In [107]:
# Step 2: Load the fine-tuned PASCAL VOC weights.
# Make sure 'resnet50_pascal_voc.pth' is reachable from the notebook's working directory.
# map_location='cpu' deserialises the tensors onto the CPU, so a checkpoint that
# was saved from a GPU still loads on a machine without CUDA; the model itself is
# moved to the GPU afterwards only when one is available.
resnet50.load_state_dict(torch.load('resnet50_pascal_voc.pth', map_location='cpu'))

<All keys matched successfully>

In [None]:
# Layer-by-layer overview of the full classifier for a single 227x227 RGB input.
summary(
    model=resnet50,
    input_size=(1, 3, 227, 227),
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    col_width=20,
    row_settings=['var_names'],
    verbose=0,
)

In [108]:
class ResNet50Modified(nn.Module):
    """Feature extractor made of every top-level child of a model except the last."""

    def __init__(self, original_model):
        """Wrap ``original_model``'s children, minus the final one, in a Sequential.

        Args:
            original_model: The ``nn.Module`` whose layers (and weights) are reused.
        """
        super().__init__()
        # Only the LAST child is dropped. For the ResNet50 used in this notebook
        # that is the `fc` classifier; the average-pooling layer is kept, which
        # is why the extractor outputs (N, 2048, 1, 1).
        backbone_layers = list(original_model.children())
        self.features = nn.Sequential(*backbone_layers[:-1])

    def forward(self, x):
        """Return the output of the retained layers for the input batch ``x``."""
        return self.features(x)

# Build the feature extractor on top of the already-loaded weights and switch it
# to evaluation mode in the same expression (nn.Module.eval() returns the module).
modified_resnet50 = ResNet50Modified(resnet50).eval()

# Run on the GPU when one is present; otherwise the model stays on the CPU.
if torch.cuda.is_available():
    modified_resnet50 = modified_resnet50.cuda()

In [57]:
# Layer-by-layer overview of the feature extractor for a single 227x227 RGB input.
summary(
    model=modified_resnet50,
    input_size=(1, 3, 227, 227),
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    col_width=20,
    row_settings=['var_names'],
    verbose=0,
)

Layer (type (var_name))                       Input Shape          Output Shape         Param #              Trainable
ResNet50Modified (ResNet50Modified)           [1, 3, 227, 227]     [1, 2048, 1, 1]      --                   True
├─Sequential (features)                       [1, 3, 227, 227]     [1, 2048, 1, 1]      --                   True
│    └─Conv2d (0)                             [1, 3, 227, 227]     [1, 64, 114, 114]    9,408                True
│    └─BatchNorm2d (1)                        [1, 64, 114, 114]    [1, 64, 114, 114]    128                  True
│    └─ReLU (2)                               [1, 64, 114, 114]    [1, 64, 114, 114]    --                   --
│    └─MaxPool2d (3)                          [1, 64, 114, 114]    [1, 64, 57, 57]      --                   --
│    └─Sequential (4)                         [1, 64, 57, 57]      [1, 256, 57, 57]     --                   True
│    │    └─Bottleneck (0)                    [1, 64, 57, 57]      [1, 256, 57, 57]    

# Generate features 

In [149]:
# Display the dataset summary (number of datapoints and root location).
voc_dataset

Dataset VOCDetection
    Number of datapoints: 5717
    Root location: ./data

In [109]:
# Load the first sample of the VOC dataset: `image` is the picture and `target`
# is its annotation. Keep a NumPy copy of the image as well.

image, target = voc_dataset[0]
image_array = np.array(image)

In [161]:
# Per-region preprocessing: resize each crop to the 227x227 input size used
# throughout this notebook, convert it to a tensor, then normalise per channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics —
# confirm they match what was used when the network was fine-tuned.
preprocess = transforms.Compose([
    transforms.Resize((227, 227)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def batch_process_features(model, regions):
    """Run a batch of cropped regions through ``model`` and return its output.

    Args:
        model: ``torch.nn.Module`` used as the feature extractor.
        regions: Non-empty list of images accepted by the module-level
            ``preprocess`` pipeline (PIL crops in this notebook). An empty list
            is rejected by ``torch.stack``.

    Returns:
        ``numpy.ndarray`` with the model output for the stacked batch, copied to
        the CPU. The output is not reshaped here, so everything after the batch
        dimension is whatever ``model`` produces.
    """
    # Follow the model's own device instead of calling .cuda() unconditionally,
    # so the helper also works when the model stayed on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Preprocess every region and stack the results into one batch tensor.
    regions_tensor = torch.stack([preprocess(region) for region in regions]).to(device)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        features = model(regions_tensor)
    return features.cpu().numpy()

gs = selective_search.get_selective_search()

# Tunables for this cell.
REGION_BATCH_SIZE = 32         # proposals sent through the network per forward pass
POSITIVE_IOU_THRESHOLD = 0.5   # a proposal is positive when its best IoU exceeds this

# Assuming 'voc_dataset' is iterable with (image, annot)
for image, annot in voc_dataset:
    # Region proposals for this image.
    image_array = np.array(image)
    selective_search.config(gs, image_array, strategy='q')
    rects = selective_search.get_rects(gs)
    print('Done with selective search')

    # Ground-truth objects as {'class': name, 'bbox': [xmin, ymin, xmax, ymax]}.
    annotations = []
    for obj in annot['annotation']['object']:
        bndbox = obj['bndbox']
        annotations.append({
            'class': obj['name'],
            'bbox': [int(bndbox['xmin']), int(bndbox['ymin']), int(bndbox['xmax']), int(bndbox['ymax'])]
        })
    print('Done with annotations')

    # Crop every proposal out of the image.
    # NOTE(review): this treats each rect as (x1, y1, x2, y2) corners. OpenCV's
    # selective search natively yields (x, y, w, h) — confirm that
    # selective_search.get_rects already converts to corners.
    regions = [image.crop((x1, y1, x2, y2)) for x1, y1, x2, y2 in rects]

    # Extract features batch by batch to bound GPU memory use.
    features = []
    for start in range(0, len(regions), REGION_BATCH_SIZE):
        batch = regions[start:start + REGION_BATCH_SIZE]
        features.extend(batch_process_features(modified_resnet50, batch))

    print('Done with feature extraction')

    # Label each proposal by its best-overlapping ground-truth OBJECT.
    # Matching is done per object rather than per class name, so two objects of
    # the same class no longer overwrite each other's IoU, and the exact matched
    # box is remembered for the regression targets below. An image without
    # annotations simply yields only negatives.
    labels = []
    matched_boxes = []
    for proposal in rects:
        best_iou = 0.0
        best_gt = None
        for gt in annotations:
            iou = calculate_iou(proposal, gt['bbox'])
            if iou > best_iou:
                best_iou, best_gt = iou, gt

        if best_gt is not None and best_iou > POSITIVE_IOU_THRESHOLD:
            labels.append((proposal, best_gt['class'], 'positive'))
            matched_boxes.append(best_gt['bbox'])
        else:
            labels.append((proposal, None, 'negative'))
            matched_boxes.append(None)

    print('Done with label assignment')

    # Prepare datasets for SVM and regression.
    # NOTE: only positive proposals are stored; negatives are not added to
    # svm_dataset here.
    svm_dataset = []
    regression_dataset = []
    for feature, label, gt_bbox in zip(features, labels, matched_boxes):
        proposal, matched_class, status = label
        if status != 'positive':
            continue

        svm_dataset.append((feature, matched_class))

        # Targets are plain coordinate differences (matched ground truth minus
        # proposal) for each of the four box coordinates, despite the
        # tx/ty/tw/th names.
        tx = gt_bbox[0] - proposal[0]
        ty = gt_bbox[1] - proposal[1]
        tw = gt_bbox[2] - proposal[2]
        th = gt_bbox[3] - proposal[3]
        regression_dataset.append((feature, (tx, ty, tw, th)))

    print('Done with dataset preparation')
    # Only the first image is processed: svm_dataset / regression_dataset are
    # re-created on every iteration, so they must be accumulated outside the
    # loop before this break is removed.
    break

Done with selective search
Done with annotations
Done with feature extraction
Done with label assignment
Done with dataset preparation


In [159]:
# Shape of the feature stored with the first svm_dataset entry.
svm_dataset[0][0].shape

(2048, 1, 1)

In [153]:
# Per-region preprocessing (same steps as the definition in the earlier
# feature-extraction cell, repeated so this cell runs on its own): resize each
# crop to 227x227, convert it to a tensor, then normalise per channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics —
# confirm they match what was used when the network was fine-tuned.
preprocess = transforms.Compose([
    transforms.Resize((227, 227)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


gs = selective_search.get_selective_search()

# A proposal is labelled positive when its best IoU with a ground-truth box exceeds this.
POSITIVE_IOU_THRESHOLD = 0.5

# Run every crop on whichever device the model lives on. Previously the input
# tensor was only assigned when CUDA was available, which left
# `region_preprocessed` undefined (or stale) on CPU-only machines.
model_device = next(modified_resnet50.parameters()).device

for image, annot in voc_dataset:

    # Get proposals from selective search
    image_array = np.array(image)
    selective_search.config(gs, image_array, strategy='q')
    rects = selective_search.get_rects(gs)
    print('Done with selective search')

    # Ground-truth objects as {'class': name, 'bbox': [xmin, ymin, xmax, ymax]}.
    annotations = []
    for obj in annot['annotation']['object']:
        bndbox = obj['bndbox']
        annotations.append({
            'class': obj['name'],
            'bbox': [int(bndbox['xmin']), int(bndbox['ymin']), int(bndbox['xmax']), int(bndbox['ymax'])]
        })
    print('Done with annotations')

    # Feature extraction and labelling, one proposal at a time.
    features = []
    labels = []
    matched_boxes = []
    for proposal in rects:

        # NOTE(review): this treats each rect as (x1, y1, x2, y2) corners.
        # OpenCV's selective search natively yields (x, y, w, h) — confirm that
        # selective_search.get_rects already converts to corners.
        x1, y1, x2, y2 = proposal
        region = image.crop((x1, y1, x2, y2))

        # Make it (1, 3, 227, 227) and place it next to the model.
        region_preprocessed = preprocess(region).unsqueeze(0).to(model_device)

        # Extract features (inference only, so no autograd bookkeeping).
        with torch.no_grad():
            feature = modified_resnet50(region_preprocessed)
        features.append(feature.cpu().numpy().flatten())  # Flatten the features

        # Match against each ground-truth OBJECT rather than each class name,
        # so two objects of the same class cannot overwrite each other's IoU,
        # and remember the exact matched box for the regression targets.
        best_iou = 0.0
        best_gt = None
        for gt in annotations:
            iou = calculate_iou(proposal, gt['bbox'])
            if iou > best_iou:
                best_iou, best_gt = iou, gt

        if best_gt is not None and best_iou > POSITIVE_IOU_THRESHOLD:
            labels.append((proposal, best_gt['class'], 'positive'))
            matched_boxes.append(best_gt['bbox'])
        else:
            labels.append((proposal, None, 'negative'))
            matched_boxes.append(None)

    print('Done with feature extraction and labelling')

    # Prepare datasets for SVM and regression.
    # NOTE: only positive proposals are stored; negatives are not added to
    # svm_dataset here.
    svm_dataset = []
    regression_dataset = []

    for feature, label, gt_bbox in zip(features, labels, matched_boxes):
        # Read the proposal and class from THIS label. The earlier version used
        # the `proposal` / `matched_class` variables left over from the loop
        # above, so every target was computed against the last proposal.
        label_proposal, label_class, status = label
        if status != 'positive':
            continue

        svm_dataset.append((feature, label_class))

        # Targets are plain coordinate differences (matched ground truth minus
        # proposal) for each of the four box coordinates, despite the
        # tx/ty/tw/th names.
        gx1, gy1, gx2, gy2 = gt_bbox
        px1, py1, px2, py2 = label_proposal
        tx = gx1 - px1
        ty = gy1 - py1
        tw = gx2 - px2
        th = gy2 - py2
        regression_dataset.append((feature, (tx, ty, tw, th)))

    print('Done with dataset preparation')

    # Only the first image is processed: svm_dataset / regression_dataset are
    # re-created on every iteration, so they must be accumulated outside the
    # loop before this break is removed.
    break


Done with selective search
Done with annotations
Done with feature extraction and labelling
Done with dataset preparation
