In [1]:
%matplotlib inline
import os
import sys
import pickle

from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

import torchvision
from torchvision import models

import torchvision.transforms as transforms
from torch.utils.data import ConcatDataset
import torch.optim as optim
from torchmetrics import Accuracy
from torchinfo import summary
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import VOCDetection

sys.path.append('../../')  
from Object_Detection.RCNN import selective_search
from Object_Detection.RCNN.bbox import calculate_iou
from PIL import Image

import matplotlib.pyplot as plt
import numpy as np

# Settings 

torch.set_printoptions(precision=3)

In [2]:
voc_dataset = VOCDetection(root='./data', 
                           year='2012', 
                           image_set='train',
                           download=True, 
                        #    transform=transform
                           )

Using downloaded and verified file: ./data\VOCtrainval_11-May-2012.tar
Extracting ./data\VOCtrainval_11-May-2012.tar to ./data


# Load model and its finetuned weights (PASCAL VOC)

In [3]:
# Step 1: Load the ResNet50 model without pretrained weights
resnet50 = models.resnet50(weights=False)

resnet50.fc = torch.nn.Linear(resnet50.fc.in_features, 20)



In [4]:
# Step 2: Load your finetuned weights
# Make sure that the path 'voc_finetune_resnet_weights.pth' is correct and accessible
resnet50.load_state_dict(torch.load('resnet50_pascal_voc.pth'))

<All keys matched successfully>

In [None]:
summary(model=resnet50, input_size=(1, 3, 227, 227), col_width=20,
                  col_names=['input_size', 'output_size', 'num_params', 'trainable'], row_settings=['var_names'], verbose=0)

In [6]:
class ResNet50Modified(nn.Module):
    def __init__(self, original_model):
        super(ResNet50Modified, self).__init__()
        # Adapt this line if your model was changed differently
        self.features = nn.Sequential(*list(original_model.children())[:-1])  # Typically removes the final fully connected layer and avg pooling layer

    def forward(self, x):
        x = self.features(x)
        return x

# Create the modified model with loaded weights
modified_resnet50 = ResNet50Modified(resnet50)

# Put the model in evaluation mode if you are doing inference or feature extraction
modified_resnet50.eval()

if torch.cuda.is_available():
    modified_resnet50 = modified_resnet50.cuda()

In [57]:
summary(model=modified_resnet50, input_size=(1, 3, 227, 227), col_width=20,
                  col_names=['input_size', 'output_size', 'num_params', 'trainable'], row_settings=['var_names'], verbose=0)

Layer (type (var_name))                       Input Shape          Output Shape         Param #              Trainable
ResNet50Modified (ResNet50Modified)           [1, 3, 227, 227]     [1, 2048, 1, 1]      --                   True
├─Sequential (features)                       [1, 3, 227, 227]     [1, 2048, 1, 1]      --                   True
│    └─Conv2d (0)                             [1, 3, 227, 227]     [1, 64, 114, 114]    9,408                True
│    └─BatchNorm2d (1)                        [1, 64, 114, 114]    [1, 64, 114, 114]    128                  True
│    └─ReLU (2)                               [1, 64, 114, 114]    [1, 64, 114, 114]    --                   --
│    └─MaxPool2d (3)                          [1, 64, 114, 114]    [1, 64, 57, 57]      --                   --
│    └─Sequential (4)                         [1, 64, 57, 57]      [1, 256, 57, 57]     --                   True
│    │    └─Bottleneck (0)                    [1, 64, 57, 57]      [1, 256, 57, 57]    

# Generate features 

In [6]:
voc_dataset

Dataset VOCDetection
    Number of datapoints: 5717
    Root location: ./data

In [109]:
# Single image load from the VOC dataset and in numpy format

image, target = voc_dataset[0]
image_array = np.array(image)

In [16]:
preprocess = transforms.Compose([
    transforms.Resize((227, 227)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def batch_process_features(model, regions):
    """ Process a batch of regions through the model """
    # Convert list of PIL images to tensor
    regions_tensor = torch.stack([preprocess(region) for region in regions]).cuda()
    
    # Extract features
    with torch.no_grad():
        features = model(regions_tensor)
    return features.cpu().numpy()

gs = selective_search.get_selective_search()

counter, start_index = 1500, 1500
svm_dataset = []
regression_dataset = []
# Assuming 'voc_dataset' is iterable with (image, annot)
for i in range(start_index, len(voc_dataset)):
    image, annot = voc_dataset[i]
    
    counter += 1
    # Counter for images processed
    print(f'Processing image {counter}')

    # Get proposals and annotations similar to your original code
    image_array = np.array(image)
    selective_search.config(gs, image_array, strategy='q')
    rects = selective_search.get_rects(gs)

    # Prepare annotations
    annotations = []
    for obj in annot['annotation']['object']:
        annotations.append({
            'class': obj['name'],
            'bbox': [int(obj['bndbox']['xmin']), int(obj['bndbox']['ymin']), int(obj['bndbox']['xmax']), int(obj['bndbox']['ymax'])]
        })

    # Collect regions for batch processing
    regions = [image.crop((x1, y1, x2, y2)) for x1, y1, x2, y2 in rects]

    # Process all regions at once in batches
    features = []
    for i in range(0, len(regions), 16):  # batch_size e.g., 32, 64, 128, 256, 512
        batch = regions[i:i + 16]
        batch_features = batch_process_features(modified_resnet50, batch)
        features.extend(batch_features)


    # Calculate IOUs and assign labels after extracting all features
    labels = []
    for proposal, feature in zip(rects, features):
        proposal_iou = {cls['class']: calculate_iou(proposal, cls['bbox']) for cls in annotations}
        best_iou = max(proposal_iou.values())
        if best_iou > 0.5:
            matched_class = max(proposal_iou, key=proposal_iou.get)
            labels.append((proposal, matched_class, 'positive'))
        else:
            labels.append((proposal, None, 'negative'))


    # Prepare datasets for SVM and regression

    for feature, label in zip(features, labels):
        if label[2] == 'positive':
            svm_dataset.append((feature, label[1]))
            ground_truth_bbox = next((item['bbox'] for item in annotations if item['class'] == label[1]), None)
            if ground_truth_bbox:
                tx = ground_truth_bbox[0] - label[0][0]
                ty = ground_truth_bbox[1] - label[0][1]
                tw = ground_truth_bbox[2] - label[0][2]
                th = ground_truth_bbox[3] - label[0][3]
                regression_dataset.append((feature, (tx, ty, tw, th)))
    print(f'Number of data in SVM dataset: {len(svm_dataset)}')
    print(f'Number of data in regression dataset: {len(regression_dataset)}')
    # break after 10 images
    if counter == 2200:
        break

Processing image 1501
Number of data in SVM dataset: 221
Number of data in regression dataset: 221
Processing image 1502
Number of data in SVM dataset: 333
Number of data in regression dataset: 333
Processing image 1503
Number of data in SVM dataset: 426
Number of data in regression dataset: 426
Processing image 1504
Number of data in SVM dataset: 565
Number of data in regression dataset: 565
Processing image 1505
Number of data in SVM dataset: 747
Number of data in regression dataset: 747
Processing image 1506
Number of data in SVM dataset: 869
Number of data in regression dataset: 869
Processing image 1507
Number of data in SVM dataset: 978
Number of data in regression dataset: 978
Processing image 1508
Number of data in SVM dataset: 1164
Number of data in regression dataset: 1164
Processing image 1509
Number of data in SVM dataset: 1358
Number of data in regression dataset: 1358
Processing image 1510
Number of data in SVM dataset: 1474
Number of data in regression dataset: 1474
Proc

In [18]:
# Size in MB of svm_dataset and regression_dataset in memory

print('Size of svm_dataset:', sys.getsizeof(svm_dataset) / 1024 / 1024, 'MB')
print('Size of regression_dataset:', sys.getsizeof(regression_dataset) / 1024 / 1024, 'MB')


Size of svm_dataset: 0.7638778686523438 MB
Size of regression_dataset: 0.7638778686523438 MB


In [4]:
with open('svm_dataset_2.pkl', 'rb') as f:
    svm_dataset_one = pickle.load(f)

In [19]:
# save the svm_dataset and regression_dataset to disk
with open('svm_dataset_1_5.pkl', 'wb') as f:
    pickle.dump(svm_dataset, f)

# with open('regression_dataset.pkl', 'wb') as f:
#     pickle.dump(regression_dataset, f)