In [16]:
import os
import sys
import numpy as np
from datetime import datetime

import copy
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import VOCDetection
import torchvision.transforms as transforms
from torchvision.models import alexnet, resnet50
from torchvision.transforms import functional as F
from xml.etree import ElementTree as ET
import pandas as pd
from PIL import Image

# models from torchvision
import torchvision.models as models
from torchvision.ops import RoIPool

sys.path.append('../../')  
from Object_Detection.RCNN import selective_search
from torchmetrics import Accuracy
from torchinfo import summary

from torch.utils.tensorboard import SummaryWriter


# Get Fast RCNN from torchvision


In [2]:
def parse_annotations(annotation):
    objects = annotation['annotation']['object']
    boxes = []
    classes = []
    for obj in objects:
        xmin = float(obj['bndbox']['xmin'])
        ymin = float(obj['bndbox']['ymin'])
        xmax = float(obj['bndbox']['xmax'])
        ymax = float(obj['bndbox']['ymax'])
        boxes.append([xmin, ymin, xmax, ymax])
        classes.append(obj['name'])
    return torch.tensor(boxes), classes  # Convert boxes to tensors, keep classes as list or map them to integers


In [3]:
# Define the transformation
transform = transforms.Compose([
    transforms.Resize((800, 800)),  # Resize images to a common size
    transforms.ToTensor(),  # Convert images to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class CustomVOCDataset(VOCDetection):
    def __getitem__(self, index):
        img, target = super(CustomVOCDataset, self).__getitem__(index)
        boxes, classes = parse_annotations(target)
        return img, boxes, classes

# Update the dataset instance with the custom class
dataset = CustomVOCDataset(root='./data/VOCdevkit/VOC2012', year='2012', image_set='train', download=True, transform=transform)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)


Using downloaded and verified file: ./data/VOCdevkit/VOC2012\VOCtrainval_11-May-2012.tar
Extracting ./data/VOCdevkit/VOC2012\VOCtrainval_11-May-2012.tar to ./data/VOCdevkit/VOC2012


In [40]:
img, boxes, classes = dataset[0]

In [42]:
img

tensor([[[ 2.2489,  2.2489,  2.2489,  ...,  1.3584,  1.3755,  1.3755],
         [ 2.2489,  2.2489,  2.2489,  ...,  1.3584,  1.3584,  1.3584],
         [ 2.2489,  2.2489,  2.2489,  ...,  1.3584,  1.3413,  1.3413],
         ...,
         [ 1.1358,  1.1358,  1.1529,  ..., -0.3369, -0.2171, -0.1143],
         [ 1.0159,  1.0159,  1.0331,  ..., -0.6623, -0.5767, -0.5082],
         [ 0.9303,  0.9474,  0.9646,  ..., -0.8507, -0.7993, -0.7479]],

        [[ 2.4286,  2.4286,  2.4286,  ...,  1.5707,  1.5882,  1.5882],
         [ 2.4286,  2.4286,  2.4286,  ...,  1.5707,  1.5707,  1.5707],
         [ 2.4286,  2.4286,  2.4286,  ...,  1.5707,  1.5532,  1.5532],
         ...,
         [ 1.0455,  1.0455,  1.0630,  ..., -0.3025, -0.1625, -0.0399],
         [ 0.9230,  0.9230,  0.9405,  ..., -0.6352, -0.5301, -0.4426],
         [ 0.8354,  0.8529,  0.8704,  ..., -0.8277, -0.7577, -0.6877]],

        [[ 2.6400,  2.6400,  2.6400,  ...,  2.5354,  2.5529,  2.5529],
         [ 2.6400,  2.6400,  2.6400,  ...,  2

In [31]:
# opencv switchToSelectiveSearchQuality takes not [3, H, W] but [H, W, 3]
img_for_sc = img.permute(1, 2, 0).numpy()

In [32]:
# Check selectivesearch

gs = selective_search.get_selective_search()


In [33]:
# image_array = np.array(img)
selective_search.config(gs, img_for_sc, strategy='q')


In [34]:
base_model = models.resnet50(weights=True)

# Remove the final fully connected layer (fc) to use as a feature extractor
modules = list(base_model.children())[:-2]  # Remove the last fully connected layer and avgpool
base_model = torch.nn.Sequential(*modules)




In [51]:
# Intermediate RoI pooling layer that takes selected region proposals and extracts features
# RoI pooling
roi_pool = RoIPool(output_size=(7, 7), spatial_scale=1/32)


In [36]:
rects = selective_search.get_rects(gs)

In [37]:
rects[0]

array([176, 566, 223, 585])

In [44]:
f = base_model(img.unsqueeze(0))

In [45]:
f.shape

torch.Size([1, 2048, 25, 25])

In [52]:
# pass the featire through the RoI pooling layer with the region proposals
roi_pool_feats = roi_pool(f, [torch.tensor([rects[0]]).float()])

In [53]:
roi_pool_feats

tensor([[[[0.9616, 0.9616, 0.9616,  ..., 1.2233, 1.2233, 1.2233],
          [0.9616, 0.9616, 0.9616,  ..., 1.2233, 1.2233, 1.2233],
          [0.9616, 0.9616, 0.9616,  ..., 1.2233, 1.2233, 1.2233],
          ...,
          [0.9616, 0.9616, 0.9616,  ..., 1.2233, 1.2233, 1.2233],
          [0.9616, 0.9616, 0.9616,  ..., 1.2233, 1.2233, 1.2233],
          [0.9616, 0.9616, 0.9616,  ..., 1.2233, 1.2233, 1.2233]],

         [[0.0000, 0.0000, 0.0000,  ..., 0.3243, 0.3243, 0.3243],
          [0.0000, 0.0000, 0.0000,  ..., 0.3243, 0.3243, 0.3243],
          [0.0000, 0.0000, 0.0000,  ..., 0.3243, 0.3243, 0.3243],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.3243, 0.3243, 0.3243],
          [0.0000, 0.0000, 0.0000,  ..., 0.3243, 0.3243, 0.3243],
          [0.0000, 0.0000, 0.0000,  ..., 0.3243, 0.3243, 0.3243]],

         [[0.8508, 0.8508, 0.8508,  ..., 1.3069, 1.3069, 1.3069],
          [0.8508, 0.8508, 0.8508,  ..., 1.3069, 1.3069, 1.3069],
          [0.8508, 0.8508, 0.8508,  ..., 1

In [15]:
num_classes = 21  # PASCAL VOC classes + background
classifier = torch.nn.Sequential(
    torch.nn.Linear(512 * 7 * 7, 4096),
    torch.nn.ReLU(inplace=True),
    torch.nn.Linear(4096, 4096),
    torch.nn.ReLU(inplace=True),
    torch.nn.Linear(4096, num_classes + 4)  # class scores and bbox regressor outputs
)

