In [13]:
import torchvision

dataset = torchvision.datasets.VOCDetection(
    root="./PASCAL VOC/", year='2012', image_set='trainval', download=False,
    transform=torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(300,300)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
    ])
)

In [14]:
dataset

Dataset VOCDetection
    Number of datapoints: 11540
    Root location: ./PASCAL VOC/
    StandardTransform
Transform: Compose(
               Resize(size=(300, 300), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
               Normalize(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
           )

In [10]:
model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn_v2(weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_v2_coco-dd69338a.pth" to C:\Users\PMLS/.cache\torch\hub\checkpoints\fasterrcnn_resnet50_fpn_v2_coco-dd69338a.pth
100%|██████████| 167M/167M [00:48<00:00, 3.64MB/s] 


In [11]:
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    model.roi_heads.box_predictor.cls_score.in_features,
    20+1 # +1 for background
)

In [22]:
import torch


dataloader_dataset = torch.utils.data.DataLoader(dataset, batch_size=8, collate_fn=lambda x: x, shuffle=True)

In [72]:
import os

# need to convert classname which is string to integer for training
classes = set([f.split('_')[0] for f in os.listdir(f"{dataset.root}/VOCdevkit/VOC2012/ImageSets/Main/") if '_' in f])
classes = list(classes)
class_to_index = {class_:i for i, class_ in enumerate(classes, start=1)}
class_to_index

{'boat': 1,
 'sheep': 2,
 'diningtable': 3,
 'cat': 4,
 'cow': 5,
 'sofa': 6,
 'bicycle': 7,
 'pottedplant': 8,
 'horse': 9,
 'person': 10,
 'tvmonitor': 11,
 'aeroplane': 12,
 'train': 13,
 'dog': 14,
 'chair': 15,
 'bird': 16,
 'car': 17,
 'bus': 18,
 'motorbike': 19,
 'bottle': 20}

In [91]:
import torch
import numpy as np


opt = torch.optim.SGD(params=model.parameters())

n_epochs = 1
total_steps = np.int64(np.floor(len(dataloader_dataset) / dataloader_dataset.batch_size) + 1)
for epoch in range(n_epochs):
    loss_per_epoch = 0
    print(f"Epoch {epoch+1}/{n_epochs}")
    for i, data in enumerate(dataloader_dataset, start=1):
        print(f"\t- Step {i}/{total_steps}")
        X = []
        y = []
        for d in data:
            X.append(d[0])
            # take first object label by indexing 0
            objs = d[1]['annotation']['object']
            bboxes = []
            labels = []
            for i in range(len(objs)):
                bbox_dict = objs[i]['bndbox']
                # must be in this order or height/width may be negative due to wrong order
                bbox = [int(bbox_dict[key]) for key in ['xmin', 'ymin', 'xmax', 'ymax']]
                bboxes.append(bbox)

                label = objs[i]['name']
                label = class_to_index[label]
                labels.append(label)
                
            bboxes = torch.as_tensor(bboxes, dtype=torch.int64)
            labels = torch.as_tensor(labels, dtype=torch.int64)
            y.append({
                'boxes': bboxes,
                'labels': labels,
            })
        
        loss_dict = model(X, y)
        loss = sum(v for v in loss_dict.values())
        loss_per_epoch += loss
        
        opt.zero_grad()
        loss.backward()
        opt.step()
        break
    print(f"Loss: {loss_per_epoch}")
    print()
    break

Epoch 1
	- Step 1/181
Loss: 4.924716949462891



In [137]:
model.eval()

for batch in dataloader_dataset:
    d = batch[0]
    img = d[0]
    # add batch dimension: (3,300,300) -> (1,3,300,300)
    img = np.expand_dims(img, axis=0)
    img = torch.from_numpy(img)

    objs = d[1]['annotation']['object']
    bboxes = []
    labels = []
    for i in range(len(objs)):
        bbox_dict = objs[i]['bndbox']
        bbox = [int(bbox_dict[key]) for key in ['xmin', 'ymin', 'xmax', 'ymax']]
        bboxes.append(bbox)

        label = objs[i]['name']
        label = class_to_index[label]
        labels.append(label)
        
    bboxes = torch.as_tensor(bboxes, dtype=torch.int64)
    labels = torch.as_tensor(labels, dtype=torch.int64)
    break

In [106]:
pred = model(img)
pred

[{'boxes': tensor([[3.0004e+01, 2.9835e+02, 9.1283e+01, 3.0000e+02],
          [1.9040e+02, 2.9842e+02, 2.5421e+02, 3.0000e+02],
          [6.8908e+01, 2.9842e+02, 1.3257e+02, 3.0000e+02],
          [1.3494e+02, 2.9842e+02, 1.9856e+02, 3.0000e+02],
          [1.0794e+02, 2.9842e+02, 1.7157e+02, 3.0000e+02],
          [1.6193e+02, 2.9842e+02, 2.2557e+02, 3.0000e+02],
          [1.4511e-01, 2.8413e+02, 1.3287e+01, 3.0000e+02],
          [2.1369e+02, 2.9818e+02, 2.7307e+02, 3.0000e+02],
          [1.4533e+00, 2.7946e+02, 2.1947e+01, 2.9901e+02],
          [0.0000e+00, 2.7450e+02, 9.7799e+00, 2.9895e+02],
          [6.7298e-02, 2.7475e+02, 3.4308e+01, 2.9929e+02],
          [0.0000e+00, 2.5985e+02, 1.9412e+01, 2.9651e+02],
          [3.4232e-02, 2.5114e+02, 3.0434e+01, 2.9811e+02],
          [2.4701e+02, 2.6910e+02, 2.9565e+02, 2.9784e+02],
          [0.0000e+00, 5.7069e+01, 8.3958e+01, 1.4363e+02],
          [4.0032e-01, 5.7685e+01, 7.4772e+01, 9.9499e+01],
          [4.5953e+01, 2.1070e+

In [112]:
pred_boxes = torchvision.ops.nms(pred[0]['boxes'], pred[0]['scores'], 0.5)

In [129]:
display_img = img[0].permute(1,2,0).detach().numpy()
display_img = (display_img + 127.5) * 127.5
display_img = display_img.astype('uint8')

In [146]:
from torchmetrics.detection import MeanAveragePrecision

mAP = MeanAveragePrecision(iou_type='bbox')
mAP.update(
    dict(
        boxes=torch.stack([torch.tensor(box) for box in pred_boxes]).astype(torch.float32),
        scores=pred[0]['scores'],
        label=pred[9]['labels'],
    ),
    dict(
        boxes=X[0],
        labels=labels,
    )
)

ModuleNotFoundError: No module named 'regex._regex'