In [1]:
!pip install torchvision torch



In [3]:
# may need this
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [1]:
import torchvision

dataset = torchvision.datasets.VOCDetection(
    root="./PASCAL VOC/", year='2012', image_set='trainval', download=False,
    transform=torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=(300,300)),
        torchvision.transforms.ToTensor(),
        # data is already in [0, 1], so normalize it to [-1, 1]
        torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])
)

In [2]:
dataset

Dataset VOCDetection
    Number of datapoints: 11540
    Root location: ./PASCAL VOC/
    StandardTransform
Transform: Compose(
               Resize(size=(300, 300), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
               Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
           )

In [3]:
model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn_v2(weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)

In [4]:
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    model.roi_heads.box_predictor.cls_score.in_features,
    20+1  # +1 for background
)

In [5]:
import torch

dataloader_dataset = torch.utils.data.DataLoader(dataset, batch_size=16, collate_fn=lambda x: x, shuffle=True)

In [6]:
import os

# need to convert label to integer for training
classes = set([f.split('_')[0] for f in os.listdir(f"{dataset.root}/VOCdevkit/VOC2012/ImageSets/Main/") if '_' in f])
classes = list(classes)
class_to_index = {class_:i for i, class_ in enumerate(classes, start=1)}
class_to_index

{'bottle': 1,
 'pottedplant': 2,
 'chair': 3,
 'bird': 4,
 'bus': 5,
 'cat': 6,
 'dog': 7,
 'car': 8,
 'cow': 9,
 'boat': 10,
 'person': 11,
 'sofa': 12,
 'train': 13,
 'sheep': 14,
 'tvmonitor': 15,
 'horse': 16,
 'bicycle': 17,
 'diningtable': 18,
 'motorbike': 19,
 'aeroplane': 20}

In [7]:
# for finetuning, only train roi_head parameters
for name, param in model.named_parameters():
  if 'roi_head' not in name:
    param.requires_grad = False

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [9]:
# confirm data is in [-1, 1]
for data in dataloader_dataset:
  for d in data:
    d = d[0]
    print(d.min(), d.max())
    break
  break

tensor(-0.9922) tensor(1.)


In [13]:
import numpy as np
from time import time


opt = torch.optim.SGD(params=model.parameters())
bce = torch.nn.CrossEntropyLoss()
mse = torch.nn.L1Loss()

model.to(device)

n_epochs = 1
total_batches = len(dataloader_dataset)
for epoch in range(1, n_epochs+1):
    start = time()
    print(f"Epoch {epoch}/{n_epochs}")

    classifier_loss_per_epoch = 0
    bbox_loss_per_epoch = 0

    for i, data in enumerate(dataloader_dataset, start=1):
        print(f"\t- Step {i}/{total_batches}", end='')

        X = []
        y = []
        for d in data:
            objs = d[1]['annotation']['object']
            bboxes = []
            labels = []
            for i in range(len(objs)):
                bbox_dict = objs[i]['bndbox']
                # must be in this order or height/width may be negative due to wrong order
                bbox = [int(bbox_dict[key]) for key in ['xmin', 'ymin', 'xmax', 'ymax']]
                bboxes.append(bbox)

                label = objs[i]['name']
                label = class_to_index[label]
                labels.append(label)

            bboxes = torch.as_tensor(bboxes, dtype=torch.int64)
            labels = torch.as_tensor(labels, dtype=torch.int64)

            # need to move data to GPU using cuda()
            X.append(d[0].cuda())
            y.append({
                'boxes': bboxes.cuda(),
                'labels': labels.cuda(),
            })

        loss_dict = model(X, y)
        loss = sum(v for v in loss_dict.values())
        # print(loss_dict)
        classifier_loss_per_epoch += loss_dict['loss_classifier']
        bbox_loss_per_epoch += loss_dict['loss_box_reg']

        print(f" - Losses: {loss_dict['loss_classifier']} + {loss_dict['loss_box_reg']}", end='\n')

        opt.zero_grad()
        loss.backward()
        opt.step()

    end = time()
    print(f"Classifier Loss: {classifier_loss_per_epoch}")
    print(f"BBox Regression Loss: {bbox_loss_per_epoch}")
    print(f"Took {end - start}s")
    print()


Epoch 1/1
	- Step 1/722 - Losses: 2.809926748275757 + 0.059281934052705765
	- Step 2/722 - Losses: 2.6568870544433594 + 0.025595858693122864
	- Step 3/722 - Losses: 2.5669920444488525 + 0.05643651261925697
	- Step 4/722 - Losses: 2.3812992572784424 + 0.06163829192519188
	- Step 5/722 - Losses: 2.28155779838562 + 0.03883455693721771
	- Step 6/722 - Losses: 2.1434621810913086 + 0.02869715541601181
	- Step 7/722 - Losses: 2.0219149589538574 + 0.03044021874666214
	- Step 8/722 - Losses: 1.9284614324569702 + 0.04599500447511673
	- Step 9/722 - Losses: 1.7847416400909424 + 0.035258494317531586
	- Step 10/722 - Losses: 1.6808841228485107 + 0.043107181787490845
	- Step 11/722 - Losses: 1.6064794063568115 + 0.04273711144924164
	- Step 12/722 - Losses: 1.4956637620925903 + 0.07307571917772293
	- Step 13/722 - Losses: 1.4263076782226562 + 0.06863649189472198
	- Step 14/722 - Losses: 1.3210011720657349 + 0.038249120116233826
	- Step 15/722 - Losses: 1.2404628992080688 + 0.061552368104457855
	- Ste

In [18]:
# save model
torch.save(model.state_dict(), "./finetuned-faster-rcnn.pth")

In [9]:
# reload
model.load_state_dict(torch.load("./finetuned-faster-rcnn.pth", weights_only=True))

<All keys matched successfully>

In [9]:
model.eval()
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       

In [10]:
clf_loss = 0
bbox_loss = 0
X = []
y = []
y_preds = []
for data in dataloader_dataset:
        for d in data:
            objs = d[1]['annotation']['object']
            bboxes = []
            labels = []
            for i in range(len(objs)):
                bbox_dict = objs[i]['bndbox']
                bbox = [int(bbox_dict[key]) for key in ['xmin', 'ymin', 'xmax', 'ymax']]
                bboxes.append(bbox)

                label = objs[i]['name']
                label = class_to_index[label]
                labels.append(label)

            bboxes = torch.as_tensor(bboxes, dtype=torch.int64)
            labels = torch.as_tensor(labels, dtype=torch.int64)

            X.append(d[0].cuda())
            y.append({
                'boxes': bboxes.cuda(),
                'labels': labels.cuda(),
            })

        y_preds = model(X)
        break # only loop for 1 batch

In [13]:
y[0]

{'boxes': tensor([[  1, 215, 103, 287],
         [ 23, 195,  55, 218]], device='cuda:0'),
 'labels': tensor([8, 8], device='cuda:0')}

In [25]:
y_preds[0]

{'boxes': tensor([[185.0494, 114.7610, 194.3344, 124.6680],
         [182.6682, 116.0456, 192.1408, 123.5601],
         [129.4860, 143.7688, 193.9598, 186.5196],
         [  3.4847, 109.3897,  24.7011, 130.3596],
         [284.0595,   8.6814, 298.8501, 152.1658],
         [180.2316, 113.9339, 194.6161, 131.0658],
         [115.2738, 131.2514, 201.0613, 199.6128],
         [204.9786, 157.8374, 213.5469, 179.5437],
         [145.7702, 173.2375, 193.5998, 211.5951],
         [ 30.3494, 139.3583, 160.5053, 206.2996],
         [191.2969,  27.0418, 199.0635,  34.3089],
         [275.8835, 131.6127, 298.4948, 188.8290],
         [276.4227, 155.6070, 293.3477, 189.3974],
         [190.1504,  23.3716, 200.9082,  36.1661],
         [  8.1414, 109.4583,  23.2716, 124.0882],
         [  0.0000, 112.6425,  38.9359, 151.6787],
         [265.9276, 139.8323, 284.7711, 148.8715],
         [114.3245, 159.9930, 178.9964, 187.3891],
         [182.1668, 115.7636, 193.0047, 123.8126],
         [179.6382, 11

In [16]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.12.0-py3-none-any.whl.metadata (5.6 kB)
Downloading torchmetrics-1.6.1-py3-none-any.whl (927 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927.3/927.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.12.0-py3-none-any.whl (28 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.12.0 torchmetrics-1.6.1


In [40]:
from torchmetrics.detection import MeanAveragePrecision

mAP = MeanAveragePrecision()

preds = [dict(
    boxes=y_preds[0]['boxes'],
    labels=y_preds[0]['labels'],
    scores=y_preds[0]['scores'],
)]
targets = [dict(
    boxes=y[0]['boxes'],
    labels=y[0]['labels'],
)]

mAP.update(preds, targets)

mAP.compute()

{'map': tensor(0.),
 'map_50': tensor(0.),
 'map_75': tensor(0.),
 'map_small': tensor(0.),
 'map_medium': tensor(0.),
 'map_large': tensor(-1.),
 'mar_1': tensor(0.),
 'mar_10': tensor(0.),
 'mar_100': tensor(0.),
 'mar_small': tensor(0.),
 'mar_medium': tensor(0.),
 'mar_large': tensor(-1.),
 'map_per_class': tensor(-1.),
 'mar_100_per_class': tensor(-1.),
 'classes': tensor([ 1,  2,  3,  7,  8,  9, 10, 13, 14, 18, 19], dtype=torch.int32)}

In [41]:
from torchmetrics.detection import IntersectionOverUnion

iou = IntersectionOverUnion()
preds = [dict(
    boxes=y_preds[0]['boxes'],
    labels=y_preds[0]['labels']
)]
targets = [dict(
    boxes=y[0]['boxes'],
    labels=y[0]['labels']
)]

iou.update(preds, targets)
iou.compute()

{'iou': tensor(0.0403, device='cuda:0')}