In [1]:
import numpy as np
from lxml import objectify
import os
import torch

from PIL import Image
import torchvision
import torch
from torchvision.io import read_image
from torchvision.transforms.functional import pil_to_tensor
from torchvision.transforms.v2 import functional as F
from torchvision import tv_tensors
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import v2 as T

In [13]:
class HRSC2016(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None, imageset='train'):
        self.root = root
        self.transforms = transforms

        with open(os.path.join(root, "ImageSets", f'{imageset}.txt'), 'r') as f:
            required_imgs = set(f.read().split())

        self.imgs = list(filter(lambda x: x.split('.')[0] in required_imgs, sorted(os.listdir(os.path.join(root, "AllImages")))))
        self.annots = list(filter(lambda x: x.split('.')[0] in required_imgs, sorted(os.listdir(os.path.join(root, "Annotations")))))

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, "AllImages", self.imgs[idx])
        annot_path = os.path.join(self.root, "Annotations", self.annots[idx])
        
        img = F.pil_to_tensor(Image.open(img_path))
        with open(annot_path, 'rb') as f:
            annot_root = objectify.fromstring(f.read())

        try:
            num_obj = len(annot_root.object)
        except:
            num_obj = 0
        
        bbs = []
        if num_obj > 0:
            for obj in annot_root.object:
                bbox_xml = obj.bndbox
                bbox = [int(bbox_xml.xmin), int(bbox_xml.ymin), int(bbox_xml.xmax), int(bbox_xml.ymax)]
                bbox = torch.tensor(bbox)
                bbs.append(bbox)

        bboxes = torch.stack(bbs)
        areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0])
        
        bboxes = tv_tensors.BoundingBoxes(bboxes, format='XYXY', canvas_size=F.get_size(img))
        labels = torch.ones((num_obj,), dtype=torch.int64)
        iscrowd = torch.zeros((num_obj,), dtype=torch.int64)
        
        img = tv_tensors.Image(img)
        target = dict()
        target['boxes'] = bboxes
        target['labels'] = labels
        target['image_id'] = idx
        target['area'] = areas
        target['iscrowd'] = iscrowd
        
        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.imgs)

In [14]:
dataset = HRSC2016('/kaggle/input/hrsc2016-ms-dataset', imageset='trainval')

len(dataset)

1070

In [5]:
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py")

0

In [6]:
from engine import train_one_epoch, evaluate
import utils

In [15]:
train_dataset = HRSC2016('/kaggle/input/hrsc2016-ms-dataset', imageset='trainval',
                         transforms=T.ToDtype(torch.float, scale=True))
test_dataset = HRSC2016('/kaggle/input/hrsc2016-ms-dataset', imageset='test',
                        transforms=T.ToDtype(torch.float, scale=True))

In [16]:
batch_size=64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=utils.collate_fn
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=utils.collate_fn
)

In [17]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

num_classes = 2  # ship + background
in_features = model.roi_heads.box_predictor.cls_score.in_features

model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [18]:
params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.SGD(
    params,
    lr=0.05
)

lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

In [19]:
num_epochs = 2

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq=10)
    
    lr_scheduler.step()
    evaluate(model, test_loader, device=device)

RuntimeError: stack expects a non-empty TensorList

In [12]:
!cat /kaggle/input/hrsc2016-ms-dataset/Annotations/100000686.xml

<?xml version="1.0" encoding="utf-8"?>
<annotation>
	<copyright>Aurora Group, VIP Lab, Xidian University</copyright>
	<contributors>Weiming Chen, Zizheng Ren, Bing Han, Zheng Yang, Yang Zhou, Xiaoyue Huang</contributors>
	<dataset>HRSC2016-MS</dataset>
	<source_dataset>HRSC2016</source_dataset>
	<label_level>L1</label_level>
	<filename>100000686</filename>
	<size>
		<width>1142</width>
		<height>830</height>
		<depth>3</depth>
	</size>
</annotation>