In [1]:
import torch
import os
from torchvision import transforms
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.data import DataLoader
from torchvision.datasets import CocoDetection
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import functional as F
from PIL import Image
from xml.etree import ElementTree as ET

In [2]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        self.annotations = self.load_annotations()

    def __getitem__(self, idx):
        # Load image and annotations
        img_path = os.path.join(self.root, 
                                'sfrs_' + self.annotations[idx]['filename'][2], 
                                self.annotations[idx]['filename'][:-7], 
                                self.annotations[idx]['filename'])
        
        img = Image.open(img_path).convert("RGB")
        target = self.annotations[idx]['annotation']
        
        if self.transforms is not None:
            img = self.transforms(img)
        
        return img, target

    def __len__(self):
        return len(self.annotations)

    def load_annotations(self):
        annotations = []
        # Parse XML files
        for root, dirs, files in os.walk(self.root):
            for xml_file in files:
                if xml_file.endswith(".xml"):
                    tree = ET.parse(os.path.join(root, xml_file))
                    root = tree.getroot()
                    annotation = {
                        'filename': root.find('filename').text,
                        'annotation': {
                            'boxes': [],
                            'labels': []
                        }
                    }
                    for obj in root.findall('object'):
                        box = obj.find('bndbox')
                        xmin = int(box.find('xmin').text)
                        ymin = int(box.find('ymin').text)
                        xmax = int(box.find('xmax').text)
                        ymax = int(box.find('ymax').text)

                        annotation['annotation']['boxes'].append([xmin, ymin, xmax, ymax])
                        annotation['annotation']['labels'].append(int(obj.find('name').text))

                    annotations.append(annotation)

        return annotations

In [3]:
# Define the transformation
transform = transforms.Compose([
    transforms.ToTensor(),
])

In [4]:
dataset = CustomDataset(root='../data/data', transforms=transform)

In [5]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [6]:
# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, num_workers=0)

In [7]:
# Load a pre-trained Faster R-CNN model
model = fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)

In [8]:
num_classes = 1  
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [9]:
# Define the device (GPU if available, otherwise CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [10]:
# Define the optimizer and the learning rate scheduler
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [12]:
for images, targets in train_loader[:1]:
    

TypeError: 'DataLoader' object is not subscriptable

In [11]:
# Define the training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
   
    for images, targets in train_loader:
        
        images = list(image.to(device) for image in images)
        targets = [{
                'boxes': torch.tensor([[b[i].item() for b in targets['boxes'][0]] for i in range(2)][i], dtype=torch.float32),
                'labels': torch.tensor([l.item() for l in targets['labels'][0]][i], dtype=torch.int64)
            } for i in range(2)]
        
        print(targets)
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    # Update the learning rate
    lr_scheduler.step()

    # Validation
    model.eval()
    with torch.no_grad():
        for images, targets in val_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward pass
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {losses.item()}")


[{'boxes': tensor([ 497.,  322., 1966.,  646.]), 'labels': tensor(1)}, {'boxes': tensor([ 449.,  236., 1907.,  591.]), 'labels': tensor(1)}]


AssertionError: Expected target boxes to be a tensor of shape [N, 4], got torch.Size([4]).

In [None]:
torch.save(model.state_dict(), '../models')