In [1]:
!pip install pycocotools

Collecting pycocotools
  Downloading pycocotools-2.0.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading pycocotools-2.0.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (477 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m477.3/477.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.10


In [11]:
import torch
import torchvision
from torchvision.datasets import CocoDetection
import torchvision.transforms as T
from torch.utils.data import DataLoader

In [12]:
image_dir = "synth"
ann_file = "synth/synth_image_annotations.json"

transform = T.Compose([
    T.ToTensor()
])

dataset = CocoDetection(root=image_dir, annFile=ann_file, transform=transform)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [13]:
def convert_target(target):
    """
    Konvertiert eine Liste von COCO-Annotationen in das Format,
    das vom Faster R-CNN erwartet wird:
      - "boxes": [x_min, y_min, x_max, y_max]
      - "labels": Kategorie-IDs
    """
    boxes = []
    labels = []
    for obj in target:
        bbox = obj["bbox"]  # Format: [x, y, width, height]
        boxes.append([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]])
        labels.append(obj["category_id"])
    if len(boxes) == 0:
        boxes = torch.zeros((0, 4), dtype=torch.float32)
        labels = torch.zeros((0,), dtype=torch.int64)
    else:
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
    return {"boxes": boxes, "labels": labels}

def collate_fn(batch):
    images, targets = list(zip(*batch))
    images = list(images)
    targets = [convert_target(t) for t in targets]
    return images, targets

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [14]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.train()

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

num_epochs = 10

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for images, targets in dataloader:

        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss += losses.item()

    print(f"Epoche {epoch+1}/{num_epochs}, durchschnittlicher Loss: {epoch_loss:.4f}")


Epoche 1/10, durchschnittlicher Loss: 24.9315
Epoche 2/10, durchschnittlicher Loss: 14.5345
Epoche 3/10, durchschnittlicher Loss: 11.9186
Epoche 4/10, durchschnittlicher Loss: 9.3234
Epoche 5/10, durchschnittlicher Loss: 7.9085
Epoche 6/10, durchschnittlicher Loss: 7.0247
Epoche 7/10, durchschnittlicher Loss: 6.2426
Epoche 8/10, durchschnittlicher Loss: 5.4677
Epoche 9/10, durchschnittlicher Loss: 4.7587
Epoche 10/10, durchschnittlicher Loss: 4.6294
