I went with the Hugging Face API because I wanted to try it out and learn how to use it 🫡

In [1]:
# A dataset class for reading COCO-style AU-AIR annotations.
import torch
from torch.utils.data import Dataset
from PIL import Image
import os
import json

class AUAirCocoDataset(Dataset):
    """Map-style dataset over a COCO-format annotation file (AU-AIR).

    Each item is whatever ``processor`` returns for one image and its
    annotations, with the leading batch dimension squeezed off every tensor
    field. Non-tensor fields (e.g. a ``labels`` list) are passed through
    unchanged.

    Args:
        images_dir: Directory that the ``file_name`` entries are relative to.
        annotation_path: Path to the COCO-style JSON file; it must contain the
            ``images``, ``annotations`` and ``categories`` keys.
        processor: Callable invoked as
            ``processor(image, annotations=..., return_tensors="pt")``.
        transforms: Stored on the instance but not applied by this class.

    Attributes:
        image_id_to_info: ``{image id: image record}`` from the JSON file.
        image_id_to_annotations: ``{image id: [annotation records]}``; images
            without annotations have no entry.
        ids: Image ids in file order; ``dataset[i]`` corresponds to ``ids[i]``.
        categories: ``{category id: category name}``.
    """

    def __init__(self, images_dir, annotation_path, processor, transforms=None):
        # Load COCO-format JSON
        with open(annotation_path) as f:
            coco = json.load(f)

        self.images_dir = images_dir
        self.processor = processor
        self.transforms = transforms  # kept for API compatibility; unused here

        # Build id-to-image dictionary
        self.image_id_to_info = {img['id']: img for img in coco['images']}

        # Group annotations by the image they belong to
        self.image_id_to_annotations = {}
        for ann in coco['annotations']:
            self.image_id_to_annotations.setdefault(ann['image_id'], []).append(ann)

        self.ids = list(self.image_id_to_info.keys())
        self.categories = {cat["id"]: cat["name"] for cat in coco["categories"]}

    def __len__(self):
        """Number of images listed in the annotation file."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Load image ``ids[idx]`` and run it, with its annotations, through the processor.

        Images without any annotation are supported: the processor receives an
        empty annotation list.
        """
        img_id = self.ids[idx]
        img_info = self.image_id_to_info[img_id]
        file_path = os.path.join(self.images_dir, img_info['file_name'])

        # Decode inside a context manager so the file handle is released
        # as soon as the RGB copy has been made.
        with Image.open(file_path) as img:
            image = img.convert("RGB")

        anns = self.image_id_to_annotations.get(img_id, [])

        # The processor gets the raw COCO records (boxes still in x, y, w, h).
        # No manual box conversion is done here: the previous hand-built
        # target dict was never used and raised IndexError on images with
        # zero annotations.
        encoding = self.processor(
            image,
            annotations={"image_id": img_id, "annotations": anns},
            return_tensors="pt"
        )

        # Drop the batch dimension from tensor fields only; leave everything
        # else (e.g. the list under "labels") untouched.
        return {
            k: (v.squeeze(0) if isinstance(v, torch.Tensor) else v)
            for k, v in encoding.items()
        }


In [2]:
# Instantiate the Dataset for Train / Val / Test
# load the DetrImageProcessor point it to images and instances_*.json, and create dataset objects.

from transformers import DetrImageProcessor

# One DETR image processor shared by all three splits
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

# Where the images and the per-split annotation files live
base_dir = "auair2019"
img_dir = os.path.join(base_dir, "images")
split_dir = os.path.join(base_dir, "splits")


def _build_split(annotation_file):
    """Create the dataset for the annotation file named ``annotation_file`` inside ``split_dir``."""
    return AUAirCocoDataset(
        images_dir=img_dir,
        annotation_path=os.path.join(split_dir, annotation_file),
        processor=processor,
    )


# Train / validation / test datasets
train_dataset = _build_split("instances_train.json")
val_dataset = _build_split("instances_val.json")
test_dataset = _build_split("instances_test.json")

# Report split sizes
print("Datasets ready")
print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")


  from .autonotebook import tqdm as notebook_tqdm


Datasets ready
Train samples: 19693
Val samples: 3283
Test samples: 9847


##  HuggingFace TrainingArguments + Trainer for DETR
DetrForObjectDetection (pretrained on COCO)\
Trainer \
wandb

### Set Up Training

In [3]:
# !pip install timm
# -> i did: pip install datasets==2.18.0 --force-reinstall :(

In [4]:
from transformers import DetrForObjectDetection, TrainingArguments, Trainer
import torch

# Load pretrained DETR (COCO, 91 classes) and adapt to 8 AU-AIR classes
model = DetrForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50",
    num_labels=8,  # AU-AIR has 8 categories
    ignore_mismatched_sizes=True  # allows head resizing
)

# Pick the device at runtime: a hard-coded 'cuda' raises on machines
# without a visible GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# log in to wandb
import wandb
wandb.login()

training_args = TrainingArguments(
    output_dir="./detr-auair-output",
    eval_strategy="epoch",        # evaluate once per epoch
    save_strategy="epoch",        # checkpoint once per epoch (must match eval_strategy for load_best_model_at_end)
    logging_strategy="steps",
    logging_steps=100,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,           # keep at most two checkpoints on disk
    remove_unused_columns=False,  # must be False for object detection: column pruning would drop the image/target fields
    report_to="wandb",            # change to "none" if not using wandb
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,      # best checkpoint = lowest validation loss
    logging_dir="./logs"
)


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model checkpoin

In [5]:
#from transformers import TrainingArguments
#help(TrainingArguments)
# i needed to look for an argument name


### Add COCO mAP Metric (via pycocotools)
-> pycocotools: the reference COCO evaluation API. Predictions and targets are converted into COCO-format JSON so that it can compute mAP.


In [6]:
# !pip install pycocotools

In [7]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import numpy as np
import tempfile
import json
import os

def compute_coco_map(p, dataset):
    """
    Compute COCO-style mAP using pycocotools.

    Args:
        p: Object with a ``predictions`` attribute holding one entry per
            image, in the same order as ``dataset.ids``. Each entry is a
            mapping with ``"boxes"`` (x_min, y_min, x_max, y_max),
            ``"scores"`` and ``"labels"``; each field may be a tensor, an
            array or a plain sequence.
            NOTE(review): boxes are presumably expected in the same pixel
            coordinates as the ground-truth ``bbox`` values, and raw model
            outputs need post-processing into this format first -- confirm
            against the caller.
        dataset: Dataset exposing ``ids``, ``image_id_to_info``,
            ``image_id_to_annotations`` and ``categories`` (``{id: name}``).
            Only this metadata is read; no image is loaded.

    Returns:
        Dict with float entries ``"mAP"``, ``"mAP50"`` and ``"mAP75"``
        (``COCOeval.stats[0:3]``). If there are no detections at all, every
        entry is ``0.0`` and pycocotools is not invoked.
    """
    detections = _build_coco_detections(p.predictions, dataset.ids)
    if not detections:
        # COCO.loadRes cannot handle an empty result list.
        return {"mAP": 0.0, "mAP50": 0.0, "mAP75": 0.0}

    coco_gt = _build_coco_ground_truth(dataset)

    # pycocotools reads from files, so round-trip through temporary JSON files.
    with tempfile.TemporaryDirectory() as tmpdir:
        gt_path = os.path.join(tmpdir, "gt.json")
        pred_path = os.path.join(tmpdir, "pred.json")

        with open(gt_path, "w") as f:
            json.dump(coco_gt, f)
        with open(pred_path, "w") as f:
            json.dump(detections, f)

        coco = COCO(gt_path)
        coco_dt = coco.loadRes(pred_path)
        coco_eval = COCOeval(coco, coco_dt, iouType="bbox")
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

    # Plain floats so the metrics can be logged / serialised directly.
    return {
        "mAP": float(coco_eval.stats[0]),
        "mAP50": float(coco_eval.stats[1]),
        "mAP75": float(coco_eval.stats[2]),
    }


def _as_list(values):
    """Return ``values`` as a plain Python list (tensor/array via ``tolist``, else ``list``)."""
    return values.tolist() if hasattr(values, "tolist") else list(values)


def _build_coco_ground_truth(dataset):
    """Assemble the COCO ground-truth dict from the dataset's annotation metadata."""
    images = []
    annotations = []
    ann_id = 1  # annotation ids are re-numbered consecutively from 1
    for image_id in dataset.ids:
        img_info = dataset.image_id_to_info[image_id]
        images.append({
            "id": img_info['id'],
            "file_name": img_info['file_name'],
            "width": img_info['width'],
            "height": img_info['height']
        })
        for ann in dataset.image_id_to_annotations.get(img_info['id'], []):
            bbox = ann["bbox"]
            annotations.append({
                "id": ann_id,
                "image_id": img_info['id'],
                "category_id": ann["category_id"],
                "bbox": bbox,
                "area": bbox[2] * bbox[3],
                "iscrowd": 0
            })
            ann_id += 1

    return {
        "images": images,
        "annotations": annotations,
        # COCO expects a list of category records, not an {id: name} mapping.
        "categories": [
            {"id": cat_id, "name": name}
            for cat_id, name in dataset.categories.items()
        ],
    }


def _build_coco_detections(predictions, image_ids):
    """Flatten per-image predictions into COCO result records with (x, y, w, h) boxes."""
    detections = []
    for i, pred in enumerate(predictions):
        image_id = image_ids[i]
        boxes = _as_list(pred["boxes"])
        scores = _as_list(pred["scores"])
        labels = _as_list(pred["labels"])
        for box, score, label in zip(boxes, scores, labels):
            # Convert (x_min, y_min, x_max, y_max) to (x, y, w, h)
            x, y, x2, y2 = box
            detections.append({
                "image_id": image_id,
                "category_id": int(label),
                "bbox": [x, y, x2 - x, y2 - y],
                "score": float(score)
            })
    return detections


### Trainer Setup
Pass train_dataset, val_dataset\
Define a custom compute_metrics function that returns mAP from pycocotools\
Wrap it all into Trainer

In [8]:
def dummy_compute_metrics(eval_pred):
    """Placeholder metrics hook that ignores ``eval_pred`` and reports nothing.

    Custom mAP computation is skipped during training to avoid runtime
    issues; results are visualised afterwards instead of running COCOEval
    in the training loop.
    """
    no_metrics = dict()
    return no_metrics


*****************************************************************
ValueError: could not determine the shape of object type 'BatchFeature'
*****************************************************************
The DETR model expects a batch of tensors for the pixel inputs and a list of dicts for the targets (labels), so we must override Hugging Face's default collation.

In [9]:
from torch.utils.data.dataloader import default_collate

def detr_collate_fn(batch):
    """
    Collate a list of per-image dicts into one batch dict for DETR.

    - ``"labels"`` stays a list with one entry per item (no stacking). If an
      item's value is itself a list, its first element is used.
    - Tensor fields (e.g. ``pixel_values``, ``pixel_mask``) are stacked. When
      their shapes differ between items, each tensor is zero-padded at the
      end of every dimension up to the largest size in the batch before
      stacking. Zero padding in ``pixel_mask`` marks the padded region.
    - Anything else goes through ``default_collate``.

    Assumes every item has the same keys as the first one.
    """
    batch_out = {}

    for key in batch[0].keys():
        values = [item[key] for item in batch]
        if key == "labels":
            # Keep as a list of per-image targets (no collation)
            batch_out[key] = [v[0] if isinstance(v, list) else v for v in values]
        elif _needs_padding(values):
            batch_out[key] = _pad_and_stack(values)
        else:
            batch_out[key] = default_collate(values)

    return batch_out


def _needs_padding(values):
    """True when all values are tensors of equal rank but not all of equal shape."""
    if not all(isinstance(v, torch.Tensor) for v in values):
        return False
    first = values[0]
    if any(v.dim() != first.dim() for v in values):
        return False  # let default_collate report the mismatch
    return any(v.shape != first.shape for v in values)


def _pad_and_stack(tensors):
    """Zero-pad same-rank tensors to their element-wise maximum shape and stack them."""
    max_shape = [max(sizes) for sizes in zip(*(t.shape for t in tensors))]
    out = tensors[0].new_zeros((len(tensors), *max_shape))
    for i, t in enumerate(tensors):
        # Copy each tensor into the top-left corner of its padded slot.
        out[(i, *(slice(0, size) for size in t.shape))] = t
    return out


In [10]:
from transformers import Trainer

import inspect

# Newer transformers releases take the processor via `processing_class` and
# warn about (or reject) the older `tokenizer` keyword. Choose whichever name
# the installed Trainer accepts so this cell works on both.
_processor_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=detr_collate_fn,  # override the default collation (labels must stay a list of dicts)
    compute_metrics=dummy_compute_metrics,  # placeholder; to be replaced with real mAP later
    **{_processor_kwarg: processor},  # passes the processor to the Trainer
)


  trainer = Trainer(


Log training and eval loss\
Save best checkpoint\
Resume from the last saved checkpoint if interrupted (call `trainer.train(resume_from_checkpoint=True)`)\
Send metrics to Weights & Biases

In [11]:
# Run fine-tuning with the settings in `training_args` (10 epochs, evaluation
# and checkpointing once per epoch, metrics reported to wandb).
# NOTE(review): called without `resume_from_checkpoint`, so this presumably
# starts from the current model weights rather than a saved checkpoint -- confirm.
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.6792,1.604897
2,1.5542,1.578166
3,1.6156,1.53196
4,1.475,1.516945
5,1.5015,1.478626
6,1.3938,1.468685
7,1.4357,1.415659
8,1.4215,1.367974
9,1.3956,1.34507
10,1.339,1.326881


TrainOutput(global_step=49240, training_loss=1.4957362581713232, metrics={'train_runtime': 39020.2088, 'train_samples_per_second': 5.047, 'train_steps_per_second': 1.262, 'total_flos': 1.4699139089039938e+20, 'train_loss': 1.4957362581713232, 'epoch': 10.0})