In [62]:
import fiftyone as fo
from utils.dataset_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO
from transformers import AutoProcessor, AutoModelForObjectDetection, EarlyStoppingCallback, Trainer, TrainingArguments
from datasets import Split
import numpy as np
import torch
from functools import partial
from PIL import Image

In [63]:
# Convert the FiftyOne dataset into a Hugging Face dataset
# (FiftyOne -> torch wrapper -> Hugging Face).
dataset_v51 = fo.load_dataset("fisheye8k-100")
dataset_torch = FiftyOneTorchDatasetCOCO(dataset_v51, gt_field="detections")
converter_torch2hf = TorchToHFDatasetCOCO(dataset_torch)
dataset_hf = converter_torch2hf.convert()

# Fixed seed so the shuffled splits are identical on every Restart & Run All;
# without it the train/validation/test membership changes between runs.
SPLIT_SEED = 42

# The small dataset only ships a train split. Carve it into 60% train,
# 20% test and 20% validation: first hold out 40%, then halve the held-out part.
train_test_split = dataset_hf["train"].train_test_split(test_size=0.4, seed=SPLIT_SEED)
test_val_split = train_test_split["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)
dataset_hf["train"] = train_test_split["train"]
# The halves of the held-out 40% are named "train"/"test" by the second
# train_test_split call; here they become the final test and validation splits.
dataset_hf["test"] = test_val_split["train"]
dataset_hf["validation"] = test_val_split["test"]

Generating torch dataset from Voxel51 dataset: 100%|██████████| 100/100 [00:00<00:00, 179.45it/s]
Generating train split: 100 examples [00:00, 176.76 examples/s]


In [64]:
# Lookup tables in both directions between integer class IDs and class names
classes = dataset_v51.default_classes
id2label = dict(enumerate(classes))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Object detection fine-tuning, following the Hugging Face guide:
# https://huggingface.co/docs/transformers/en/tasks/object_detection

MODEL_NAME = "microsoft/conditional-detr-resnet-50"
MAX_SIZE = 512  # Height/width limit handed to the processor; kept small for GPUs with little memory

# Image processor matching the checkpoint, configured with MAX_SIZE as both the
# resize limit and the padding target
image_processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    do_pad=True,
    pad_size=dict(height=MAX_SIZE, width=MAX_SIZE),
    size=dict(max_height=MAX_SIZE, max_width=MAX_SIZE),
)

`image_processor` expects annotations in the following format: `{'image_id': int, 'annotations': List[Dict]}`, where each dictionary is a COCO object annotation.

In [66]:
def transform_batch(
    batch,
    image_processor,
    return_pixel_mask=False,
):
    """Turn a batch of dataset rows into model inputs via ``image_processor``.

    Every image is loaded from disk as RGB, and its per-object annotation
    columns are regrouped into the ``{"image_id": ..., "annotations": [...]}``
    layout that is handed to the processor.

    Args:
        batch: Mapping with parallel sequences under ``"image_path"`` and
            ``"objects"``. Each ``objects`` entry provides ``"image_id"`` plus
            index-aligned ``"bbox"``, ``"category_id"`` and ``"area"`` sequences.
        image_processor: Callable invoked as
            ``image_processor(images=..., annotations=..., return_tensors="pt")``.
        return_pixel_mask: When false (the default), the ``"pixel_mask"`` entry
            is removed from the processor output if it is present.

    Returns:
        The processor output for the whole batch. For an empty batch the
        processor is still called once, with empty lists.
    """
    images = []
    annotations = []

    for image_path, annotation in zip(batch["image_path"], batch["objects"]):
        # The context manager releases the file handle once the pixels have
        # been copied into the array.
        with Image.open(image_path) as image:
            images.append(np.array(image.convert("RGB")))

        image_id = annotation["image_id"]
        coco_annotations = [
            {
                "image_id": image_id,
                "bbox": bbox,
                "category_id": annotation["category_id"][i],
                "area": annotation["area"][i],
                "iscrowd": 0,
            }
            for i, bbox in enumerate(annotation["bbox"])
        ]
        annotations.append(
            {"image_id": image_id, "annotations": coco_annotations}
        )

    # Apply the image processor transformations (resizing, rescaling,
    # normalization) exactly once, after all rows have been collected. Calling
    # it inside the loop re-processed the growing lists for every image and
    # left `result` undefined for an empty batch.
    result = image_processor(
        images=images, annotations=annotations, return_tensors="pt"
    )

    if not return_pixel_mask:
        result.pop("pixel_mask", None)

    return result

In [None]:
def collate_fn(batch):
    """Merge a list of per-sample dicts into one batch dict.

    ``pixel_values`` (and ``pixel_mask``, when the first sample carries one)
    are stacked along a new leading dimension; ``labels`` stay a plain list
    with one entry per sample.
    """
    collated = {
        "pixel_values": torch.stack([sample["pixel_values"] for sample in batch]),
        "labels": [sample["labels"] for sample in batch],
    }
    if "pixel_mask" not in batch[0]:
        return collated
    collated["pixel_mask"] = torch.stack([sample["pixel_mask"] for sample in batch])
    return collated

In [68]:
# Bind the processor once so that every split shares the same transform callable
split_transform_batch = partial(transform_batch, image_processor=image_processor)

# Attach the transform to each split in turn
for split_name in (Split.TRAIN, Split.VALIDATION, Split.TEST):
    dataset_hf[split_name] = dataset_hf[split_name].with_transform(
        split_transform_batch
    )

In [69]:
# Load the pretrained detector with this dataset's label maps.
# ignore_mismatched_sizes lets the checkpoint load even though its
# classification head was sized for a different number of classes; the
# mismatched weights are newly initialized.
model = AutoModelForObjectDetection.from_pretrained(
    MODEL_NAME,
    ignore_mismatched_sizes=True,
    label2id=label2id,
    id2label=id2label,
)

Some weights of ConditionalDetrForObjectDetection were not initialized from the model checkpoint at microsoft/conditional-detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.bias: found shape torch.Size([91]) in the checkpoint and torch.Size([5]) in the model instantiated
- class_labels_classifier.weight: found shape torch.Size([91, 256]) in the checkpoint and torch.Size([5, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
training_args = TrainingArguments(
    run_name=MODEL_NAME,
    push_to_hub=False,
    # Schedule and optimisation
    num_train_epochs=12,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    weight_decay=1e-4,
    max_grad_norm=0.01,
    fp16=True,
    # Batching and data loading
    per_device_train_batch_size=16,
    auto_find_batch_size=True,
    dataloader_num_workers=8,
    # transform_batch reads the raw "image_path"/"objects" columns
    remove_unused_columns=False,
    # Evaluation and best-model selection (lower eval_loss is better)
    eval_strategy="epoch",
    eval_do_concat_batches=False,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Checkpointing
    save_strategy="best",
    save_total_limit=1,
    save_safetensors=False,
)

In [71]:
# Early stopping with a patience of 5 and a threshold of 0
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0, early_stopping_patience=5
)

In [72]:
# Wire the model, data splits, collator and early stopping into the Trainer.
# The image processor is passed as `processing_class`; the older `tokenizer`
# argument is deprecated for Trainer.__init__ and emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_hf[Split.TRAIN],
    eval_dataset=dataset_hf[Split.VALIDATION],
    processing_class=image_processor,
    data_collator=collate_fn,
    callbacks=[early_stopping_callback],
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer configured above (train/validation splits,
# early stopping callback, best-model reload at the end).
trainer.train()

[{'pixel_values': tensor([[[-1.8268, -1.8268, -1.8439,  ..., -0.6794, -0.4054, -0.8507],
         [-1.8268, -1.8268, -1.8439,  ..., -0.3198, -0.3712, -0.7650],
         [-1.8439, -1.8439, -1.8439,  ..., -0.6109, -0.5424, -1.0219],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.7731, -1.7906, -1.8081,  ..., -0.7752, -0.4776, -0.9503],
         [-1.7906, -1.7906, -1.8081,  ..., -0.3901, -0.4426, -0.8452],
         [-1.8081, -1.8081, -1.8081,  ..., -0.7052, -0.6352, -1.1253],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.7173, -1.7173, -1.7347,  ..., -0.9330, -0.6890, -1.1770],
         [-1.7173, -1.7173,

Epoch,Training Loss,Validation Loss
1,No log,13.669957
2,No log,7.699037
3,No log,5.115666


[{'pixel_values': tensor([[[-2.0152, -2.0152, -2.0152,  ..., -1.5870, -1.5185, -1.5699],
         [-2.0152, -2.0152, -2.0152,  ..., -1.0390, -1.0562, -1.0390],
         [-2.0152, -2.0152, -2.0152,  ..., -1.2788, -1.0733, -1.2274],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.9307, -1.9307, -1.9307,  ..., -1.4755, -1.4055, -1.4580],
         [-1.9307, -1.9307, -1.9307,  ..., -0.8978, -0.9153, -0.8803],
         [-1.9132, -1.9132, -1.9307,  ..., -1.1429, -0.9328, -1.0903],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.6650, -1.6650, -1.6650,  ..., -1.3339, -1.2816, -1.3164],
         [-1.6999, -1.6999,

In [None]:
# Score the trained model on the test split, which was passed to the Trainer
# neither as the training set nor as the evaluation set.
metrics = trainer.evaluate(eval_dataset=dataset_hf[Split.TEST])
print(f"Model training completed. Evaluation results: {metrics}")