In [1]:
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

In [2]:
%pip install -U datasets transformers[torch] evaluate timm albumentations accelerate

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# https://huggingface.co/learn/computer-vision-course/unit3/vision-transformers/vision-transformer-for-objection-detection

In [4]:
import albumentations
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoImageProcessor,
    AutoModelForObjectDetection,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
data = load_dataset("moyanxinxu/container")

In [6]:
data

DatasetDict({
    train: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 1049
    })
    test: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 263
    })
})

In [7]:
preprocessor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")

In [8]:
data["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1120x1080>,
 'image_id': 475,
 'width': 1120,
 'height': 1080,
 'objects': {'area': [23562.0],
  'bbox': [[415.0, 592.0, 374.0, 63.0]],
  'category': ['container'],
  'id': [0]}}

In [9]:
aug = albumentations.Compose(
    transforms=[
        albumentations.Resize(480, 480),
        albumentations.HorizontalFlip(p=1),
        albumentations.RandomBrightnessContrast(p=1.0),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

In [10]:
def datapipe(data):
    images, bboxes, areas, categories, targets = [], [], [], [], []

    image_ids = data["image_id"]

    for image, objects in zip(data["image"], data["objects"]):
        image = np.array(image.convert("RGB"))[:, :, ::-1]
        out = aug(image=image, bboxes=objects["bbox"], category=objects["id"])

        areas.append(objects["area"])

        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])

    for image_id, category, area, box in zip(image_ids, categories, areas, bboxes):
        annotations = []

        for _category, _area, _box in zip(category, area, box):
            new_ann = {
                "image_id": image_id,
                "category_id": _category,
                "isCrowd": 0,
                "area": _area,
                "bbox": list(_box),
            }
            annotations.append(new_ann)
        targets.append({"image_id": image_id, "annotations": annotations})
    return preprocessor(images=images, annotations=targets, return_tensors="pt")

In [11]:
train_data = data["train"].with_transform(datapipe)
# val_data = data["validation"].with_transform(datapipe)
test_data = data["test"].with_transform(datapipe)

In [12]:
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    output = preprocessor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]

    ret = {}

    ret["pixel_values"] = output["pixel_values"]
    ret["pixel_mask"] = output["pixel_mask"]
    ret["labels"] = labels

    return ret

In [13]:
print(train_data[0])

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


{'pixel_values': tensor([[[-0.3198, -0.3712, -0.4568,  ...,  0.0569,  0.0398,  0.0398],
         [-0.3883, -0.4397, -0.5424,  ...,  0.0569,  0.0398,  0.0398],
         [-0.4739, -0.5424, -0.6623,  ...,  0.0398,  0.0227,  0.0227],
         ...,
         [ 0.3138,  0.2796,  0.2282,  ...,  0.2282,  0.2624,  0.2796],
         [ 0.2453,  0.2453,  0.2624,  ...,  0.2624,  0.2796,  0.2624],
         [ 0.1939,  0.2282,  0.2967,  ...,  0.2967,  0.2796,  0.2624]],

        [[-0.1275, -0.2150, -0.3375,  ...,  0.1001,  0.0826,  0.0826],
         [-0.1975, -0.3025, -0.4251,  ...,  0.1001,  0.0826,  0.0826],
         [-0.3200, -0.4251, -0.5651,  ...,  0.0826,  0.0651,  0.0651],
         ...,
         [ 0.5028,  0.4678,  0.4328,  ...,  0.1527,  0.2052,  0.2402],
         [ 0.4328,  0.4328,  0.4678,  ...,  0.1702,  0.1702,  0.1702],
         [ 0.3803,  0.4153,  0.4853,  ...,  0.1702,  0.1527,  0.1352]],

        [[ 0.0953,  0.0082, -0.1138,  ...,  0.2348,  0.2173,  0.1999],
         [ 0.0082, -0.0964, 

In [14]:
id2label = {0: "container"}
label2id = {"container": 0}


model = AutoModelForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model checkpoin

In [19]:
training_args = TrainingArguments(
    output_dir="detr-resnet-50-container-finetuned",
    per_device_train_batch_size=10,
    num_train_epochs=20,
    eval_strategy="epoch",
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
)

# Define the trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=preprocessor,
)

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.409549
2,No log,1.270554
3,No log,1.10066
4,No log,1.064438
5,1.888600,0.94241
6,1.888600,0.879503
7,1.888600,0.828298
8,1.888600,0.816551
9,1.888600,0.780028
10,0.844300,0.736942


In [None]:
!tar -zcvf model.tar.gz detr-resnet-50-container-finetuned/