In [9]:
import os

import pandas as pd

data = pd.read_json(os.path.join("advanced", "vlm.jsonl"), lines=True)

# build unique classes in dataset
id2label = ["missile", "cargo", "commercial", "light", "helicopter", "fighter", "drone", "others"]
label2id = {}
for idx, label in enumerate(id2label):
    label2id[label] = idx
print(len(id2label))
id2label[7]

8


'others'

In [10]:
# Val images and labels
from PIL import Image

new_dir = "datasets/vlm_detr"
old_dir = "advanced"
dataset_type = "val"

# Delete the directory if it exists
if os.path.exists(os.path.join(new_dir, dataset_type)):
    os.system(f"rm -rf {os.path.join(new_dir, dataset_type)}")

# Create the coco format
for idx, row in data.iterrows():
    image_path = row["image"]
    annotations = row["annotations"]
    new_image_path = os.path.join(new_dir, dataset_type, 'images', image_path)
    new_txt_path = new_image_path.replace("jpg", "txt").replace("images", "labels")
    # Copy the image to the new directory
    os.makedirs(os.path.dirname(new_image_path), exist_ok=True)
    os.makedirs(os.path.dirname(new_txt_path), exist_ok=True)
    os.system(f"cp {os.path.join(old_dir, 'images', image_path)} {new_image_path}")

    # Get the size of the image
    with Image.open(os.path.join(old_dir, "images", image_path)) as img:
        width, height = img.size
        # Since the image is not the actual image
        width = 1520
        height = 870

    with open(new_txt_path, "w") as f:
        for annotation in annotations:
            caption = annotation["caption"]
            bbox = annotation["bbox"]
            x, y, w, h = bbox
            x_center = x + w / 2
            y_center = y + h / 2
            
            class_id = label2id["others"]
            for word in caption.split(" "):
                if word in label2id.keys():
                    class_id = label2id[word]

            # Normalize the values to be between 0 and 1
            x_center /= width
            y_center /= height
            w /= width
            h /= height
            
            f.write(f"{class_id} {x_center} {y_center} {w} {h}\n")

In [11]:
# Train images and labels
from PIL import Image

new_dir = "datasets/vlm_detr"
old_dir = "advanced"
dataset_type = "train"

# Delete the directory if it exists
if os.path.exists(os.path.join(new_dir, dataset_type)):
    os.system(f"rm -rf {os.path.join(new_dir, dataset_type)}")

# Create the coco format
for idx, row in data.iterrows():
    image_path = row["image"]
    annotations = row["annotations"]
    new_image_path = os.path.join(new_dir, dataset_type, 'images', image_path)
    new_txt_path = new_image_path.replace("jpg", "txt").replace("images", "labels")
    # Copy the image to the new directory
    os.makedirs(os.path.dirname(new_image_path), exist_ok=True)
    os.makedirs(os.path.dirname(new_txt_path), exist_ok=True)
    os.system(f"cp {os.path.join(old_dir, 'images', image_path)} {new_image_path}")

    # Get the size of the image
    with Image.open(os.path.join(old_dir, "images", image_path)) as img:
        width, height = img.size
        # Since the image is not the actual image
        width = 1520
        height = 870

    with open(new_txt_path, "w") as f:
        for annotation in annotations:
            caption = annotation["caption"]
            bbox = annotation["bbox"]
            x, y, w, h = bbox
            x_center = x + w / 2
            y_center = y + h / 2
            
            class_id = label2id["others"]
            for word in caption.split(" "):
                if word in label2id.keys():
                    class_id = label2id[word]

            # Normalize the values to be between 0 and 1
            x_center /= width
            y_center /= height
            w /= width
            h /= height
            
            f.write(f"{class_id} {x_center} {y_center} {w} {h}\n")

In [12]:
# Check if the files have been created
train_dir = os.path.join(new_dir, "train")
val_dir = os.path.join(new_dir, "val")
train_dir_images = os.path.join(train_dir, "images")
val_dir_images = os.path.join(val_dir, "images")

train_dir_labels = os.path.join(train_dir, "labels")
val_dir_labels = os.path.join(val_dir, "labels")

print(f"Number of images in train: {len(os.listdir(train_dir_images))}")
print(f"Number of images in val: {len(os.listdir(val_dir_images))}")

print(f"Number of labels in train: {len(os.listdir(train_dir_labels))}")
print(f"Number of labels in val: {len(os.listdir(val_dir_labels))}")

Number of images in train: 5107
Number of images in val: 5107
Number of labels in train: 5107
Number of labels in val: 5107


In [13]:
images_path_list = os.listdir(train_dir_images) 

for x in images_path_list:
    path = os.path.join(train_dir_images, x)
    image = Image.open(path)
    if image.size != (1520, 870):
        print("No right size")

In [14]:
# Add data.yaml file into data directory
# Format of the data.yaml file
"""
train: ../train/images
val: ../val/images

nc: number of classes
names: [class1, class2, class3, ...]
"""

data_yaml_path = os.path.join(new_dir, "data.yaml")

with open(data_yaml_path, "w") as f:
    f.write(f"train: ../train/images\n")
    f.write(f"val: ../val/images\n\n")
    f.write(f"nc: {len(id2label)}\n")
    f.write(f"names: {id2label}\n")

In [15]:
from ultralytics import RTDETR
from ultralytics.data.augment import Albumentations
from ultralytics.utils import LOGGER, colorstr

In [16]:
# best_model_path = os.path.join("runs", "detect", "train93", "weights", "best.pt")
model = RTDETR("models/detr/rtdetr-x.pt")

In [17]:
new_dir = "datasets/vlm_detr"
data_yaml_path = os.path.join(new_dir, "data.yaml")
data_yaml_path

'datasets/vlm_detr/data.yaml'

In [18]:
def __init__(self, p=1.0):
        """Initialize the transform object for YOLO bbox formatted params."""
        self.p = p
        self.transform = None
        prefix = colorstr("albumentations: ")
        try:
            import albumentations as A
            
            spatial_transforms = {
                "Affine",
                "BBoxSafeRandomCrop",
                "CenterCrop",
                "CoarseDropout",
                "Crop",
                "CropAndPad",
                "CropNonEmptyMaskIfExists",
                "D4",
                "ElasticTransform",
                "Flip",
                "GridDistortion",
                "GridDropout",
                "HorizontalFlip",
                "Lambda",
                "LongestMaxSize",
                "MaskDropout",
                "MixUp",
                "Morphological",
                "NoOp",
                "OpticalDistortion",
                "PadIfNeeded",
                "Perspective",
                "PiecewiseAffine",
                "PixelDropout",
                "RandomCrop",
                "RandomCropFromBorders",
                "RandomGridShuffle",
                "RandomResizedCrop",
                "RandomRotate90",
                "RandomScale",
                "RandomSizedBBoxSafeCrop",
                "RandomSizedCrop",
                "Resize",
                "Rotate",
                "SafeRotate",
                "ShiftScaleRotate",
                "SmallestMaxSize",
                "Transpose",
                "VerticalFlip",
                "XYMasking",
            }  # from https://albumentations.ai/docs/getting_started/transforms_and_targets/#spatial-level-transforms

            # Insert required transformation here
            T = [
                # A.RandomRain(p=0.4, slant_lower=-10, slant_upper=10, 
                #               drop_length=20, drop_width=1, drop_color=(200, 200, 200), 
                #               blur_value=5, brightness_coefficient=0.9, rain_type=None),
                # A.Rotate(limit = 10, p=0.5),
                # A.Blur(p=0.1),
                # A.HorizontalFlip(p=0.5),  # Adds horizontal flipping with a 50% probability
                # A.VerticalFlip(p=0.5),    # Adds 
                # A.MedianBlur(p=0.1),
                # A.ImageCompression(quality_lower=75, p=0.0),
                A.Rotate(limit=15, p=0.3),
                A.Blur(blur_limit=(3, 5), p=0.3),
                A.RandomSizedCrop(min_max_height=(int(0.8 * 640), 640), height=640, width=640, p=0.3)
            ]

            self.contains_spatial = any(transform.__class__.__name__ in spatial_transforms for transform in T)
            self.transform = (
                A.Compose(T, bbox_params=A.BboxParams(format="yolo", label_fields=["class_labels"]))
                if self.contains_spatial
                else A.Compose(T)
            )
            
            LOGGER.info(prefix + ", ".join(f"{x}".replace("always_apply=False, ", "") for x in T if x.p))
        except ImportError:  # package not installed, skip
            print("Importing error")
        except Exception as e:
            LOGGER.info(f"{prefix}{e}")

Albumentations.__init__ = __init__

In [None]:
# Specify the save directory for training runs
save_dir = os.path.join("models", "detr", "logs")
os.makedirs(save_dir, exist_ok=True)

results = model.train(data=data_yaml_path, epochs=100, imgsz=640, augment=True, batch=8, project="models/detr")

New https://pypi.org/project/ultralytics/8.2.27 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.2.22 🚀 Python-3.10.14 torch-1.13.1+cu117 CUDA:0 (Tesla T4, 14918MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=models/detr/rtdetr-x.pt, data=datasets/vlm_detr/data.yaml, epochs=100, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=models/detr, name=train6, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=True, agnostic_nms=False, classes=None, retina_masks=Fal

[34m[1mtrain: [0mScanning /home/jupyter/datasets/vlm_detr/train/labels... 5107 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5107/5107 [00:04<00:00, 1039.69it/s]


[34m[1mtrain: [0mNew cache created: /home/jupyter/datasets/vlm_detr/train/labels.cache
[34m[1malbumentations: [0mRotate(p=0.3, limit=(-15, 15), interpolation=1, border_mode=4, value=None, mask_value=None), Blur(p=0.3, blur_limit=(3, 5)), RandomSizedCrop(p=0.3, min_max_height=(512, 640), height=640, width=640, w2h_ratio=1.0, interpolation=1)


[34m[1mval: [0mScanning /home/jupyter/datasets/vlm_detr/val/labels... 5107 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5107/5107 [00:04<00:00, 1064.66it/s]


[34m[1mval: [0mNew cache created: /home/jupyter/datasets/vlm_detr/val/labels.cache
Plotting labels to models/detr/train6/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000833, momentum=0.9) with parameter groups 193 weight(decay=0.0), 256 weight(decay=0.0005), 276 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 4 dataloader workers
Logging results to [1mmodels/detr/train6[0m
Starting training for 100 epochs...

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      1/100        12G      0.565      2.384      0.096          6        640: 100%|██████████| 639/639 [08:45<00:00,  1.22it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:26<00:00,  2.18it/s]


                   all       5107      27913      0.776      0.671      0.652      0.484

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      2/100      11.9G     0.4523     0.5935    0.06132         14        640: 100%|██████████| 639/639 [08:23<00:00,  1.27it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:27<00:00,  2.18it/s]


                   all       5107      27913      0.916      0.849      0.866      0.646

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      3/100      11.9G     0.4342     0.5479     0.0568         18        640: 100%|██████████| 639/639 [08:17<00:00,  1.29it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:26<00:00,  2.18it/s]


                   all       5107      27913      0.928      0.875       0.89      0.607

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      4/100      11.9G     0.4229     0.5306    0.05391         21        640: 100%|██████████| 639/639 [08:13<00:00,  1.30it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:26<00:00,  2.18it/s]


                   all       5107      27913      0.943      0.893      0.915      0.636

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      5/100        12G     0.4093     0.5028    0.05221         27        640: 100%|██████████| 639/639 [08:10<00:00,  1.30it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:26<00:00,  2.19it/s]


                   all       5107      27913      0.928        0.9      0.927      0.717

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      6/100        12G      0.401      0.502    0.05097         14        640: 100%|██████████| 639/639 [08:11<00:00,  1.30it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:26<00:00,  2.19it/s]


                   all       5107      27913      0.949      0.888      0.918      0.707

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      7/100      11.9G     0.3933     0.4789    0.04979         18        640: 100%|██████████| 639/639 [08:14<00:00,  1.29it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:29<00:00,  2.14it/s]


                   all       5107      27913      0.962      0.933      0.948       0.73

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      8/100      11.9G      0.391     0.4667    0.04955         27        640: 100%|██████████| 639/639 [08:12<00:00,  1.30it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:29<00:00,  2.14it/s]


                   all       5107      27913      0.974      0.942      0.958      0.749

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


      9/100      11.9G     0.3866     0.4617    0.04828         16        640: 100%|██████████| 639/639 [08:11<00:00,  1.30it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:30<00:00,  2.13it/s]


                   all       5107      27913      0.967      0.942      0.955      0.725

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


     10/100        12G     0.3817     0.4568    0.04832         20        640: 100%|██████████| 639/639 [08:15<00:00,  1.29it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:47<00:00,  1.91it/s]


                   all       5107      27913      0.949      0.923      0.945      0.711

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


     11/100      11.9G     0.3793     0.4558    0.04728         24        640: 100%|██████████| 639/639 [08:24<00:00,  1.27it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 320/320 [02:34<00:00,  2.08it/s]


                   all       5107      27913        nan      0.969      0.961      0.747

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


     12/100      11.9G     0.3704     0.4371    0.04543         27        640:  31%|███       | 199/639 [02:33<05:47,  1.27it/s]

In [None]:
# Conduct model validation on the COCO8 example dataset
metrics = model.val(data=data_yaml_path)

In [11]:
test_image_path = os.path.join("advanced", "images", "image_0.jpg")

results = model.predict(test_image_path)

results[0].show()


image 1/1 /home/jupyter/advanced/images/image_0.jpg: 384x640 1 blue, yellow, and green fighter plane, 1 grey and white light aircraft, 1 white and blue fighter jet, 1 yellow, red, and blue fighter plane, 1 black and white missile, 1 grey and yellow fighter plane, 1 white and red fighter jet, 19.6ms
Speed: 18.4ms preprocess, 19.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


/usr/bin/xdg-open: 882: www-browser: not found
/usr/bin/xdg-open: 882: links2: not found
/usr/bin/xdg-open: 882: elinks: not found
/usr/bin/xdg-open: 882: links: not found
/usr/bin/xdg-open: 882: lynx: not found
/usr/bin/xdg-open: 882: w3m: not found
xdg-open: no method available for opening '/var/tmp/tmptb9y45dh.PNG'


In [17]:
bboxes = []

bboxes_tensors = results[0].boxes.xywh
for tensor in bboxes_tensors:
    bboxes.append(tensor.cpu().numpy())

In [18]:
bboxes

[array([     576.76,      137.63,      99.661,      41.889], dtype=float32),
 array([        578,      138.45,      98.336,      40.679], dtype=float32),
 array([     1044.1,      93.673,      30.411,      34.963], dtype=float32),
 array([     1044.1,      93.753,      30.103,      34.893], dtype=float32),
 array([     742.72,      538.83,      79.015,      65.385], dtype=float32),
 array([     580.46,      621.92,      17.883,      17.654], dtype=float32),
 array([     743.31,      538.85,      80.516,        67.6], dtype=float32)]

In [19]:
from PIL import Image, ImageDraw
import torch

image = Image.open(test_image_path)

# Create a draw object
draw = ImageDraw.Draw(image)

for bbox in bboxes:
# Extract the center coordinates, width, and height
    x_center, y_center, width, height = bbox

    x1 = x_center - width / 2
    y1 = y_center - height / 2
    x2 = x_center + width / 2
    y2 = y_center + height / 2


    # Draw the rectangle
    draw.rectangle([x1, y1, x2, y2], outline="red", width=2)

# Display the image (if using Jupyter Notebook)
image.show()

# If not in a notebook, save or display the image as needed
image.save('output_image_with_bbox_2.jpg')


/usr/bin/xdg-open: 882: www-browser: not found
/usr/bin/xdg-open: 882: links2: not found
/usr/bin/xdg-open: 882: elinks: not found
/usr/bin/xdg-open: 882: links: not found
/usr/bin/xdg-open: 882: lynx: not found
/usr/bin/xdg-open: 882: w3m: not found
xdg-open: no method available for opening '/var/tmp/tmpspw2nzaq.PNG'
