In [1]:
import os


In [2]:
data_path = "/home/cmurphy/datasets/toma/box-detection-coco-1.1"
output_path = "/home/cmurphy/datasets/toma/box-detection-yolo"

In [18]:
import pandas as pd

train = pd.read_parquet(os.path.join(data_path, "train.parquet"))
val = pd.read_parquet(os.path.join(data_path, "val.parquet"))

In [None]:
from io import BytesIO
from PIL import Image

CATEGORY_MAP = {
    0: "open",
    1: "closed",
}


def process_object_to_annotation(obj: dict) -> list:
    annotations = []
    for i, box, category in zip(obj["id"], obj["bbox"], obj["category"]):
        annotations.append(
            {
                "category": CATEGORY_MAP.get(category, None),
                "bbox": box,
            }
        )
    return annotations


train["image"] = train["image"].apply(lambda x: Image.open(BytesIO(x)))
train["image_type"] = "image"
train["annotations"] = train["objects"].apply(process_object_to_annotation)
train["label_type"] = "xywh"
train["split"] = "train"

val["image"] = val["image"].apply(lambda x: Image.open(BytesIO(x)))
val["image_type"] = "image"
val["annotations"] = val["objects"].apply(process_object_to_annotation)
val["label_type"] = "xywh"
val["split"] = "val"

dataset = pd.concat([train, val], ignore_index=True)[
    ["image", "image_type", "annotations", "label_type", "split"]
]


In [None]:
import os
import json
import requests
import base64
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import uuid

"""
Example of using the PrepareYoloData class:

Assuming df_train and df_val are already loaded and have the following columns:
- url: str, the url of the image
- annotations: str, the annotations of the image, which is a list of dicts, 
    - item: {
        "left": int,
        "top": int,
        "width": int,
        "height": int,
        "label": str
    }
df_all = pd.concat([df_train, df_val])
df_all['image'] = df_all['url']
df_all['image_type'] = 'url'
df_all['annotations'] = df_all['annotations'].apply(lambda x_list: [{"category": x['label'], "bbox": [x['left'], x['top'], x['width'], x['height']]} for x in eval(x_list)])
df_all['label_type'] = 'xywh'

PrepareYoloData().process_df(df_all, output_dir="data/custom_dataset")

"""


class PrepareYoloData:
    def _extract_unique_categories(self, df):
        """
        Extract unique category labels from the DataFrame.

        Args:
            df: DataFrame with a "annotations" column containing lists of label dictionaries

        Returns:
            Dictionary mapping category names to category IDs
        """
        unique_labels = set()
        for labels in df["annotations"]:
            if isinstance(labels, list):
                for label in labels:
                    if isinstance(label, dict) and "category" in label:
                        unique_labels.add(label["category"])
                    else:
                        raise ValueError(
                            f"Unrecognized label format: {label}, expecting dict with 'category' and 'bbox'"
                        )
            else:
                raise ValueError(f"Label is not a list of dicts: {labels}")

        return {cat: i + 1 for i, cat in enumerate(sorted(unique_labels))}

    def process_df(self, df, output_dir="yolo_dataset", category_mapping=None):
        """
        Convert a DataFrame with image and label information to COCO format.

        Args:
            df: DataFrame with columns ["image", "annotations", "image_type", "label_type", "split"]
            output_dir: Directory to save the dataset
            annotations is a list of dicts, each dict contains "category" and "bbox"
            category_mapping: Optional dictionary mapping label names to category IDs

        Returns:
            Dictionary containing paths to the created dataset files
        """
        # Collect unique values in split
        unique_splits = df["split"].unique()

        # Create output directories
        os.makedirs(output_dir, exist_ok=True)
        for split in unique_splits:
            os.makedirs(os.path.join(output_dir, "images", split), exist_ok=True)
        os.makedirs(os.path.join(output_dir, "annotations"), exist_ok=True)

        # Initialize COCO format dictionaries for train and val
        coco_data = {
            split: {"images": [], "annotations": [], "categories": []}
            for split in unique_splits
        }

        # Create category list if not provided
        if category_mapping is None:
            category_mapping = self._extract_unique_categories(df)

        # Populate categories in COCO format
        for category_name, category_id in category_mapping.items():
            category_info = {
                "id": category_id,
                "name": category_name,
                "supercategory": "none",
            }
            for split in unique_splits:
                coco_data[split]["categories"].append(category_info)

        # Process each row in the DataFrame
        annotation_id = 1
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing images"):
            split = row["split"]
            if split not in unique_splits:
                continue

            # Get image
            image_data = row["image"]
            image_type = row["image_type"]

            # Load image based on type
            if image_type == "url":
                try:
                    response = requests.get(image_data, timeout=10)
                    img = Image.open(BytesIO(response.content))
                except Exception as e:
                    print(f"Error downloading image from URL {image_data}: {e}")
                    continue
            elif image_type == "image_path":
                try:
                    img = Image.open(image_data)
                except Exception as e:
                    print(f"Error opening image from path {image_data}: {e}")
                    continue
            elif image_type == "base64":
                try:
                    img_bytes = base64.b64decode(image_data)
                    img = Image.open(BytesIO(img_bytes))
                except Exception as e:
                    print(f"Error decoding base64 image: {e}")
                    continue
            elif image_type == "image":
                try:
                    img = image_data
                except Exception as e:
                    print(f"Error converting image from array: {e}")
                    continue
            else:
                print(f"Unsupported image type: {image_type}")
                continue

            # Generate a unique filename
            image_id = idx
            file_name = f"{image_id:012d}.jpg"
            img_path = os.path.join(output_dir, "images", split, file_name)

            # Save image
            img = img.convert("RGB")
            img.save(img_path)

            # Get image dimensions
            width, height = img.size

            # Add image info to COCO format
            image_info = {
                "id": image_id,
                "file_name": file_name,
                "width": width,
                "height": height,
                "date_captured": "",
                "license": 1,
                "coco_url": "",
                "flickr_url": "",
            }
            coco_data[split]["images"].append(image_info)

            # Process labels
            annotations = row["annotations"]
            if not isinstance(annotations, list):
                continue

            for annotation in annotations:
                if not isinstance(annotation, dict) or (
                    "category" not in annotation or "bbox" not in annotation
                ):
                    raise ValueError(
                        f"Unrecognized label format: {annotation}, expecting dict with 'category' and 'bbox'"
                    )

                category_name = annotation["category"]
                if category_name not in category_mapping:
                    raise ValueError(f"Unrecognized category: {category_name}")

                category_id = category_mapping[category_name]
                bbox = annotation["bbox"]

                # Convert bbox based on label_type
                label_type = row["label_type"]
                if label_type == "xyxy":
                    # [x1, y1, x2, y2] to COCO [x, y, width, height]
                    x1, y1, x2, y2 = bbox
                    x, y = x1, y1
                    w, h = x2 - x1, y2 - y1
                elif label_type == "xywh":
                    # [x, y, w, h] to COCO [x, y, width, height]
                    x, y, w, h = bbox
                else:
                    print(f"Unsupported label type: {label_type}")
                    continue

                # Create segmentation (simple polygon from bbox)

                segmentation = [[x, y, x + w, y, x + w, y + h, x, y + h]]
                segmentation = [[float(p) for p in segmentation[0]]]

                # Calculate area
                area = w * h

                # Add annotation to COCO format
                annotation_info = {
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": category_id,
                    "bbox": [float(x), float(y), float(w), float(h)],
                    "area": float(area),
                    "segmentation": segmentation,
                    "iscrowd": 0,
                }
                coco_data[split]["annotations"].append(annotation_info)
                annotation_id += 1

        # Save COCO format annotations
        train_json_path = os.path.join(
            output_dir, "annotations", "instances_train.json"
        )
        val_json_path = os.path.join(output_dir, "annotations", "instances_val.json")

        with open(train_json_path, "w") as f:
            json.dump(coco_data["train"], f)

        with open(val_json_path, "w") as f:
            json.dump(coco_data["val"], f)

        return {
            "train_images": os.path.join(output_dir, "images", "train"),
            "val_images": os.path.join(output_dir, "images", "val"),
            "train_annotations": train_json_path,
            "val_annotations": val_json_path,
        }

In [36]:
PrepareYoloData().process_df(
    dataset, output_dir=output_path, category_mapping={"open": 0, "closed": 1}
)

Processing images: 100%|██████████| 7709/7709 [00:11<00:00, 684.29it/s]


TypeError: Object of type int64 is not JSON serializable

In [22]:
import json

with open(os.path.join(output_path, "annotations", "instances_train.json")) as f:
    train_data = json.load(f)

In [24]:
train_data["annotations"]

[{'id': 1,
  'image_id': 0,
  'category_id': 1,
  'bbox': [350.0, 222.0, 109.0, 46.0],
  'area': 5014.0,
  'segmentation': [[0.546875,
    0.4625,
    0.7171875,
    0.4625,
    0.7171875,
    0.5583333333333333,
    0.546875,
    0.5583333333333333]],
  'iscrowd': 0},
 {'id': 2,
  'image_id': 1,
  'category_id': 1,
  'bbox': [135.0, 160.0, 419.0, 257.0],
  'area': 107683.0,
  'segmentation': [[0.2109375,
    0.3333333333333333,
    0.865625,
    0.3333333333333333,
    0.865625,
    0.86875,
    0.2109375,
    0.86875]],
  'iscrowd': 0},
 {'id': 3,
  'image_id': 2,
  'category_id': 0,
  'bbox': [90.0, 9.0, 464.0, 471.0],
  'area': 218544.0,
  'segmentation': [[0.140625,
    0.01875,
    0.865625,
    0.01875,
    0.865625,
    1.0,
    0.140625,
    1.0]],
  'iscrowd': 0},
 {'id': 4,
  'image_id': 3,
  'category_id': 1,
  'bbox': [0.0, 90.0, 638.0, 456.0],
  'area': 290928.0,
  'segmentation': [[0.0,
    0.140625,
    0.996875,
    0.140625,
    0.996875,
    0.853125,
    0.0,
    0.

In [None]:
import torch

print(output_path)
train = torch.load(os.path.join(output_path, "train.cache"))

/home/cmurphy/datasets/toma/box-detection-yolo


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL pathlib.PosixPath was not an allowed global by default. Please use `torch.serialization.add_safe_globals([PosixPath])` or the `torch.serialization.safe_globals([PosixPath])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [None]:
torch.load(
    "/home/cmurphy/datasets/toma/box-detection-yolo/train.cache", weights_only=False
)

[(PosixPath('/home/cmurphy/datasets/toma/box-detection-yolo/images/train/000000000000.jpg'),
  tensor([[1.0000e+00, 8.5449e-04, 9.6354e-04, 1.1206e-03, 1.1632e-03]],
         dtype=torch.float64),
  0.0),
 (PosixPath('/home/cmurphy/datasets/toma/box-detection-yolo/images/train/000000000001.jpg'),
  tensor([[1.0000e+00, 3.2959e-04, 6.9444e-04, 1.3525e-03, 1.8099e-03]],
         dtype=torch.float64),
  0.0),
 (PosixPath('/home/cmurphy/datasets/toma/box-detection-yolo/images/train/000000000002.jpg'),
  tensor([[0.0000e+00, 2.1973e-04, 3.9063e-05, 1.3525e-03, 2.0833e-03]],
         dtype=torch.float64),
  0.0),
 (PosixPath('/home/cmurphy/datasets/toma/box-detection-yolo/images/train/000000000003.jpg'),
  tensor([[1.0000e+00, 0.0000e+00, 2.1973e-04, 1.5576e-03, 1.3330e-03]],
         dtype=torch.float64),
  0.0),
 (PosixPath('/home/cmurphy/datasets/toma/box-detection-yolo/images/train/000000000004.jpg'),
  tensor([[0.0000, 0.0004, 0.0009, 0.0008, 0.0020]], dtype=torch.float64),
  0.0),
 (Po