In [42]:
import json
from pathlib import Path
import random
from collections import defaultdict

def create_yolo_dataset(folder_path: str, seed: "int | None" = None):
    """
    Creates train.txt, test.txt and val.txt inside `folder_path`
    based on PNG images in `folder_path` (excluding *.mask.png files).

    Every image must have a sibling JSON file (same name, `.json` suffix)
    whose "sourceImage" value identifies the original image it was derived
    from. Images without such a JSON file are reported and skipped.

    Splits data by original_id (to avoid data leakage) into
    80% train, 10% test, 10% val. The fractions are applied to the number
    of original ids and truncated, so val receives whatever remains.

    NOTE: The paths inside train.txt/test.txt/val.txt will be relative
    to `folder_path` (bare file names). This function does not write
    dataset.yaml.

    :param folder_path: Folder holding the PNG images and their JSON files.
    :param seed: Optional seed for the shuffle. With a seed, the same folder
                 contents always yield the same split. With the default
                 (None) the module-level `random` generator is used, exactly
                 as before, so the split changes from run to run unless the
                 caller seeded `random` beforehand.
    :raises KeyError: If a JSON file has no "sourceImage" entry.
    """
    dir_images = Path(folder_path).resolve()

    # Sorted so the grouping order (and therefore a seeded shuffle) does not
    # depend on the order in which the filesystem lists the directory.
    image_paths = sorted(
        p for p in dir_images.glob("*.png")
        if not p.name.endswith(".mask.png")
    )

    # original_id -> every image derived from that original
    groups = defaultdict(list)

    for img_path in image_paths:
        json_path = img_path.with_suffix(".json")
        if not json_path.exists():
            print(f"No matching JSON for {img_path.name}, skipping.")
            continue

        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        original_id = data["sourceImage"]
        groups[original_id].append(img_path)

    original_ids = list(groups.keys())
    if seed is None:
        # Keep honouring any random.seed() the caller may have issued.
        random.shuffle(original_ids)
    else:
        # A private generator leaves the global random state untouched.
        random.Random(seed).shuffle(original_ids)

    n = len(original_ids)
    train_end = int(0.80 * n)
    test_end = int(0.90 * n)

    # Whole groups go to a single split, so variants of one original image
    # never end up on both sides of a train/eval boundary.
    train_ids = original_ids[:train_end]
    test_ids = original_ids[train_end:test_end]
    val_ids = original_ids[test_end:]

    train_paths = [img for oid in train_ids for img in groups[oid]]
    test_paths = [img for oid in test_ids for img in groups[oid]]
    val_paths = [img for oid in val_ids for img in groups[oid]]

    # Helper function to write one relative path per line
    def write_split(filename: Path, paths: list[Path]):
        with filename.open("w") as f:
            for p in paths:
                f.write(p.name + "\n")

    write_split(dir_images / "train.txt", train_paths)
    write_split(dir_images / "test.txt", test_paths)
    write_split(dir_images / "val.txt", val_paths)


# Example usage: write the split files for each image folder in turn.
for dataset_folder in (
    "p_BaseImages",
    "p_VariantImages",
    "p_VariantImagesWithOcclusion",
):
    create_yolo_dataset(dataset_folder)

In [44]:
import os
from pathlib import Path

def process_folder_structure_with_labels(folder_path: str):
    """
    Reads train.txt, test.txt, val.txt from `folder_path` (which was created by create_yolo_dataset).
    Builds a YOLO directory structure in `datasets/<folder_name>` (relative to the
    current working directory), placing:

      datasets/<folder_name>/
        images/train/ -> symlinks to the train images
        images/test/  -> symlinks to the test images
        images/val/   -> symlinks to the val images
        labels/train/ -> symlinks to the corresponding train labels
        labels/test/  -> symlinks to the corresponding test labels
        labels/val/   -> symlinks to the corresponding val labels
        dataset.yaml  -> references these subfolders

    Symlinks already present in the six split folders are removed first, so
    that re-running after a new split never leaves an image linked in two
    splits at once. Regular files in those folders are left alone unless a
    new link has to take their exact name.

    If any of the three split files is missing, an error is printed and the
    function returns without creating anything.

    :param folder_path: The path to your original folder (e.g. "p_BaseImages"),
                        which must have train.txt, test.txt, val.txt
                        already generated by create_yolo_dataset(). Each line
                        of those files is a file name inside that folder; the
                        label of an image is the file with the same name and
                        a `.txt` suffix.
    """

    src_dir = Path(folder_path).resolve()
    folder_name = src_dir.name  # e.g. "p_BaseImages"

    split_names = ("train", "test", "val")
    split_files = {name: src_dir / f"{name}.txt" for name in split_names}

    # Validate the inputs before touching the output tree.
    for required_file in split_files.values():
        if not required_file.exists():
            print(f"[ERROR] {required_file} does not exist. "
                  "Did you run create_yolo_dataset first?")
            return

    # The new dataset directory, e.g. datasets/p_BaseImages
    dst_base = Path("datasets") / folder_name

    # keep the standard YOLO layout:
    # images/train, images/test, images/val, plus labels/train, labels/test, labels/val
    img_dirs = {name: dst_base / "images" / name for name in split_names}
    lbl_dirs = {name: dst_base / "labels" / name for name in split_names}

    for directory in (*img_dirs.values(), *lbl_dirs.values()):
        directory.mkdir(parents=True, exist_ok=True)
        # Links from a previous run may belong to a different split now;
        # leaving them in place would leak images between train/test/val.
        for entry in directory.iterdir():
            if entry.is_symlink():
                entry.unlink()

    def remove_if_present(link: Path):
        """Delete `link` if anything, including a dangling symlink, is there."""
        # exists() follows symlinks and is False for a dangling link, which
        # would then make os.symlink fail with FileExistsError.
        if link.is_symlink() or link.exists():
            link.unlink()

    def symlink_split(txt_file: Path, dst_img_dir: Path, dst_lbl_dir: Path):
        """
        For each filename in txt_file (one per line), create a symlink in dst_img_dir for the image,
        and a symlink in dst_lbl_dir for the label (same base name, .txt).
        """
        with txt_file.open("r") as f:
            lines = [line.strip() for line in f if line.strip()]

        for filename in lines:
            # The original image file in src_dir
            src_img = src_dir / filename  # e.g. p_BaseImages/image001.png
            if not src_img.exists():
                print(f"[WARNING] Image file not found: {src_img}, skipping.")
                continue

            # The corresponding label (same base name with .txt)
            # e.g. p_BaseImages/image001.txt
            src_lbl = src_img.with_suffix(".txt")
            has_label = src_lbl.exists()

            if not has_label:
                print(f"[WARNING] Label not found for {src_img.name}, expected {src_lbl.name}, skipping label symlink.")

            dst_img_link = dst_img_dir / src_img.name
            dst_lbl_link = dst_lbl_dir / src_lbl.name

            remove_if_present(dst_img_link)
            remove_if_present(dst_lbl_link)

            # src_dir is resolved, so the link targets are absolute paths.
            os.symlink(src_img, dst_img_link)
            if has_label:
                os.symlink(src_lbl, dst_lbl_link)

    # Symlink the train, test, val sets
    for name in split_names:
        symlink_split(split_files[name], img_dirs[name], lbl_dirs[name])

    print("[INFO] Created symlinks in:")
    print("  " + ", ".join(str(img_dirs[name]) for name in split_names))
    print("  " + ", ".join(str(lbl_dirs[name]) for name in split_names))

    # Create (or overwrite) dataset.yaml referencing these subfolders
    dataset_yaml_content = f"""# YOLO dataset config
# Automatically generated for {folder_name}

path: {folder_name}
train: images/train
val: images/val
test: images/test

names:
  0: runway
"""

    dst_dataset_yaml = dst_base / "dataset.yaml"
    with dst_dataset_yaml.open("w") as f:
        f.write(dataset_yaml_content)

    print(f"[INFO] Wrote dataset.yaml to {dst_dataset_yaml.resolve()}")
    print(f"[DONE] The dataset folder is ready at: {dst_base.resolve()}")



[INFO] Created symlinks in:
  datasets/p_BaseImages/images/train, datasets/p_BaseImages/images/test, datasets/p_BaseImages/images/val
  datasets/p_BaseImages/labels/train, datasets/p_BaseImages/labels/test, datasets/p_BaseImages/labels/val
[INFO] Wrote dataset.yaml to /home/gustavo-depaula/SyntheticRunwayDataset/datasets/p_BaseImages/dataset.yaml
[DONE] The dataset folder is ready at: /home/gustavo-depaula/SyntheticRunwayDataset/datasets/p_BaseImages
[INFO] Created symlinks in:
  datasets/p_VariantImages/images/train, datasets/p_VariantImages/images/test, datasets/p_VariantImages/images/val
  datasets/p_VariantImages/labels/train, datasets/p_VariantImages/labels/test, datasets/p_VariantImages/labels/val
[INFO] Wrote dataset.yaml to /home/gustavo-depaula/SyntheticRunwayDataset/datasets/p_VariantImages/dataset.yaml
[DONE] The dataset folder is ready at: /home/gustavo-depaula/SyntheticRunwayDataset/datasets/p_VariantImages
[INFO] Created symlinks in:
  datasets/p_VariantImagesWithOcclusio

In [None]:
# Build datasets/p_BaseImages (symlinked images and labels plus dataset.yaml).
process_folder_structure_with_labels("p_BaseImages")

In [47]:
# Build the YOLO folder layout for both variant datasets, one after the other.
for variant_folder in ("p_VariantImages", "p_VariantImagesWithOcclusion"):
    process_folder_structure_with_labels(variant_folder)

[INFO] Created symlinks in:
  datasets/p_VariantImages/images/train, datasets/p_VariantImages/images/test, datasets/p_VariantImages/images/val
  datasets/p_VariantImages/labels/train, datasets/p_VariantImages/labels/test, datasets/p_VariantImages/labels/val
[INFO] Wrote dataset.yaml to /home/gustavo-depaula/SyntheticRunwayDataset/datasets/p_VariantImages/dataset.yaml
[DONE] The dataset folder is ready at: /home/gustavo-depaula/SyntheticRunwayDataset/datasets/p_VariantImages
[INFO] Created symlinks in:
  datasets/p_VariantImagesWithOcclusion/images/train, datasets/p_VariantImagesWithOcclusion/images/test, datasets/p_VariantImagesWithOcclusion/images/val
  datasets/p_VariantImagesWithOcclusion/labels/train, datasets/p_VariantImagesWithOcclusion/labels/test, datasets/p_VariantImagesWithOcclusion/labels/val
[INFO] Wrote dataset.yaml to /home/gustavo-depaula/SyntheticRunwayDataset/datasets/p_VariantImagesWithOcclusion/dataset.yaml
[DONE] The dataset folder is ready at: /home/gustavo-depaula