In [4]:
from pathlib import Path
import random
import shutil
import yaml

In [5]:
# Split configuration: dataset root, fraction of pairs sent to validation,
# and the seed for the global ``random`` generator.
dataset_dir = Path("/mnt/d/DataSets/whiteboard").resolve()
val_size = 0.1
seed = 42

random.seed(seed)

# Images and labels live in parallel trees, each with train/ and val/ splits.
images_root = dataset_dir / "images"
labels_root = dataset_dir / "labels"

images_train, images_val = images_root / "train", images_root / "val"
labels_train, labels_val = labels_root / "train", labels_root / "val"

# Create the validation directories if they are not there yet.
for val_dir in (images_val, labels_val):
    val_dir.mkdir(parents=True, exist_ok=True)


def flatten(dir_path: Path) -> None:
    """Move the contents of each immediate subdirectory up into *dir_path*.

    Every entry found directly inside a subdirectory of ``dir_path`` is moved
    to ``dir_path`` itself. An entry whose name already exists in ``dir_path``
    is left where it is, so nothing is overwritten. A subdirectory is removed
    only once it is empty; one that still holds skipped entries is kept.

    Only one level is processed: a directory nested inside a subdirectory is
    moved as a whole, not flattened further.

    Args:
        dir_path: Directory whose immediate subdirectories are flattened.
    """
    # Snapshot the listing: entries are added to dir_path while we work.
    for sub in list(dir_path.iterdir()):
        if not sub.is_dir():
            continue

        # Snapshot again, since entries are moved out of ``sub`` as we go.
        for entry in list(sub.iterdir()):
            target = dir_path / entry.name
            if not target.exists():
                shutil.move(str(entry), str(target))

        # rmdir() raises OSError on a non-empty directory, which happens
        # whenever a name collision made us skip an entry above.
        if not any(sub.iterdir()):
            sub.rmdir()


def _collect_pairs(images_dir: Path, labels_dir: Path) -> list:
    """Return sorted (image, label) path pairs for images that have a label.

    An image counts when its suffix is .jpg, .jpeg or .png (case-insensitive)
    and ``labels_dir`` contains a ``<stem>.txt`` file for it.
    """
    image_suffixes = {".jpg", ".jpeg", ".png"}
    found = []
    # iterdir() yields entries in arbitrary order; sort so that the seeded
    # shuffle below selects the same files on every machine.
    for img in sorted(images_dir.iterdir()):
        if img.suffix.lower() not in image_suffixes:
            continue
        lbl = labels_dir / f"{img.stem}.txt"
        if lbl.exists():
            found.append((img, lbl))
    return found


flatten(images_train)
flatten(labels_train)

# Load data.yaml before moving anything, so a missing or unreadable config
# fails here rather than after the dataset has been partially split.
data_yaml = dataset_dir / "data.yaml"

with open(data_yaml, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f) or {}

pairs = _collect_pairs(images_train, labels_train)
# Pairs already sitting in val (e.g. from a previous run of this cell).
existing_val = len(_collect_pairs(images_val, labels_val))

random.shuffle(pairs)

# Size the validation set against the whole dataset and move only the
# shortfall; re-running the cell therefore does not keep shrinking train.
val_count = max(existing_val, int((len(pairs) + existing_val) * val_size))
moved_count = min(len(pairs), val_count - existing_val)
val_pairs = pairs[:moved_count]

for img, lbl in val_pairs:
    shutil.move(str(img), str(images_val / img.name))
    shutil.move(str(lbl), str(labels_val / lbl.name))

# Drop any previous split/path entries, then write the current ones.
for k in ["train", "val", "test", "path"]:
    data.pop(k, None)

data["path"] = str(dataset_dir)
data["train"] = "images/train"
data["val"] = "images/val"

# A mapping of names is rewritten as a list ordered by key, with nc set to
# its length.
if isinstance(data.get("names"), dict):
    data["names"] = [data["names"][i] for i in sorted(data["names"])]
    data["nc"] = len(data["names"])

with open(data_yaml, "w", encoding="utf-8") as f:
    yaml.dump(data, f, allow_unicode=True, sort_keys=False)

print(f"Train: {len(pairs) - moved_count}, Val: {val_count}")
print(f"path = {dataset_dir}")

Train: 687, Val: 76
path = /mnt/d/DataSets/whiteboard
