In [1]:
import os
import shutil
import random
from pathlib import Path

def split_dataset(src_dir, train_dir, test_dir, test_ratio=0.2, seed=42):
    """Copy a class-per-folder dataset into separate train and test trees.

    Each immediate subdirectory of ``src_dir`` is treated as one class. The
    regular files directly inside it are shuffled and split; the first
    ``int(n * test_ratio)`` go to ``test_dir/<class>`` and the rest to
    ``train_dir/<class>``. Files are copied with ``shutil.copy2``, so the
    source tree is left untouched. One summary line per class is printed.

    Args:
        src_dir: Directory containing one subdirectory per class.
        train_dir: Destination root for the training split (created if
            missing).
        test_dir: Destination root for the test split (created if missing).
        test_ratio: Fraction of each class to place in the test split; must
            lie in ``[0, 1]``. The per-class count is truncated, so small
            classes may contribute no test files.
        seed: Seed for the shuffle. The same seed and the same source
            contents yield the same split.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            f"test_ratio must be between 0 and 1, got {test_ratio!r}"
        )

    # A private generator keeps the split reproducible without reseeding the
    # process-wide ``random`` state as a side effect.
    rng = random.Random(seed)

    src_dir = Path(src_dir)
    train_dir = Path(train_dir)
    test_dir = Path(test_dir)

    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir.mkdir(parents=True, exist_ok=True)

    # Directory listing order is filesystem-dependent; sort so that a given
    # seed always produces the same split.
    for class_folder in sorted(os.listdir(src_dir)):
        class_path = src_dir / class_folder
        if not class_path.is_dir():
            continue

        # Keep regular files only: copy2 cannot copy a directory, and a
        # nested folder is not a sample. Sorted for the same reason as above.
        images = sorted(p for p in class_path.glob("*") if p.is_file())
        if not images:
            continue

        rng.shuffle(images)
        num_test = int(len(images) * test_ratio)

        test_images = images[:num_test]
        train_images = images[num_test:]

        # Create target folders
        train_class_dir = train_dir / class_folder
        test_class_dir = test_dir / class_folder
        train_class_dir.mkdir(parents=True, exist_ok=True)
        test_class_dir.mkdir(parents=True, exist_ok=True)

        # Copy (not move) each file into its split, preserving metadata.
        for subset, target_dir in (
            (train_images, train_class_dir),
            (test_images, test_class_dir),
        ):
            for img_path in subset:
                shutil.copy2(img_path, target_dir / img_path.name)

        print(f"{class_folder}: {len(train_images)} train, {len(test_images)} test")

# Example usage: split final_data/ into train_data/ and test_data/, holding
# out a 0.2 fraction of each class folder for testing.
split_dataset(
    src_dir="final_data",
    train_dir="train_data",
    test_dir="test_data",
    test_ratio=0.2,
)


Normal: 159 train, 39 test
Osteopenia: 186 train, 46 test
Osteoporosis: 114 train, 28 test
