In [1]:
import os
from collections import defaultdict
import shutil
import random
from sklearn.model_selection import train_test_split

In [2]:
# --- Config ---
# Seed the stdlib RNG first so the random.sample() call used later is repeatable.
random.seed(42)

# Class subfolder names used under every dataset directory below.
classes = ["benign", "malignant", "normal"]

# Source directories; each one contains one subfolder per class.
real_dir = "data/BUSI"
augmented_dir = "data/LoRA_TI_ControlNet_refined"

# Destination root where the split train/val folders are written.
output_dataset_dir = "data/dataset_lora_ti_controlnet_refined"

# Fraction of the real images held out for validation (80/20 train/val split).
val_ratio = 0.2

# Number of training images each class is topped up to.
target_per_class = 350

In [3]:
def count_images_in_split(dataset_dir):
    """Print and return the number of image files per class in each split.

    Looks at ``<dataset_dir>/<split>/<class>`` for the splits ``train`` and
    ``val`` and the classes ``benign``, ``malignant`` and ``normal``. A file
    counts as an image when its lower-cased name ends with ``.png``, ``.jpg``
    or ``.jpeg``.

    Args:
        dataset_dir: Root directory that holds the ``train`` and ``val`` folders.

    Returns:
        A ``defaultdict(dict)`` mapping split name -> {class name -> image count}.

    Raises:
        OSError: propagated from ``os.listdir`` when a class folder cannot be
            listed (for example, it does not exist).
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    split_counts = defaultdict(dict)

    for split in ('train', 'val'):
        split_root = os.path.join(dataset_dir, split)
        print(f"\n📂 Split: {split}")
        per_class = split_counts[split]

        for cls in ('benign', 'malignant', 'normal'):
            entries = os.listdir(os.path.join(split_root, cls))
            # Case-insensitive suffix match; non-image files are ignored.
            num_images = sum(
                1 for name in entries if name.lower().endswith(image_suffixes)
            )
            per_class[cls] = num_images
            print(f"  - {cls:<10}: {num_images} images")

        total = sum(per_class.values())
        print(f"  ✅ Total in {split}: {total} images")

    return split_counts


In [4]:
# Number of real images assigned to the training split, keyed by class name.
train_class_counts = {}

# File extensions (matched case-insensitively) that are treated as images.
image_exts = ('.png', '.jpg', '.jpeg')

# Create output folders
for split in ['train', 'val']:
    for cls in classes:
        os.makedirs(os.path.join(output_dataset_dir, split, cls), exist_ok=True)

# Split and copy files
for cls in classes:
    class_dir = os.path.join(real_dir, cls)

    # os.listdir() returns entries in arbitrary order, and train_test_split
    # shuffles by position, so the listing is sorted to make the seeded split
    # identical on every run and machine. Files whose name contains '_mask'
    # are skipped (presumably segmentation masks -- confirm against the dataset).
    all_images = sorted(
        f for f in os.listdir(class_dir)
        if f.lower().endswith(image_exts) and '_mask' not in f
    )

    train_imgs, val_imgs = train_test_split(all_images, test_size=val_ratio, random_state=42)

    train_class_counts[cls] = len(train_imgs)

    # NOTE: the output folders are reused (exist_ok=True) and never cleared, so
    # files left by an earlier run with a different split remain in place.
    for split, imgs in (('train', train_imgs), ('val', val_imgs)):
        for img in imgs:
            src = os.path.join(class_dir, img)
            dst = os.path.join(output_dataset_dir, split, cls, img)
            shutil.copy(src, dst)

print(f"✅ Added real images to {output_dataset_dir}")

✅ Added real images to data/dataset_lora_ti_controlnet_refined


In [5]:
# Print and collect per-class image counts for the train/val folders under output_dataset_dir.
counts = count_images_in_split(output_dataset_dir)


📂 Split: train
  - benign    : 350 images
  - malignant : 350 images
  - normal    : 350 images
  ✅ Total in train: 1050 images

📂 Split: val
  - benign    : 88 images
  - malignant : 42 images
  - normal    : 27 images
  ✅ Total in val: 157 images


In [6]:
# Add synthetic images to fill the gap
# Same case-insensitive extension set used when splitting and counting images.
image_exts = ('.png', '.jpg', '.jpeg')

for cls in classes:
    output_cls_dir = os.path.join(output_dataset_dir, 'train', cls)
    augmented_cls_dir = os.path.join(augmented_dir, cls)

    # Everything already in the training folder counts toward the target,
    # including "gen_*" files copied by a previous run of this cell.
    existing = {f for f in os.listdir(output_cls_dir) if f.lower().endswith(image_exts)}
    real_count = len(existing)
    needed = target_per_class - real_count

    if needed <= 0:
        print(f"✅ {cls} is already balanced with {real_count} images.")
        continue

    # Sort because os.listdir() order is arbitrary and random.sample() picks by
    # position; skip sources whose "gen_" copy is already present so a re-run
    # adds new files instead of overwriting existing ones.
    augmented_imgs = sorted(
        f for f in os.listdir(augmented_cls_dir)
        if f.lower().endswith(image_exts) and f"gen_{f}" not in existing
    )

    # random.sample() raises ValueError when k exceeds the population size, so
    # cap the request and report the shortfall instead of aborting the cell.
    available = len(augmented_imgs)
    if available < needed:
        print(f"⚠️ Only {available} unused synthetic {cls} images available ({needed} needed).")
    selected_imgs = random.sample(augmented_imgs, k=min(needed, available))

    for img in selected_imgs:
        src = os.path.join(augmented_cls_dir, img)
        dst_name = f"gen_{img}"
        dst = os.path.join(output_cls_dir, dst_name)
        shutil.copy(src, dst)

    added = len(selected_imgs)
    print(f"✅ Added {added} synthetic {cls} images (now {real_count + added} total)")

✅ benign is already balanced with 350 images.
✅ malignant is already balanced with 350 images.
✅ normal is already balanced with 350 images.


In [7]:
# Re-count the train/val folders under output_dataset_dir after the synthetic top-up step.
counts = count_images_in_split(output_dataset_dir)


📂 Split: train
  - benign    : 350 images
  - malignant : 350 images
  - normal    : 350 images
  ✅ Total in train: 1050 images

📂 Split: val
  - benign    : 88 images
  - malignant : 42 images
  - normal    : 27 images
  ✅ Total in val: 157 images
