In [1]:
import os
import shutil
import random
from tqdm import tqdm

In [2]:
# ⚠️ Set these paths manually before running
INPUT_DATASET_FOLDER = "./Unsplitted_Ready_Sets/set_01_class_balanced_augs_applied"  # Path to the full dataset
OUTPUT_DATASET_FOLDER = "./Unsplitted_Ready_Sets/set_01_class_balanced_augs_applied_splitted"  # Path to store train, val, test splits

# Emotion categories (Folder names must match)
EMOTION_CATEGORIES = ["Angry", "Disgust", "Fear", "Happy", "Neutral", "Sad", "Surprise"]

# Splitting Ratios
TRAIN_RATIO = 0.80  # 80% for training
VAL_RATIO = 0.10    # 10% for validation
TEST_RATIO = 0.10   # 10% for testing

# Fail fast on an inconsistent configuration. The test split receives whatever
# is left after the train and val shares are taken, so ratios that do not sum
# to 1 would otherwise silently yield a test share different from TEST_RATIO.
_split_ratios = (TRAIN_RATIO, VAL_RATIO, TEST_RATIO)
if any(ratio < 0 for ratio in _split_ratios) or abs(sum(_split_ratios) - 1.0) > 1e-9:
    raise ValueError(
        f"TRAIN_RATIO, VAL_RATIO and TEST_RATIO must be non-negative and sum to 1, got {_split_ratios}"
    )

# Set random seed for reproducibility
random.seed(42)

In [3]:
# Step 1: Create output directories
def initialize_output_folders(output_folder=None, categories=None):
    """Create the ``<output>/<split>/<category>`` directory tree.

    One folder is created for each of the ``train``, ``val`` and ``test``
    splits, and inside each of them one sub-folder per category. Calling the
    function again is harmless: directories that already exist are kept.

    Args:
        output_folder: Root directory of the split dataset. Defaults to the
            notebook-level ``OUTPUT_DATASET_FOLDER``.
        categories: Iterable of class folder names. Defaults to the
            notebook-level ``EMOTION_CATEGORIES``.
    """
    # Resolve defaults at call time so edits to the config cell are picked up.
    if output_folder is None:
        output_folder = OUTPUT_DATASET_FOLDER
    if categories is None:
        categories = EMOTION_CATEGORIES

    for split in ("train", "val", "test"):
        split_folder = os.path.join(output_folder, split)
        # exist_ok=True replaces the separate "exists?" check, which left a
        # window between the check and the creation. The split folder is
        # created explicitly so it exists even when there are no categories.
        os.makedirs(split_folder, exist_ok=True)

        # Create subfolders for each emotion class
        for emotion in categories:
            os.makedirs(os.path.join(split_folder, emotion), exist_ok=True)

In [4]:
# Step 2: Split dataset and distribute images
def split_dataset(input_folder=None, output_folder=None, categories=None, seed=42):
    """Copy every category's images into train / val / test sub-folders.

    For each category the image files (``.jpg``, ``.jpeg``, ``.png``,
    case-insensitive) are listed, sorted, shuffled and cut into three
    consecutive slices. The train and val slice sizes are
    ``int(n * TRAIN_RATIO)`` and ``int(n * VAL_RATIO)``; the test slice
    receives all remaining images. Files are copied, not moved, so the input
    dataset is left intact.

    The shuffle uses a private ``random.Random(seed)`` and a sorted listing, so
    the same input and seed always produce the same assignment regardless of
    directory listing order or of how many times the cell has been run.
    Re-running therefore overwrites the same files instead of leaking images
    across splits. Clear the output folder before re-splitting with a
    different seed or different ratios.

    Args:
        input_folder: Root of the unsplit dataset, one sub-folder per
            category. Defaults to the notebook-level ``INPUT_DATASET_FOLDER``.
        output_folder: Root of the split dataset. Defaults to the
            notebook-level ``OUTPUT_DATASET_FOLDER``.
        categories: Iterable of category folder names. Defaults to the
            notebook-level ``EMOTION_CATEGORIES``.
        seed: Seed for the per-call shuffle generator.
    """
    # Resolve defaults at call time so edits to the config cell are picked up.
    if input_folder is None:
        input_folder = INPUT_DATASET_FOLDER
    if output_folder is None:
        output_folder = OUTPUT_DATASET_FOLDER
    if categories is None:
        categories = EMOTION_CATEGORIES

    # Dedicated generator: independent of whatever consumed the global
    # random state before this call.
    rng = random.Random(seed)

    for emotion_category in categories:
        source_folder = os.path.join(input_folder, emotion_category)

        # Check if source folder exists
        if not os.path.isdir(source_folder):
            print(f"❌ Skipping {emotion_category}: No such folder in dataset.")
            continue

        # os.listdir returns names in arbitrary order; sort first so the
        # seeded shuffle starts from the same sequence on every machine.
        image_paths = sorted(
            os.path.join(source_folder, file_name)
            for file_name in os.listdir(source_folder)
            if file_name.lower().endswith((".jpg", ".jpeg", ".png"))
        )
        rng.shuffle(image_paths)

        # Compute split boundaries; test takes everything after val.
        total_images = len(image_paths)
        train_count = int(total_images * TRAIN_RATIO)
        val_count = int(total_images * VAL_RATIO)
        val_end = train_count + val_count

        images_by_split = {
            "train": image_paths[:train_count],
            "val": image_paths[train_count:val_end],
            "test": image_paths[val_end:],
        }

        # Copy images to respective directories
        for split, split_images in images_by_split.items():
            target_folder = os.path.join(output_folder, split, emotion_category)
            # Do not rely on the folder tree having been prepared beforehand.
            os.makedirs(target_folder, exist_ok=True)

            for img_path in tqdm(split_images, desc=f"Copying {emotion_category} -> {split}"):
                shutil.copy(img_path, os.path.join(target_folder, os.path.basename(img_path)))

    print("\n✅ Dataset splitting completed!")

In [5]:
# Run the script
if __name__ == "__main__":
    # Prepare the folder tree first, then fill it with the copied images.
    for pipeline_step in (initialize_output_folders, split_dataset):
        pipeline_step()

    print(f"\n✅ Splitted dataset is stored in: {OUTPUT_DATASET_FOLDER}")

Copying Angry -> train: 100%|██████████| 17716/17716 [12:41<00:00, 23.27it/s] 
Copying Angry -> val: 100%|██████████| 2214/2214 [01:20<00:00, 27.63it/s]
Copying Angry -> test: 100%|██████████| 2216/2216 [01:32<00:00, 23.98it/s]
Copying Disgust -> train: 100%|██████████| 17716/17716 [13:33<00:00, 21.79it/s] 
Copying Disgust -> val: 100%|██████████| 2214/2214 [01:40<00:00, 22.00it/s]
Copying Disgust -> test: 100%|██████████| 2216/2216 [01:33<00:00, 23.72it/s]
Copying Fear -> train: 100%|██████████| 17716/17716 [13:15<00:00, 22.26it/s] 
Copying Fear -> val: 100%|██████████| 2214/2214 [01:26<00:00, 25.66it/s]
Copying Fear -> test: 100%|██████████| 2216/2216 [01:24<00:00, 26.23it/s]
Copying Happy -> train: 100%|██████████| 17716/17716 [12:31<00:00, 23.58it/s]
Copying Happy -> val: 100%|██████████| 2214/2214 [01:19<00:00, 27.69it/s]
Copying Happy -> test: 100%|██████████| 2216/2216 [01:19<00:00, 27.89it/s]
Copying Neutral -> train: 100%|██████████| 17716/17716 [11:16<00:00, 26.17it/s]
Copyin


✅ Dataset splitting completed!

✅ Splitted dataset is stored in: ./Unsplitted_Ready_Sets/set_01_class_balanced_augs_applied_splitted



