In [20]:
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import torch
import random

In [22]:
# Dataset locations used by every later cell.

# Main dataset folder; it holds one subfolder per emotion.
INPUT_FOLDER = "YOLO-OUTPUT"

# Destination folder for the preprocessed dataset.
OUTPUT_FOLDER = "YOLO-PREPROCESSED2"

In [23]:
# Train, Validation, Test Split Ratios
TRAIN_RATIO = 0.7
VALIDATION_RATIO = 0.2
TEST_RATIO = 0.1

# The hold-out sizes are computed from these three values, so they must
# describe a complete partition. Fail fast here instead of silently producing
# mis-sized splits after a typo (tolerance absorbs float rounding).
assert abs(TRAIN_RATIO + VALIDATION_RATIO + TEST_RATIO - 1.0) < 1e-9, (
    "TRAIN_RATIO + VALIDATION_RATIO + TEST_RATIO must sum to 1"
)

# Target size for resizing; every output image is resized to this size.
TARGET_SIZE = (224, 224)

In [24]:
# Per-class image targets for the training split.
# Happy, Neutral and Sad get a higher weight (depression-related classes).
_TRAIN_TARGETS = {
    "Anger": 1500,
    "Disgust": 1300,
    "Fear": 1400,
    "Happy": 2500,
    "Neutral": 2500,
    "Sad": 2500,
    "Surprise": 1500,
}

# Target sizes per split. The validation targets are 3/10 of the training
# targets and the test targets are 3/20 of them; integer arithmetic keeps the
# derived values exact (e.g. 1500 -> 450 -> 225).
TARGET_IMAGES = {
    "train": dict(_TRAIN_TARGETS),
    "validation": {
        emotion: count * 3 // 10 for emotion, count in _TRAIN_TARGETS.items()
    },
    "test": {
        emotion: count * 3 // 20 for emotion, count in _TRAIN_TARGETS.items()
    },
}

In [25]:
# Create output directories: one folder per (split, emotion) pair.
# Only sub-directories of INPUT_FOLDER are treated as emotion classes;
# os.listdir also returns plain files, which must not become class folders.
for split in ["train", "validation", "test"]:
    for emotion in os.listdir(INPUT_FOLDER):
        if not os.path.isdir(os.path.join(INPUT_FOLDER, emotion)):
            continue
        os.makedirs(os.path.join(OUTPUT_FOLDER, split, emotion), exist_ok=True)

In [26]:
# Random augmentation pipeline built from torchvision transforms.
# The image is resized to TARGET_SIZE first, then passed through a random
# horizontal flip, a random rotation (argument 10) and colour jitter.
_augmentation_steps = [
    transforms.Resize(TARGET_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
]
augmentation = transforms.Compose(_augmentation_steps)

# Pipeline for the original images: resize to TARGET_SIZE and nothing else.
resize_only = transforms.Compose([transforms.Resize(TARGET_SIZE)])

In [27]:
# Function to augment data until target size is met
def augment_class(images, target_size, folder_path, save_path, device,
                  max_consecutive_failures=20):
    """
    Augments the dataset to achieve the target size for the given class.

    Randomly augmented copies of the images listed in ``images`` are written
    to ``save_path`` until ``len(images)`` reaches ``target_size``. The function
    only ever adds images: if ``images`` already holds ``target_size`` or more
    entries nothing is written (the class is not down-sampled).

    Args:
        images: Mutable list of image paths for one class/split. The path of
            every augmented file written is appended to this list in place.
        target_size: Number of entries ``images`` should reach.
        folder_path: Unused; kept so existing callers keep working.
        save_path: Directory the augmented files are written to. It must
            already exist; this function does not create it.
        device: Torch device (or device string) the tensors are moved to
            before the transforms are applied.
        max_consecutive_failures: Give up after this many failed attempts in
            a row, so unreadable inputs cannot make the loop spin forever.

    Returns:
        None.
    """
    print(f"Balancing class in {save_path} to {target_size} images.")

    if len(images) >= target_size:
        return

    # Sample only from the images present at call time. Sampling from the
    # growing list would re-augment files that are already augmented.
    source_images = list(images)
    if not source_images:
        # random.choice([]) raises IndexError; inside the loop below that
        # error would be swallowed and the loop would never terminate.
        print(f"No source images available for {save_path}; nothing to augment.")
        return

    failures_in_a_row = 0
    while len(images) < target_size:
        # Pick a random image to augment
        image_path = random.choice(source_images)
        try:
            # Load the image; the context manager closes the file handle as
            # soon as the RGB copy has been made.
            with Image.open(image_path) as raw_img:
                img = raw_img.convert("RGB")
            img_tensor = transforms.ToTensor()(img).to(device)
            augmented_tensor = augmentation(img_tensor).to("cpu")
            augmented_img = transforms.ToPILImage()(augmented_tensor)

            # Save augmented image; len(images) grows by one per success, so
            # the numeric prefix keeps file names unique within this call.
            save_name = f"aug_{len(images)}_{Path(image_path).name}"
            augmented_file = os.path.join(save_path, save_name)
            augmented_img.save(augmented_file)
            images.append(augmented_file)
            failures_in_a_row = 0
        except Exception as e:
            print(f"Error augmenting {image_path}: {e}")
            failures_in_a_row += 1
            if failures_in_a_row >= max_consecutive_failures:
                print(
                    f"Giving up on {save_path} after {failures_in_a_row} "
                    f"consecutive failures ({len(images)}/{target_size} images)."
                )
                break

In [28]:
# Function to process and split dataset using GPU
def process_and_split():
    """
    Resizes, splits and augments every emotion class found in INPUT_FOLDER.

    For each sub-directory of INPUT_FOLDER (one per emotion):
      1. all ``*.jpg`` and ``*.png`` files below it are collected recursively,
      2. they are split into train / validation / test with a fixed
         ``random_state`` using TRAIN_RATIO, VALIDATION_RATIO and TEST_RATIO,
      3. each image is converted to RGB, resized with ``resize_only`` and saved
         under ``OUTPUT_FOLDER/<split>/<emotion>/`` with its original file name,
      4. ``augment_class`` is called to add augmented copies until the split
         reaches ``TARGET_IMAGES[split][emotion]`` entries.

    Entries that are not directories, emotions without a configured target and
    classes too small to split are skipped with a message instead of aborting
    the whole run.

    Returns:
        None. Results are written to disk.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    split_names = ["train", "validation", "test"]

    # Sorted so classes are processed in the same order on every filesystem.
    for emotion in sorted(os.listdir(INPUT_FOLDER)):
        emotion_path = os.path.join(INPUT_FOLDER, emotion)
        if not os.path.isdir(emotion_path):
            # os.listdir also returns plain files; they are not classes.
            continue

        # Check the configuration up front so a missing entry is reported
        # before any work is written, instead of a KeyError part-way through.
        unconfigured = [name for name in split_names if emotion not in TARGET_IMAGES[name]]
        if unconfigured:
            print(f"Skipping {emotion}: no target size configured for {unconfigured}")
            continue

        # Sort the file lists: the seeded split depends on input order, and
        # rglob's order is filesystem-dependent.
        class_dir = Path(emotion_path)
        images = sorted(class_dir.rglob("*.jpg")) + sorted(class_dir.rglob("*.png"))

        # Train, validation, test split
        try:
            train_images, temp_images = train_test_split(
                images, test_size=(1 - TRAIN_RATIO), random_state=42
            )
            val_images, test_images = train_test_split(
                temp_images,
                test_size=(TEST_RATIO / (TEST_RATIO + VALIDATION_RATIO)),
                random_state=42,
            )
        except ValueError as e:
            # train_test_split refuses inputs that would leave a split empty.
            print(f"Skipping {emotion}: cannot split {len(images)} images ({e})")
            continue

        # Save splits to respective folders and augment underrepresented classes
        # NOTE(review): the validation and test splits are augmented as well;
        # confirm that evaluating on augmented images is intended.
        for split, split_images in zip(split_names, [train_images, val_images, test_images]):
            save_path = os.path.join(OUTPUT_FOLDER, split, emotion)
            os.makedirs(save_path, exist_ok=True)

            # Only images that were actually written count towards the target
            # and are offered to the augmentation step as sources.
            processed_images = []

            # Resize and save original images
            # NOTE: files are saved under their base name only, so two source
            # files with the same name in different sub-folders overwrite
            # each other.
            for image_path in tqdm(split_images, desc=f"Processing {emotion} - {split}"):
                try:
                    with Image.open(image_path) as raw_img:
                        img = raw_img.convert("RGB")
                    img_tensor = transforms.ToTensor()(img).to(device)
                    resized_tensor = resize_only(img_tensor).to("cpu")
                    resized_img = transforms.ToPILImage()(resized_tensor)
                    resized_img.save(os.path.join(save_path, Path(image_path).name))
                    processed_images.append(image_path)
                except Exception as e:
                    print(f"Error processing {image_path}: {e}")

            # Augment images to balance the class
            augment_class(
                images=processed_images,
                target_size=TARGET_IMAGES[split][emotion],
                folder_path=emotion_path,
                save_path=save_path,
                device=device
            )

In [29]:
# Execute the function
# Seed the RNGs used during augmentation so that re-running this cell
# reproduces the same augmented dataset: `random` picks the source images and
# torch's generator drives the random torchvision transforms.
AUGMENTATION_SEED = 42
random.seed(AUGMENTATION_SEED)
torch.manual_seed(AUGMENTATION_SEED)

process_and_split()
print("Dataset successfully processed, balanced, and split!")

Using device: cuda


Processing Anger - train: 100%|██████████| 2060/2060 [00:09<00:00, 228.89it/s]


Balancing class in YOLO-PREPROCESSED2\train\Anger to 1500 images.


Processing Anger - validation: 100%|██████████| 588/588 [00:02<00:00, 247.04it/s]


Balancing class in YOLO-PREPROCESSED2\validation\Anger to 450 images.


Processing Anger - test: 100%|██████████| 295/295 [00:01<00:00, 227.21it/s]


Balancing class in YOLO-PREPROCESSED2\test\Anger to 225 images.


Processing Disgust - train: 100%|██████████| 1852/1852 [00:07<00:00, 248.09it/s]


Balancing class in YOLO-PREPROCESSED2\train\Disgust to 1300 images.


Processing Disgust - validation: 100%|██████████| 530/530 [00:02<00:00, 250.17it/s]


Balancing class in YOLO-PREPROCESSED2\validation\Disgust to 390 images.


Processing Disgust - test: 100%|██████████| 265/265 [00:01<00:00, 256.91it/s]


Balancing class in YOLO-PREPROCESSED2\test\Disgust to 195 images.


Processing Fear - train: 100%|██████████| 1964/1964 [00:07<00:00, 249.97it/s]


Balancing class in YOLO-PREPROCESSED2\train\Fear to 1400 images.


Processing Fear - validation: 100%|██████████| 561/561 [00:02<00:00, 251.24it/s]


Balancing class in YOLO-PREPROCESSED2\validation\Fear to 420 images.


Processing Fear - test: 100%|██████████| 281/281 [00:01<00:00, 252.03it/s]


Balancing class in YOLO-PREPROCESSED2\test\Fear to 210 images.


Processing Happy - train: 100%|██████████| 326/326 [00:01<00:00, 251.75it/s]


Balancing class in YOLO-PREPROCESSED2\train\Happy to 2500 images.


Processing Happy - validation: 100%|██████████| 93/93 [00:00<00:00, 245.38it/s]


Balancing class in YOLO-PREPROCESSED2\validation\Happy to 750 images.


Processing Happy - test: 100%|██████████| 47/47 [00:00<00:00, 231.53it/s]


Balancing class in YOLO-PREPROCESSED2\test\Happy to 375 images.


Processing Neutral - train: 100%|██████████| 724/724 [00:02<00:00, 274.65it/s]


Balancing class in YOLO-PREPROCESSED2\train\Neutral to 2500 images.


Processing Neutral - validation: 100%|██████████| 207/207 [00:00<00:00, 284.33it/s]


Balancing class in YOLO-PREPROCESSED2\validation\Neutral to 750 images.


Processing Neutral - test: 100%|██████████| 104/104 [00:00<00:00, 265.11it/s]


Balancing class in YOLO-PREPROCESSED2\test\Neutral to 375 images.


Processing Sad - train: 100%|██████████| 1581/1581 [00:06<00:00, 253.13it/s]


Balancing class in YOLO-PREPROCESSED2\train\Sad to 2500 images.


Processing Sad - validation: 100%|██████████| 452/452 [00:01<00:00, 280.98it/s]


Balancing class in YOLO-PREPROCESSED2\validation\Sad to 750 images.


Processing Sad - test: 100%|██████████| 226/226 [00:00<00:00, 276.90it/s]


Balancing class in YOLO-PREPROCESSED2\test\Sad to 375 images.


Processing Surprise - train: 100%|██████████| 2256/2256 [00:08<00:00, 260.56it/s]


Balancing class in YOLO-PREPROCESSED2\train\Surprise to 1500 images.


Processing Surprise - validation: 100%|██████████| 645/645 [00:02<00:00, 243.85it/s]


Balancing class in YOLO-PREPROCESSED2\validation\Surprise to 450 images.


Processing Surprise - test: 100%|██████████| 323/323 [00:01<00:00, 253.85it/s]

Balancing class in YOLO-PREPROCESSED2\test\Surprise to 225 images.
Dataset successfully processed, balanced, and split!



