In [19]:
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import torch

In [20]:
# Define input and output paths
# Both paths are relative, so they resolve against the notebook's working directory.
# Source root: every entry directly inside it is used as one emotion class name.
INPUT_FOLDER = "YOLO-OUTPUT"  # Path to the main dataset folder with subfolders for each emotion
# Destination root: images are written to <OUTPUT_FOLDER>/<split>/<emotion>/<file>.
OUTPUT_FOLDER = "YOLO-PREPROCESSED"  # Path to the output folder

In [21]:
# Train, Validation, Test Split Ratios
TRAIN_RATIO = 0.7
VALIDATION_RATIO = 0.2
TEST_RATIO = 0.1

# The splitting code first holds out (1 - TRAIN_RATIO) of each class and then
# divides that remainder by TEST_RATIO / (TEST_RATIO + VALIDATION_RATIO).
# That only reproduces the ratios above when each one is a proper fraction and
# the three together cover the whole dataset, so fail early on a bad config.
assert all(0 < ratio < 1 for ratio in (TRAIN_RATIO, VALIDATION_RATIO, TEST_RATIO)), (
    "TRAIN_RATIO, VALIDATION_RATIO and TEST_RATIO must each be strictly between 0 and 1"
)
assert abs(TRAIN_RATIO + VALIDATION_RATIO + TEST_RATIO - 1.0) < 1e-9, (
    "TRAIN_RATIO, VALIDATION_RATIO and TEST_RATIO must sum to 1"
)

# Target size for resizing (passed unchanged to transforms.Resize)
TARGET_SIZE = (224, 224)

In [22]:
# Create output directories: one folder per (split, emotion) pair.
# os.listdir also returns plain files (e.g. .DS_Store, Thumbs.db), so only
# sub-directories of INPUT_FOLDER are treated as emotion classes; otherwise
# bogus class folders would be created under every split.
for split in ["train", "validation", "test"]:
    for emotion in os.listdir(INPUT_FOLDER):
        if not os.path.isdir(os.path.join(INPUT_FOLDER, emotion)):
            continue
        os.makedirs(os.path.join(OUTPUT_FOLDER, split, emotion), exist_ok=True)

In [23]:
# Transform pipelines built from torchvision transforms.

# Applied to training images: resize to TARGET_SIZE, then a random horizontal
# flip, a random rotation of up to 10 degrees, and random colour jitter.
augmentation = transforms.Compose(
    [
        transforms.Resize(size=TARGET_SIZE),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(
            brightness=0.2,
            contrast=0.2,
            saturation=0.2,
            hue=0.1,
        ),
    ]
)

# Applied to validation and test images: resize to TARGET_SIZE, nothing random.
resize_only = transforms.Compose([transforms.Resize(size=TARGET_SIZE)])

In [24]:
# Function to process and split dataset using GPU

# File suffixes (compared case-insensitively) that are treated as images.
_IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}


def _list_images(class_dir):
    """Return every image file below ``class_dir`` (recursively), sorted by path."""
    return sorted(
        path
        for path in Path(class_dir).rglob("*")
        if path.is_file() and path.suffix.lower() in _IMAGE_SUFFIXES
    )


def _flat_name(image_path, class_dir):
    """Return an output file name for ``image_path`` that is unique within its class.

    Files directly inside ``class_dir`` keep their name; files in nested
    folders get their relative path joined with ``__`` so that two images with
    the same base name in different sub-folders do not overwrite each other.
    """
    return "__".join(image_path.relative_to(class_dir).parts)


def process_and_split():
    """Split every emotion class into train/validation/test and write resized copies.

    For each sub-directory of ``INPUT_FOLDER`` the images are collected
    recursively, split with ``train_test_split`` (``random_state=42``) according
    to ``TRAIN_RATIO``, ``VALIDATION_RATIO`` and ``TEST_RATIO``, transformed, and
    saved under ``OUTPUT_FOLDER/<split>/<emotion>/``.

    Training images go through ``augmentation``; validation and test images go
    through ``resize_only``. Each training image is written exactly once, so
    the saved file is a single randomly augmented version of the original, not
    an additional copy.

    The transforms run on CUDA when it is available, otherwise on the CPU.

    Returns:
        None. Results are written to disk; progress and errors are printed.

    Notes:
        * Entries of ``INPUT_FOLDER`` that are not directories are ignored.
        * A class with too few images to fill all three splits is skipped with
          a message instead of aborting the remaining classes.
        * A failure on a single image is reported and that image is skipped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # These converters are stateless, so build them once instead of per image.
    to_tensor = transforms.ToTensor()
    to_pil = transforms.ToPILImage()

    # Sorted so classes are processed in a stable order on every filesystem.
    for emotion in sorted(os.listdir(INPUT_FOLDER)):
        emotion_path = os.path.join(INPUT_FOLDER, emotion)
        if not os.path.isdir(emotion_path):
            # Stray files (e.g. .DS_Store) are not emotion classes.
            continue

        # Sorted input + fixed random_state => the same split on every run and
        # machine; directory listing order is otherwise filesystem-dependent.
        images = _list_images(emotion_path)

        # Train, validation, test split
        try:
            train_images, temp_images = train_test_split(
                images, test_size=(1 - TRAIN_RATIO), random_state=42
            )
            val_images, test_images = train_test_split(
                temp_images,
                test_size=(TEST_RATIO / (TEST_RATIO + VALIDATION_RATIO)),
                random_state=42,
            )
        except ValueError as e:
            # train_test_split raises when a resulting subset would be empty
            # (empty or very small class). Skip this class, keep the others.
            print(f"Skipping {emotion}: cannot split {len(images)} image(s): {e}")
            continue

        # Save splits to respective folders
        for split, split_images in zip(
            ["train", "validation", "test"], [train_images, val_images, test_images]
        ):
            split_dir = os.path.join(OUTPUT_FOLDER, split, emotion)
            # Harmless if the folder already exists; makes the function usable
            # even when the directory-creation cell has not been run.
            os.makedirs(split_dir, exist_ok=True)

            # Apply augmentation for training; resize only for others
            transform = augmentation if split == "train" else resize_only

            for image_path in tqdm(split_images, desc=f"Processing {emotion} - {split}"):
                try:
                    # Open image; convert() returns an independent copy, so the
                    # underlying file can be closed right away.
                    with Image.open(image_path) as source:
                        img = source.convert("RGB")

                    # Convert PIL image to tensor and move to the chosen device
                    img_tensor = to_tensor(img).to(device)
                    img_tensor = transform(img_tensor)

                    # Move back to CPU and convert to PIL image
                    img = to_pil(img_tensor.to("cpu"))

                    # Save image to corresponding split folder
                    save_path = os.path.join(split_dir, _flat_name(image_path, emotion_path))
                    img.save(save_path)
                except Exception as e:
                    # Best effort: report the broken image and carry on.
                    print(f"Error processing {image_path}: {e}")

In [25]:
# Execute the function
# Runs the whole split + transform pass over INPUT_FOLDER and writes the
# results below OUTPUT_FOLDER.
process_and_split()
# NOTE(review): this message is printed unconditionally. process_and_split
# catches per-image exceptions and only prints "Error processing ..." lines,
# so check the output above for such lines before trusting the result.
print("Dataset successfully processed and split!")

Using device: cuda


Processing Anger - train: 100%|██████████| 2060/2060 [00:18<00:00, 112.24it/s]
Processing Anger - validation: 100%|██████████| 588/588 [00:02<00:00, 278.96it/s]
Processing Anger - test: 100%|██████████| 295/295 [00:01<00:00, 281.60it/s]
Processing Disgust - train: 100%|██████████| 1852/1852 [00:15<00:00, 122.99it/s]
Processing Disgust - validation: 100%|██████████| 530/530 [00:01<00:00, 301.80it/s]
Processing Disgust - test: 100%|██████████| 265/265 [00:00<00:00, 281.36it/s]
Processing Fear - train: 100%|██████████| 1964/1964 [00:16<00:00, 122.59it/s]
Processing Fear - validation: 100%|██████████| 561/561 [00:03<00:00, 141.45it/s]
Processing Fear - test: 100%|██████████| 281/281 [00:02<00:00, 110.42it/s]
Processing Happy - train: 100%|██████████| 326/326 [00:04<00:00, 65.54it/s]
Processing Happy - validation: 100%|██████████| 93/93 [00:00<00:00, 109.15it/s]
Processing Happy - test: 100%|██████████| 47/47 [00:00<00:00, 110.40it/s]
Processing Neutral - train: 100%|██████████| 724/724 [00

Dataset successfully processed and split!



