In [1]:
!pip install -U albumentations


Collecting albumentations
  Downloading albumentations-2.0.1-py3-none-any.whl.metadata (38 kB)
Collecting albucore==0.0.23 (from albumentations)
  Downloading albucore-0.0.23-py3-none-any.whl.metadata (5.3 kB)
Collecting simsimd>=5.9.2 (from albucore==0.0.23->albumentations)
  Downloading simsimd-6.2.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading albumentations-2.0.1-py3-none-any.whl (276 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.8/276.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading albucore-0.0.23-py3-none-any.whl (14 kB)
Downloading simsimd-6.2.1-cp310-cp310-manylinux_2_28_x86_64.whl (632 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m632.7/632.7 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simsimd, albucore, albumentations
  Att

In [2]:
import os
from albumentations import (
    RandomBrightnessContrast,
    ShiftScaleRotate,
    GaussianBlur,
    Compose,
    HueSaturationValue
)
from PIL import Image, ImageOps
import numpy as np
import shutil
import random
import logging
from zipfile import ZipFile

# Setup logging
# Records are written to a file in the Kaggle working directory; filemode 'w'
# starts the file from scratch each time this configuration is applied.
log_file = "/kaggle/working/dataset_preparation.log"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_file,
    filemode='w',
)

def log_and_print(message, level=logging.INFO):
    """Send ``message`` to the logging system and echo it on stdout.

    Args:
        message: Text to emit.
        level: Level handed to ``logging.log`` (defaults to ``logging.INFO``).
            It only affects the log record; the message is printed regardless.
    """
    logging.log(level, message)  # log record at the requested level
    print(message)  # unconditional echo for the notebook output

# Augmentation Pipeline
# Shared, module-level pipeline used when synthesizing extra images.
augmentation_pipeline = Compose([
    # Brightness and contrast adjustment
    RandomBrightnessContrast(p=0.5),
    # Slight blur
    GaussianBlur(
        blur_limit=(3, 5),
        p=0.3,
    ),
    # Subtle shifts and scaling; rotate_limit=0 leaves rotation off
    ShiftScaleRotate(
        shift_limit=0.05,
        scale_limit=0.02,
        rotate_limit=0,
        p=0.7,
    ),
    # Hue, saturation and value shifts
    HueSaturationValue(
        hue_shift_limit=10,
        sat_shift_limit=15,
        val_shift_limit=10,
        p=0.5,
    ),
])

# Analyze dataset to determine class counts and augmentation needs
def analyze_dataset(dataset_dir):
    """
    Count the images in each class folder and report every class's shortfall
    relative to the largest class.

    Each immediate sub-directory of ``dataset_dir`` is treated as one class.
    Only file names ending in '.jpg' or '.png' (case-sensitive) are counted.
    A human-readable summary is emitted through ``log_and_print``.

    Args:
        dataset_dir (str): Path to the dataset directory.

    Returns:
        dict: A dictionary with the following keys:
              - class_counts: Number of images per class.
              - total_images: Total number of images in the dataset.
              - max_class_count: Maximum number of images in a class (0 if there are no classes).
              - augment_needed: Number of augmentations needed per class to balance.
    """
    # Collect the class folders first, in directory-listing order.
    class_paths = []
    for entry in os.listdir(dataset_dir):
        candidate = os.path.join(dataset_dir, entry)
        if os.path.isdir(candidate):
            class_paths.append(candidate)

    # Class name -> number of matching image files.
    class_counts = {
        os.path.basename(path): sum(
            1 for name in os.listdir(path) if name.endswith(('.jpg', '.png'))
        )
        for path in class_paths
    }

    total_images = sum(class_counts.values())
    max_class_count = max(class_counts.values(), default=0)

    # Shortfall of each class with respect to the largest one.
    augment_needed = {
        name: max_class_count - count for name, count in class_counts.items()
    }

    # Print summary
    summary_lines = ["Dataset Analysis:", "-----------------"]
    summary_lines.extend(
        f"Class '{name}': {count} images (Needs {augment_needed[name]} more to balance)"
        for name, count in class_counts.items()
    )
    summary_lines.append(f"\nTotal images in dataset: {total_images}")
    summary_lines.append(f"Largest class size: {max_class_count}")
    summary_lines.append(f"Total augmentations needed: {sum(augment_needed.values())}")
    for line in summary_lines:
        log_and_print(line)

    return dict(
        class_counts=class_counts,
        total_images=total_images,
        max_class_count=max_class_count,
        augment_needed=augment_needed,
    )

# Function to balance classes (prioritize original images, flip, and augment if needed)
def balance_classes(input_dir, output_dir, target_size):
    """
    Copy every class folder from ``input_dir`` to ``output_dir`` and top up
    under-sized classes with augmented images until they hold ``target_size`` files.

    For each class, all original '.jpg'/'.png' files are copied first. If the
    class has fewer than ``target_size`` images, the originals are cycled through
    in listing order; each pick is mirrored horizontally, passed through the
    module-level ``augmentation_pipeline`` and saved as ``aug_<i>_<original name>``.
    Classes that already have ``target_size`` or more images are copied unchanged
    (they are not down-sampled). A class with no source images cannot be
    augmented; it is reported with a warning and skipped.

    Args:
        input_dir (str): Directory containing one sub-directory per class.
        output_dir (str): Destination root; class sub-directories are created as needed.
        target_size (int): Number of images under-sized classes are topped up to.
    """
    class_dirs = [os.path.join(input_dir, d) for d in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, d))]

    for class_dir in class_dirs:
        class_name = os.path.basename(class_dir)
        output_class_dir = os.path.join(output_dir, class_name)
        os.makedirs(output_class_dir, exist_ok=True)

        images = [f for f in os.listdir(class_dir) if f.endswith(('.jpg', '.png'))]
        current_size = len(images)

        # Copy all original images first
        log_and_print(f"Copying original images for class '{class_name}' ({current_size} images)...")
        for img_name in images:
            img_path = os.path.join(class_dir, img_name)
            shutil.copy(img_path, os.path.join(output_class_dir, img_name))
            log_and_print(f"[Copied] {img_path} -> {output_class_dir}")

        # Augment if the class size is smaller than the target
        if current_size < target_size:
            if not images:
                # Without this guard `i % len(images)` below divides by zero.
                log_and_print(
                    f"Class '{class_name}' has no source images; skipping augmentation.",
                    logging.WARNING,
                )
                continue

            augment_needed = target_size - current_size
            log_and_print(f"Augmenting class '{class_name}' ({current_size}/{target_size}) with {augment_needed} images...")

            for i in range(augment_needed):
                # Cycle through the originals when more augmentations are needed than sources exist.
                img_name = images[i % len(images)]
                img_path = os.path.join(class_dir, img_name)

                # Apply horizontal flip; the context manager closes the source
                # file as soon as the mirrored copy has been produced.
                with Image.open(img_path) as img:
                    flipped_img = ImageOps.mirror(img)

                # Apply additional augmentations
                augmented = augmentation_pipeline(image=np.array(flipped_img))['image']

                # Save augmented image; the index keeps names unique when a source is reused
                augmented_img = Image.fromarray(augmented)
                augmented_img.save(os.path.join(output_class_dir, f"aug_{i}_{img_name}"))
                log_and_print(f"[Augmented] {img_name} (flipped) -> Saved in {output_class_dir}")

            log_and_print(f"Finished augmenting class '{class_name}'.")

        log_and_print(f"Class '{class_name}' balanced and saved to '{output_class_dir}'.")

# Create a ZIP file of the final dataset
def create_zip(output_dir, zip_file):
    """
    Pack every file under ``output_dir`` into a ZIP archive at ``zip_file``.

    Archive member names are paths relative to ``output_dir``. Directories and
    files are visited in sorted order so repeated runs give the same member
    order. If ``zip_file`` itself lies under ``output_dir`` it is skipped, so the
    archive never tries to add itself while it is being written.

    Args:
        output_dir (str): Root directory whose files are archived.
        zip_file (str): Path of the ZIP archive to create.
    """
    log_and_print(f"Creating ZIP file at {zip_file}...")
    zip_abspath = os.path.abspath(zip_file)
    with ZipFile(zip_file, 'w') as zipf:
        for root, dirs, files in os.walk(output_dir):
            # Sorting in place steers the order in which os.walk descends.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == zip_abspath:
                    # The archive being written must not be added to itself.
                    continue
                arcname = os.path.relpath(file_path, output_dir)
                zipf.write(file_path, arcname)
    log_and_print(f"ZIP file created: {zip_file}")

# Main pipeline
def main():
    """
    Run the whole preparation: analyze, balance and zip the Train/Test splits.

    Both splits are analyzed first. Each split's target class size is the
    midpoint (rounded down) between its smallest and largest class. Classes
    below that target are topped up by ``balance_classes``, and everything
    written under ``OUTPUT_DIR`` is finally archived to ``ZIP_FILE``.

    Raises:
        ValueError: If a split directory contains no class sub-directories.
    """
    INPUT_DIR = "/kaggle/input/cropped-dataset-clean/cropped_dataset (())"
    OUTPUT_DIR = "/kaggle/working/final_dataset"
    ZIP_FILE = "/kaggle/working/final_dataset.zip"
    SPLITS = ("Train", "Test")

    # Analyze train and test datasets separately
    target_sizes = {}
    for split in SPLITS:
        split_dir = os.path.join(INPUT_DIR, split)
        log_and_print(f"Analyzing {split} Dataset...")
        analysis = analyze_dataset(split_dir)

        class_counts = analysis["class_counts"]
        if not class_counts:
            # min() on an empty sequence would raise an opaque ValueError.
            raise ValueError(f"No class folders found in '{split_dir}'")

        # Get target size based on the analysis (midpoint)
        target_sizes[split] = (analysis["max_class_count"] + min(class_counts.values())) // 2

    # Balance each split
    # NOTE(review): the Test split is augmented as well — confirm that
    # evaluating on synthesized test images is intended.
    for split in SPLITS:
        target_size = target_sizes[split]
        log_and_print(f"Balancing {split} Dataset with target size {target_size}...")
        balance_classes(os.path.join(INPUT_DIR, split), os.path.join(OUTPUT_DIR, split), target_size)

    # Create ZIP file for the final dataset
    log_and_print("Zipping final dataset...")
    create_zip(OUTPUT_DIR, ZIP_FILE)

    log_and_print("Dataset preparation complete.")

if __name__ == "__main__":
    main()


  original_init(self, **validated_kwargs)


Analyzing Train Dataset...
Dataset Analysis:
-----------------
Class 'surprise': 1088 images (Needs 1242 more to balance)
Class 'fear': 1559 images (Needs 771 more to balance)
Class 'neutral': 1846 images (Needs 484 more to balance)
Class 'sad': 1599 images (Needs 731 more to balance)
Class 'happy': 2330 images (Needs 0 more to balance)
Class 'anger': 1469 images (Needs 861 more to balance)

Total images in dataset: 9891
Largest class size: 2330
Total augmentations needed: 4089
Analyzing Test Dataset...
Dataset Analysis:
-----------------
Class 'surprise': 493 images (Needs 499 more to balance)
Class 'fear': 702 images (Needs 290 more to balance)
Class 'neutral': 798 images (Needs 194 more to balance)
Class 'sad': 802 images (Needs 190 more to balance)
Class 'happy': 992 images (Needs 0 more to balance)
Class 'anger': 619 images (Needs 373 more to balance)

Total images in dataset: 4406
Largest class size: 992
Total augmentations needed: 1546
Balancing Train Dataset with target size 17

In [3]:
# Main pipeline
# NOTE(review): this cell defines and runs main() again; an earlier cell in this
# notebook also defines it, and whichever definition is executed last wins.
# Consider keeping a single definition.
def main():
    """
    Run the whole preparation: analyze, balance and zip the Train/Test splits.

    Both splits are analyzed first. Each split's target class size is the
    midpoint (rounded down) between its smallest and largest class. Classes
    below that target are topped up by ``balance_classes``, and everything
    written under ``OUTPUT_DIR`` is finally archived to ``ZIP_FILE``.

    Raises:
        ValueError: If a split directory contains no class sub-directories.
    """
    INPUT_DIR = "/kaggle/input/cropped-dataset-clean/cropped_dataset (())"
    OUTPUT_DIR = "/kaggle/working/final_dataset"
    ZIP_FILE = "/kaggle/working/final_dataset.zip"
    SPLITS = ("Train", "Test")

    # Analyze train and test datasets separately
    target_sizes = {}
    for split in SPLITS:
        split_dir = os.path.join(INPUT_DIR, split)
        log_and_print(f"Analyzing {split} Dataset...")
        analysis = analyze_dataset(split_dir)

        class_counts = analysis["class_counts"]
        if not class_counts:
            # min() on an empty sequence would raise an opaque ValueError.
            raise ValueError(f"No class folders found in '{split_dir}'")

        # Get target size based on the analysis (midpoint)
        target_sizes[split] = (analysis["max_class_count"] + min(class_counts.values())) // 2

    # Balance each split
    # NOTE(review): the Test split is augmented as well — confirm that
    # evaluating on synthesized test images is intended.
    for split in SPLITS:
        target_size = target_sizes[split]
        log_and_print(f"Balancing {split} Dataset with target size {target_size}...")
        balance_classes(os.path.join(INPUT_DIR, split), os.path.join(OUTPUT_DIR, split), target_size)

    # Create ZIP file for the final dataset
    log_and_print("Zipping final dataset...")
    create_zip(OUTPUT_DIR, ZIP_FILE)

    log_and_print("Dataset preparation complete.")

if __name__ == "__main__":
    main()


Analyzing Train Dataset...
Dataset Analysis:
-----------------
Class 'surprise': 1088 images (Needs 1242 more to balance)
Class 'fear': 1559 images (Needs 771 more to balance)
Class 'neutral': 1846 images (Needs 484 more to balance)
Class 'sad': 1599 images (Needs 731 more to balance)
Class 'happy': 2330 images (Needs 0 more to balance)
Class 'anger': 1469 images (Needs 861 more to balance)

Total images in dataset: 9891
Largest class size: 2330
Total augmentations needed: 4089
Analyzing Test Dataset...
Dataset Analysis:
-----------------
Class 'surprise': 493 images (Needs 499 more to balance)
Class 'fear': 702 images (Needs 290 more to balance)
Class 'neutral': 798 images (Needs 194 more to balance)
Class 'sad': 802 images (Needs 190 more to balance)
Class 'happy': 992 images (Needs 0 more to balance)
Class 'anger': 619 images (Needs 373 more to balance)

Total images in dataset: 4406
Largest class size: 992
Total augmentations needed: 1546
Balancing Train Dataset with target size 17