In [1]:
!pip install -U albumentations

Collecting albumentations
  Downloading albumentations-2.0.1-py3-none-any.whl.metadata (38 kB)
Collecting albucore==0.0.23 (from albumentations)
  Downloading albucore-0.0.23-py3-none-any.whl.metadata (5.3 kB)
Collecting simsimd>=5.9.2 (from albucore==0.0.23->albumentations)
  Downloading simsimd-6.2.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Downloading albumentations-2.0.1-py3-none-any.whl (276 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.8/276.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading albucore-0.0.23-py3-none-any.whl (14 kB)
Downloading simsimd-6.2.1-cp310-cp310-manylinux_2_28_x86_64.whl (632 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m632.7/632.7 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simsimd, albucore, albumentations
  Att

In [2]:
import os
import shutil
from albumentations import (
    RandomBrightnessContrast,
    ShiftScaleRotate,
    GaussianBlur,
    Compose,
    HueSaturationValue
)
from PIL import Image, ImageOps
import numpy as np
from zipfile import ZipFile

# Augmentation pipeline applied to mirrored copies of existing images when a
# class needs extra samples (see balance_classes).
# Each transform's `p` is its probability of being applied on a given call, so
# the output is random; no seed is set here, so results differ between runs.
# NOTE(review): ShiftScaleRotate appears to be deprecated in albumentations 2.x
# in favour of Affine -- confirm before upgrading.
augmentation_pipeline = Compose([
    RandomBrightnessContrast(p=0.5),
    GaussianBlur(blur_limit=(3, 5), p=0.3),
    # rotate_limit=0: only small shifts and rescales, no rotation.
    ShiftScaleRotate(shift_limit=0.05, scale_limit=0.02, rotate_limit=0, p=0.7),
    HueSaturationValue(hue_shift_limit=10, sat_shift_limit=15, val_shift_limit=10, p=0.5)
])

# Ensure target size is the largest class size
def get_largest_class_size(dataset_dir):
    """Return the number of images in the most populated class folder.

    Every sub-directory of ``dataset_dir`` is treated as one class. Only files
    with a ``.jpg``, ``.png`` or ``.jpeg`` extension (case-insensitive) are
    counted, so stray files such as ``Thumbs.db`` or notes do not inflate the
    balancing target.

    Args:
        dataset_dir: Directory containing one sub-directory per class.

    Returns:
        The largest per-class image count, or 0 if there are no class folders.
    """
    image_exts = ('.jpg', '.png', '.jpeg')
    largest_size = 0
    for class_name in os.listdir(dataset_dir):
        class_dir = os.path.join(dataset_dir, class_name)
        if not os.path.isdir(class_dir):
            # Plain files next to the class folders are not classes.
            continue
        image_count = sum(
            1 for entry in os.listdir(class_dir)
            if entry.lower().endswith(image_exts)
        )
        largest_size = max(largest_size, image_count)
    return largest_size

# Balance classes
def balance_classes(input_dir, output_dir, target_size):
    """Copy every class folder to ``output_dir`` and pad small classes with augmented images.

    For each class sub-directory of ``input_dir`` all image files are copied
    unchanged. If the class has fewer than ``target_size`` images, additional
    samples are generated by mirroring existing images horizontally and passing
    them through ``augmentation_pipeline``; they are saved as
    ``aug_<i>_<original name>``.

    Args:
        input_dir: Directory containing one sub-directory per class.
        output_dir: Destination root; one sub-directory per class is created.
        target_size: Number of images each class should contain afterwards.
            Classes that already have at least this many images are only copied.
    """
    image_exts = ('.jpg', '.png', '.jpeg')

    # Sorted so the choice of augmentation sources does not depend on the
    # arbitrary order returned by os.listdir.
    for class_name in sorted(os.listdir(input_dir)):
        class_dir = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_dir):
            # Skip stray files *before* creating anything in the output tree.
            continue

        output_class_dir = os.path.join(output_dir, class_name)
        os.makedirs(output_class_dir, exist_ok=True)

        images = sorted(
            f for f in os.listdir(class_dir) if f.lower().endswith(image_exts)
        )

        # Copy existing images
        for img_name in images:
            shutil.copy(os.path.join(class_dir, img_name), os.path.join(output_class_dir, img_name))

        # Augment if necessary
        augment_needed = target_size - len(images)
        if augment_needed <= 0:
            continue
        if not images:
            # Nothing to augment from; previously this raised ZeroDivisionError.
            print(f"Skipping augmentation for class '{class_name}': no source images found")
            continue

        for i in range(augment_needed):
            # Cycle through the originals so augmentations are spread evenly.
            img_name = images[i % len(images)]
            img_path = os.path.join(class_dir, img_name)
            # The context manager releases the file handle after each image.
            with Image.open(img_path) as img:
                # Palette/alpha/CMYK images break the colour transform or the
                # JPEG save; RGB and greyscale are passed through untouched.
                source = img if img.mode in ("RGB", "L") else img.convert("RGB")
                flipped_img = ImageOps.mirror(source)  # Horizontal flip
            augmented = augmentation_pipeline(image=np.array(flipped_img))['image']
            augmented_img = Image.fromarray(augmented)
            augmented_img.save(os.path.join(output_class_dir, f"aug_{i}_{img_name}"))

# Compress final dataset
def create_zip(output_dir, zip_path):
    """Write every file under ``output_dir`` into a ZIP archive at ``zip_path``.

    Entries are stored with paths relative to ``output_dir`` and added in
    sorted order so the archive layout is reproducible. If ``zip_path`` itself
    lies inside ``output_dir`` it is skipped, so the archive never tries to
    include its own partially written file.

    Args:
        output_dir: Root directory whose files are archived.
        zip_path: Path of the ZIP file to create (overwritten if it exists).
    """
    archive_abs = os.path.abspath(zip_path)
    with ZipFile(zip_path, 'w') as zipf:
        for root, dirs, files in os.walk(output_dir):
            dirs.sort()  # in-place sort steers os.walk's traversal order
            for file in sorted(files):
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == archive_abs:
                    continue
                arcname = os.path.relpath(file_path, output_dir)
                zipf.write(file_path, arcname)

# Main workflow
def main():
    """Balance the Train and Test splits of the dataset and bundle the result as a ZIP.

    The balancing target of each split is the size of its largest class as
    reported by ``get_largest_class_size``. Both targets are computed before
    any files are written.
    """
    INPUT_DIR = "/kaggle/input/cropped-dataset-clean/cropped_dataset (())"
    OUTPUT_DIR = "/kaggle/working/final_balanced_dataset"
    ZIP_FILE = "/kaggle/working/final_balanced_dataset.zip"

    splits = ("Train", "Test")

    # Target size per split = size of that split's largest class
    targets = {
        split: get_largest_class_size(os.path.join(INPUT_DIR, split))
        for split in splits
    }

    # Balance each split into its own sub-folder of the output directory
    for split in splits:
        source_dir = os.path.join(INPUT_DIR, split)
        destination_dir = os.path.join(OUTPUT_DIR, split)
        balance_classes(source_dir, destination_dir, targets[split])

    # Package the balanced dataset
    create_zip(OUTPUT_DIR, ZIP_FILE)
    print(f"Dataset balanced and saved as ZIP: {ZIP_FILE}")

# Run the whole workflow when this cell is executed (or the code is run as a script).
if __name__ == "__main__":
    main()


  original_init(self, **validated_kwargs)


Dataset balanced and saved as ZIP: /kaggle/working/final_balanced_dataset.zip
