# Processing the images to get more data for each celebrity:

Packages needed:

pip install numpy

pip install tensorflow

pip install scipy

pip install pillow

In [8]:
import os
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img

# Set paths
base_path = "/workspace/DS4002Project3/DATA/celebrities"  # Original dataset path
output_path = "/workspace/DS4002Project3/DATA/celebrities_extra"  # New folder to save augmented images

# Number of augmented variants written for every original image.
# Each celebrity folder therefore ends up with
# len(originals) * (1 + AUGMENTATIONS_PER_IMAGE) files.
AUGMENTATIONS_PER_IMAGE = 3

# Start from an empty output tree so files from a previous run never mix with
# the freshly numbered ones.
if os.path.exists(output_path):
    shutil.rmtree(output_path)  # Delete the entire folder and its contents
os.makedirs(output_path, exist_ok=True)  # Recreate the empty folder

# Random augmentation settings applied to every generated variant
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Loop through each celebrity folder; for every original image write the
# original followed by its AUGMENTATIONS_PER_IMAGE augmented variants.
# Sorting makes the processing order (and thus the numbering) reproducible,
# because os.listdir() returns entries in arbitrary order.
for celeb in sorted(os.listdir(base_path)):
    celeb_folder = os.path.join(base_path, celeb)
    if not os.path.isdir(celeb_folder):
        # Stray files (e.g. .DS_Store) are not celebrity folders; skip them
        # instead of crashing on os.listdir() below.
        continue

    save_folder = os.path.join(output_path, celeb)
    os.makedirs(save_folder, exist_ok=True)  # Create sub-folder for each celebrity

    # Accept .jpg/.png regardless of letter case.
    images = sorted(
        name for name in os.listdir(celeb_folder)
        if name.lower().endswith(('.jpg', '.png'))
    )

    # Output files are numbered consecutively starting at 001: each original
    # is immediately followed by its augmented variants.
    image_index = 1

    print(f"Processing {celeb}...")  # Debugging print statement

    for image_name in images:
        print(f"Processing image: {image_name}")  # Debugging print statement
        img_path = os.path.join(celeb_folder, image_name)
        img = load_img(img_path)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)  # Reshape to (1, height, width, channels)

        # Save the original image under the next free index
        original_image_path = os.path.join(save_folder, f"{image_index:03}.jpg")
        img.save(original_image_path)
        image_index += 1

        # datagen.flow() yields batches forever, so zip() with a range bounds
        # it to exactly AUGMENTATIONS_PER_IMAGE variants. No save_to_dir is
        # passed: the images are saved explicitly below, so letting Keras
        # write its own auto-named copies would only duplicate every file.
        for _, batch in zip(range(AUGMENTATIONS_PER_IMAGE),
                            datagen.flow(x, batch_size=1)):
            new_image_path = os.path.join(save_folder, f"{image_index:03}.jpg")
            array_to_img(batch[0]).save(new_image_path)
            image_index += 1

    # Sanity check: the expected count is derived from what was actually
    # processed rather than hard-coded, so no generated image is discarded to
    # fit a fixed total.
    expected_count = len(images) * (1 + AUGMENTATIONS_PER_IMAGE)
    actual_count = len(os.listdir(save_folder))
    if actual_count != expected_count:
        print(f"Warning: Expected {expected_count} images for {celeb}, but got {actual_count} images.")



Processing America-Ferrera...
Processing image: 01.jpg
Processing image: 02.jpg
Processing image: 03.jpg
Processing image: 04.jpg
Processing image: 05.jpg
Processing image: 06.jpg
Processing image: 07.jpg
Processing image: 08.jpg
Processing image: 09.jpg
Processing image: 10.jpg
Processing image: 100.jpg
Processing image: 11.jpg
Processing image: 12.jpg
Processing image: 13.jpg
Processing image: 14.jpg
Processing image: 15.jpg
Processing image: 16.jpg
Processing image: 17.jpg
Processing image: 18.jpg
Processing image: 19.jpg
Processing image: 20.jpg
Processing image: 21.jpg
Processing image: 22.jpg
Processing image: 23.jpg
Processing image: 24.jpg
Processing image: 25.jpg
Processing image: 26.jpg
Processing image: 27.jpg
Processing image: 28.jpg
Processing image: 29.jpg
Processing image: 30.jpg
Processing image: 31.jpg
Processing image: 32.jpg
Processing image: 33.jpg
Processing image: 34.jpg
Processing image: 35.jpg
Processing image: 36.jpg
Processing image: 37.jpg
Processing image: 3