# Processing the images to get more data for each celebrity:

Packages needed:

pip install numpy

pip install tensorflow

pip install scipy

pip install pillow

In [10]:
import os
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img

# Set paths
base_path = "/workspace/DS4002Project3/DATA/celebrities"  # Original dataset path
output_path = "/workspace/DS4002Project3/DATA/celebrities_extra"  # New folder to save augmented images

# Output layout per celebrity folder (N = number of source images, normally 100):
#   001.jpg .. N.jpg      -> copies of the originals, re-encoded as JPEG
#   (N+1).jpg .. 3N.jpg   -> augmented images, AUGMENTED_PER_IMAGE per original
# Each original therefore ends up in 3 versions (itself + 2 augmentations),
# which gives the 100 + 200 = 300 files per celebrity that the final check expects.
AUGMENTED_PER_IMAGE = 2
EXPECTED_IMAGES_PER_CELEB = 300
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Clear the celebrities_extra folder if it exists so every run starts from an
# empty tree; this is what guarantees no stale files survive from earlier runs.
if os.path.exists(output_path):
    shutil.rmtree(output_path)  # Delete the entire folder and its contents
os.makedirs(output_path, exist_ok=True)  # Recreate the empty folder

# Random transformations applied to produce each augmented image
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Loop through each celebrity folder. Sorting makes the numbering reproducible,
# because os.listdir() returns entries in arbitrary order.
for celeb in sorted(os.listdir(base_path)):
    celeb_folder = os.path.join(base_path, celeb)

    # Skip stray files (e.g. .DS_Store) that sit next to the celebrity folders
    if not os.path.isdir(celeb_folder):
        continue

    # Debugging: Check if the "Will-Smith" folder exists and print contents
    if celeb == "Will-Smith":
        print(f"Processing {celeb}...")
        print(f"Contents of {celeb} folder: {os.listdir(celeb_folder)}")  # Print the folder contents

    save_folder = os.path.join(output_path, celeb)
    os.makedirs(save_folder, exist_ok=True)  # Create sub-folder for each celebrity

    # Detect images with .jpg, .jpeg, or .png extensions (case-insensitive)
    images = sorted(
        name for name in os.listdir(celeb_folder)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    )

    # Debugging: Print out the images being detected for this celebrity
    if celeb == "Will-Smith":
        print(f"Detected images for {celeb}: {images}")

    num_originals = len(images)

    for position, image_name in enumerate(images, start=1):
        img_path = os.path.join(celeb_folder, image_name)
        img = load_img(img_path)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)  # Reshape to (1, height, width, channels)

        # Save the original image under its own index (001.jpg, 002.jpg, ...)
        original_image_path = os.path.join(save_folder, f"{position:03}.jpg")
        img.save(original_image_path)

        # Augmented images are numbered after ALL originals, so the two ranges
        # never interleave and no original can be pushed past the expected total.
        first_augmented_index = num_originals + (position - 1) * AUGMENTED_PER_IMAGE + 1

        # datagen.flow() yields batches forever; zip() with a range stops it after
        # AUGMENTED_PER_IMAGE batches. No save_to_dir here: the files are written
        # explicitly below, so Keras must not drop a second, randomly named copy
        # of each image into the output folder.
        augmented_batches = zip(range(AUGMENTED_PER_IMAGE), datagen.flow(x, batch_size=1))
        for offset, batch in augmented_batches:
            new_image_name = f"{first_augmented_index + offset:03}.jpg"  # Format with leading zeros
            new_image_path = os.path.join(save_folder, new_image_name)

            # Convert and save augmented image
            new_img = array_to_img(batch[0])
            new_img.save(new_image_path)

    # No cleanup pass is needed: the output tree was wiped above and only the
    # numbered files written in this loop exist in save_folder.

    # Ensure exactly 300 images for each celebrity (100 original + 200 augmented);
    # a different count means the source folder did not hold exactly 100 images.
    files = os.listdir(save_folder)
    if len(files) != EXPECTED_IMAGES_PER_CELEB:
        print(f"Warning: Expected {EXPECTED_IMAGES_PER_CELEB} images for {celeb}, but got {len(files)} images.")

Processing Will-Smith...
Contents of Will-Smith folder: ['01.jpeg', '02.jpeg', '03.jpeg', '04.jpeg', '05.jpeg', '06.jpeg', '07.jpeg', '08.jpeg', '09.jpeg', '10.jpeg', '100.jpeg', '11.jpeg', '12.jpeg', '13.jpeg', '14.jpeg', '15.jpeg', '16.jpeg', '17.jpeg', '18.jpeg', '19.jpeg', '20.jpeg', '21.jpeg', '22.jpeg', '23.jpeg', '24.jpeg', '25.jpeg', '26.jpeg', '27.jpeg', '28.jpeg', '29.jpeg', '30.jpeg', '31.jpeg', '32.jpeg', '33.jpeg', '34.jpeg', '35.jpeg', '36.jpeg', '37.jpeg', '38.jpeg', '39.jpeg', '40.jpeg', '41.jpeg', '42.jpeg', '43.jpeg', '44.jpeg', '45.jpeg', '46.jpeg', '47.jpeg', '48.jpeg', '49.jpeg', '50.jpeg', '51.jpeg', '52.jpeg', '53.jpeg', '54.jpeg', '55.jpeg', '56.jpeg', '57.jpeg', '58.jpeg', '59.jpeg', '60.jpeg', '61.jpeg', '62.jpeg', '63.jpeg', '64.jpeg', '65.jpeg', '66.jpeg', '67.jpeg', '68.jpeg', '69.jpeg', '70.jpeg', '71.jpeg', '72.jpeg', '73.jpeg', '74.jpeg', '75.jpeg', '76.jpeg', '77.jpeg', '78.jpeg', '79.jpeg', '80.jpeg', '81.jpeg', '82.jpeg', '83.jpeg', '84.jpeg', '85.jpe