# Preprocess Images for Classification
Crop and standardize image sizes for classification.

In [15]:
import os
import json
from PIL import Image, ImageOps
import shutil

## Load data and paths

In [16]:
# Base directory paths
base_dir = "../filtered_data"

# Output directory paths
train_output_dir = os.path.join(base_dir, "train")
val_output_dir = os.path.join(base_dir, "val")
# Create output directories
os.makedirs(train_output_dir, exist_ok=True)
os.makedirs(val_output_dir, exist_ok=True)

# Output image and annotation paths
train_images_dir = os.path.join(train_output_dir, "images")
train_ann_file = os.path.join(train_output_dir, "final_ann.json")
val_images_dir = os.path.join(val_output_dir, "images")
val_ann_file = os.path.join(val_output_dir, "final_ann.json")

print('train_output_dir:', train_output_dir)
print('val_output_dir:', val_output_dir)

print('train_images_dir:', train_images_dir)
print('train_ann_file:', train_ann_file)
print('val_images_dir:', val_images_dir)
print('val_ann_file:', val_ann_file)

# Load COCO annotations
with open(train_ann_file) as f:
    train_data = json.load(f)

with open(val_ann_file) as f:
    val_data = json.load(f)

train_output_dir: ../filtered_data/train
val_output_dir: ../filtered_data/val
train_images_dir: ../filtered_data/train/images
train_ann_file: ../filtered_data/train/final_ann.json
val_images_dir: ../filtered_data/val/images
val_ann_file: ../filtered_data/val/final_ann.json


In [17]:
def crop_images(annotations, images, source_dir, output_dir):
    """
    Crops images based on bounding box annotations and saves the cropped images and their category IDs.

    Inputs:
        annotations (list): A list of dictionaries, each containing the bounding box and category ID for an image.
        images (list): A list of dictionaries, each containing the image ID and file name.
        source_dir (str): The directory path where the original images are stored.
        output_dir (str): The base directory path where the cropped images and labels JSON will be saved.

    Returns:
        cropped_images_info (list): A list of dictionaries, each containing information about the original and cropped image file names.
    """
    # Create a directory to save the cropped images
    image_cropped_dir = os.path.join(output_dir, 'cropped_images')
    # Ensure the output directory is clean
    if os.path.exists(image_cropped_dir):
        shutil.rmtree(image_cropped_dir)
    # Recreate the output directory
    os.makedirs(image_cropped_dir)

    labels_file = os.path.join(output_dir, 'cropped_labels.json')

    # Ensure output directories exist
    os.makedirs(image_cropped_dir, exist_ok=True)

    # Map image ids to file names and category ids
    image_id_to_file_name = {image['id']: image['file_name'] for image in images}
    image_cat_ids = {}

    cropped_images_info = []

    for annotation in annotations:
        image_id = annotation['image_id']
        category_id = annotation['category_id']
        if image_id in image_id_to_file_name:
            file_name = image_id_to_file_name[image_id]
            bbox = annotation['bbox']
            left, top, width, height = bbox
            right = left + width
            bottom = top + height
            
            try:
                with Image.open(os.path.join(source_dir, file_name)) as img:
                    cropped_img = img.crop((left, top, right, bottom))
                    new_file_name = f"cropped_{annotation['id']}_{file_name}"
                    output_path = os.path.join(image_cropped_dir, new_file_name)
                    cropped_img.save(output_path)
                    cropped_images_info.append({'original_image': file_name, 'cropped_image': new_file_name})
                    # Save image id as key and category id as value
                    image_cat_ids[new_file_name] = category_id
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    
    # Save image category IDs to JSON file
    with open(labels_file, 'w') as f:
        json.dump(image_cat_ids, f)

    return cropped_images_info

### Crop Train Images

In [18]:
# Crop and save train images
cropped_train_info = crop_images(train_data['annotations'], train_data['images'], train_images_dir, train_output_dir)

# Display some of the cropped images information to verify the process
cropped_train_info[:5]

[{'original_image': '131094.jpg',
  'cropped_image': 'cropped_184135_131094.jpg'},
 {'original_image': '131094.jpg',
  'cropped_image': 'cropped_184138_131094.jpg'},
 {'original_image': '131094.jpg',
  'cropped_image': 'cropped_184139_131094.jpg'},
 {'original_image': '131094.jpg',
  'cropped_image': 'cropped_184988_131094.jpg'},
 {'original_image': '131094.jpg',
  'cropped_image': 'cropped_184989_131094.jpg'}]

### Crop Validation Images

In [19]:
print(val_images_dir)

# print the number of files in val_images_dir
print(len(os.listdir(val_images_dir)))

../filtered_data/val/images
551


In [20]:
# Crop and save validation images
cropped_val_info = crop_images(val_data['annotations'], val_data['images'], val_images_dir, val_output_dir)

# Display some of the cropped images information to verify the process
cropped_val_info[:5]

[{'original_image': '134535.jpg',
  'cropped_image': 'cropped_188824_134535.jpg'},
 {'original_image': '134535.jpg',
  'cropped_image': 'cropped_188826_134535.jpg'},
 {'original_image': '123727.jpg',
  'cropped_image': 'cropped_174359_123727.jpg'},
 {'original_image': '174956.jpg',
  'cropped_image': 'cropped_300210_174956.jpg'},
 {'original_image': '105173.jpg',
  'cropped_image': 'cropped_149259_105173.jpg'}]

In [21]:
num_cropped_val = len(os.listdir('../filtered_data/train/cropped_images'))
print(f"Number of cropped validation images: {num_cropped_val}")

Number of cropped validation images: 36501


## Standardize Images

In [22]:
# Function to standardize the size of cropped images
def standardize_image_sizes(input_dir, output_size=(224, 224)):
    """
    Resizes all images in a given directory to a specified size, maintaining aspect ratio.

    This function iterates over each image file in the specified directory, resizes the image to the given dimensions
    using the PIL.ImageOps.fit method (which maintains the image's aspect ratio by cropping as necessary),
    and overwrites the original image file with the resized image.

    Inputs:
        input_dir (str): The directory containing the images to be resized.
        output_size (tuple): The target size for the resized images as a tuple (width, height). Default is (224, 224).

    Returns:
        standardized_images_info (list): List of dictionaries, each containing the file name of an image and its new size.
    """
    standardized_images_info = []

    for image_file in os.listdir(input_dir):
        input_path = os.path.join(input_dir, image_file)
        try:
            with Image.open(input_path) as img:
                # Resize image using default resampling method
                img_resized = ImageOps.fit(img, output_size)
                # Save the resized image
                img_resized.save(input_path)
                standardized_images_info.append({'image': image_file, 'new_size': img_resized.size})
        except Exception as e:
            print(f"Error processing {image_file}: {e}")

    return standardized_images_info


### Standardize Train Images

In [23]:
train_cropped_dir = os.path.join(train_output_dir, 'cropped_images')

# Standardize the sizes of the cropped images and display some of the processed images information
standardize_train_info = standardize_image_sizes(train_cropped_dir)

# Since we encountered errors previously due to the directory structure, 
# this code assumes the correction of those errors and that images are now correctly accessible.
# Displaying information of a few standardized images (if available)
standardize_train_info[:5]

[{'image': 'cropped_109264_070674.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_146262_103049.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_76046_042043.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_103881_066829.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_110794_072057.jpg', 'new_size': (224, 224)}]

### Standardize Validation Images

In [24]:
val_cropped_dir = os.path.join(val_output_dir, 'cropped_images')

# Standardize the sizes of the cropped validation images and display some of the processed images information
standardize_val_info = standardize_image_sizes(val_cropped_dir)

# Since we encountered errors previously due to the directory structure,
# this code assumes the correction of those errors and that images are now correctly accessible.
# Displaying information of a few standardized images (if available)
standardize_val_info[:5]

[{'image': 'cropped_172285_122113.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_245743_154073.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_209006_139376.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_246408_153216.jpg', 'new_size': (224, 224)},
 {'image': 'cropped_149259_105173.jpg', 'new_size': (224, 224)}]

In [25]:
# get the number of images in the cropped directory
num_cropped_train = len(os.listdir('../filtered_data/train/cropped_images'))
print(f"Number of cropped train images: {num_cropped_train}")

num_cropped_val = len(os.listdir('../filtered_data/val/cropped_images'))
print(f"Number of cropped validation images: {num_cropped_val}")

Number of cropped train images: 36501
Number of cropped validation images: 782
