In [1]:
import os
import random
from PIL import Image, ImageChops
import numpy as np
from collections import defaultdict
from albumentations import Compose, HorizontalFlip, RandomBrightnessContrast, ShiftScaleRotate, GaussNoise
from sklearn.model_selection import train_test_split
from shutil import copy2

# Set the directory containing your images
image_folder = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Cities/Images'
base_output_folder = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Cities/Processed'

# Load the reference "no imagery" image
reference_image_path = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Cities/No_imagery/no_imagery.jpg'
reference_image = Image.open(reference_image_path).convert('L')  # Convert reference to grayscale

# List of cities to process
desired_cities = [
    "New Yorkd", "Los Angeles", "Washingtonk", "Miami", 
    "San Franciscoh", "Albuquerque", "Chicago", 
    "Kansas City", "Las Vegas", "Tampa"
]

# Create the base output directory
os.makedirs(base_output_folder, exist_ok=True)

target_size = (227, 227)  # Target size for resized images
split_ratios = (0.5, 0.3, 0.2)  # Train, Test, Val split ratios

# Function to check if an image is "no imagery"
def is_no_imagery_image(img_path, reference_image):
    try:
        img = Image.open(img_path).convert('L')  # Convert image to grayscale
        img = img.resize(reference_image.size)  # Resize to match the reference
        diff = ImageChops.difference(reference_image, img)  # Compute pixel differences
        if diff.getbbox() is None:
            return True
        img_array = np.array(img)
        std = np.std(img_array)
        return std < 5  # Low variance indicates uniform image
    except Exception as e:
        print(f"Error checking no imagery for {img_path}: {e}")
        return False

# Function to apply augmentations using Albumentations
def apply_augmentations_with_albumentations(img):
    aug = Compose([
        HorizontalFlip(p=0.5),
        RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
        ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=30, p=0.5),
        GaussNoise(var_limit=(10.0, 50.0), p=0.3)
    ])
    img_array = np.array(img)
    augmented = aug(image=img_array)
    return Image.fromarray(augmented["image"])

# Function to process and resize image
def process_image(img):
    """Convert image to grayscale and resize"""
    return img.resize(target_size)

# Group files by city, skipping "no imagery" files and filtering by desired cities
print("\nGrouping files by city and skipping 'no imagery' files...")
city_files = defaultdict(list)
for filename in os.listdir(image_folder):
    file_path = os.path.join(image_folder, filename)
    if os.path.isfile(file_path):
        if is_no_imagery_image(file_path, reference_image):
            continue  # Skip "no imagery" files
        city_name = filename.split("_")[0]  # Simple city extraction
        if city_name and city_name in desired_cities:  # Only include desired cities
            city_files[city_name].append(filename)

# Process and organize images into city folders
print("\nProcessing images into city folders...")
for city, files in city_files.items():
    city_folder = os.path.join(base_output_folder, "all", city)
    os.makedirs(city_folder, exist_ok=True)
    
    for filename in files:
        file_path = os.path.join(image_folder, filename)
        try:
            with Image.open(file_path) as img:
                processed_img = process_image(img)
                processed_img.save(os.path.join(city_folder, filename))
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Train-Test-Val Split
print("\nSplitting images into Train, Test, and Validation sets...")
split_folders = ['train', 'test', 'val']
for split in split_folders:
    split_folder = os.path.join(base_output_folder, split)
    os.makedirs(split_folder, exist_ok=True)

for city, files in city_files.items():
    city_all_folder = os.path.join(base_output_folder, "all", city)
    images = [f for f in os.listdir(city_all_folder) if os.path.isfile(os.path.join(city_all_folder, f))]
    
    train_files, temp_files = train_test_split(images, test_size=split_ratios[1] + split_ratios[2], random_state=42)
    val_files, test_files = train_test_split(temp_files, test_size=split_ratios[2] / (split_ratios[1] + split_ratios[2]), random_state=42)
    
    for split, split_files in zip(['train', 'val', 'test'], [train_files, val_files, test_files]):
        split_city_folder = os.path.join(base_output_folder, split, city)
        os.makedirs(split_city_folder, exist_ok=True)
        for file in split_files:
            copy2(os.path.join(city_all_folder, file), split_city_folder)

# Augment the training set
print("\nApplying augmentations to training sets...")
train_folder = os.path.join(base_output_folder, "train")
for city in city_files.keys():
    train_city_folder = os.path.join(train_folder, city)
    train_images = [f for f in os.listdir(train_city_folder) if os.path.isfile(os.path.join(train_city_folder, f))]
    
    for filename in train_images:
        file_path = os.path.join(train_city_folder, filename)
        try:
            with Image.open(file_path) as img:
                for i in range(8):  # Generate up to 8 augmented images
                    try:
                        augmented_img = apply_augmentations_with_albumentations(img)
                        aug_output_path = os.path.join(train_city_folder, f"aug_{i}_{filename}")
                        augmented_img.save(aug_output_path)
                    except Exception as e:
                        print(f"Error augmenting {filename}: {e}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

print("\nProcessing complete. All images grouped, split, and augmented.")



Grouping files by city and skipping 'no imagery' files...
Error checking no imagery for /Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Cities/Images/.DS_Store: cannot identify image file '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Cities/Images/.DS_Store'

Processing images into city folders...

Splitting images into Train, Test, and Validation sets...

Applying augmentations to training sets...

Processing complete. All images grouped, split, and augmented.
