In [1]:
import os
import random
from PIL import Image, ImageChops, ImageEnhance
import numpy as np
from collections import defaultdict
from albumentations import Compose, RandomCrop, HorizontalFlip, RandomBrightnessContrast, ShiftScaleRotate, GaussNoise

# Set the directory containing your images
image_folder = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Images'
base_output_folder = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Processed'

# Create the base output directory
os.makedirs(base_output_folder, exist_ok=True)

target_size = (227, 227)  # Target size for resized images
max_images_per_city = 5000  # Maximum number of processed images per city

# Load the reference "no imagery" image
reference_image_path = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/No imagery/No_photo.jpg'
reference_image = Image.open(reference_image_path).convert('L')  # Convert reference to grayscale

# Statistics tracking
processed_stats = defaultdict(lambda: {
    'total': 0,
    'processed': 0,
    'augmented': 0,
    'errors': 0,
    'no_imagery': 0
})


# Function to check if an image is "no imagery"
def is_no_imagery_image(img_path, reference_image):
    try:
        img = Image.open(img_path).convert('L')  # Convert image to grayscale
        img = img.resize(reference_image.size)  # Resize to match the reference
        diff = ImageChops.difference(reference_image, img)  # Compute pixel differences
        if diff.getbbox() is None:
            return True
        img_array = np.array(img)
        std = np.std(img_array)
        return std < 5  # Low variance indicates uniform image
    except Exception as e:
        print(f"Error checking no imagery for {img_path}: {e}")
        return False


# Function to apply augmentations using Albumentations
def apply_augmentations_with_albumentations(img):
    aug = Compose([
        HorizontalFlip(p=0.5),  # Flip horizontally
        RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),  # Adjust brightness and contrast
        ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=30, p=0.5),  # Random shifts, scaling, rotation
        GaussNoise(var_limit=(10.0, 50.0), p=0.3)  # Add random noise
    ])
    
    img_array = np.array(img)
    augmented = aug(image=img_array)
    return Image.fromarray(augmented["image"])


# Function to process and resize image
def process_image(img):
    """Convert image to grayscale and resize"""
    gray_img = img.convert("L")
    return gray_img.resize(target_size)


# Group files by city and remove "no imagery" files
print("\nGrouping files by city and checking for 'no imagery'...")
city_files = defaultdict(list)
for filename in os.listdir(image_folder):
    file_path = os.path.join(image_folder, filename)

    if is_no_imagery_image(file_path, reference_image):  # Skip "no imagery" files
        city_name = filename.split("_")[0]  # Simple city extraction
        processed_stats[city_name]['no_imagery'] += 1
        continue

    city_name = filename.split("_")[0]  # Extract city name
    if city_name:
        city_files[city_name].append(filename)
        processed_stats[city_name]['total'] += 1

print("\nInitial city statistics:")
for city, stats in processed_stats.items():
    print(f"{city}: {stats['total']} usable images found ({stats['no_imagery']} 'no imagery' files skipped)")


# Process images for each city
print("\nProcessing images with augmentations...")
for city, files in city_files.items():
    print(f"\nProcessing {city}...")
    
    city_output_folder = os.path.join(base_output_folder, city)
    os.makedirs(city_output_folder, exist_ok=True)  # Create city folder

    originals_needed = min(len(files), max_images_per_city)

    for filename in files[:originals_needed]:
        file_path = os.path.join(image_folder, filename)

        try:
            with Image.open(file_path) as img:
                # Process original image
                resized_img = process_image(img)  # Resize
                resized_output_path = os.path.join(city_output_folder, f"resized_{filename}")
                resized_img.save(resized_output_path)
                processed_stats[city]['processed'] += 1

                # Apply augmentations
                for i in range(8):  # Generate up to 8 augmented images
                    try:
                        augmented_img = apply_augmentations_with_albumentations(resized_img)
                        aug_output_path = os.path.join(city_output_folder, f"aug_{i}_{filename}")
                        augmented_img.save(aug_output_path)
                        processed_stats[city]['augmented'] += 1
                    except Exception as e:
                        print(f"Error augmenting {filename}: {e}")
                        processed_stats[city]['errors'] += 1

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            processed_stats[city]['errors'] += 1


# Print final processing statistics
print("\nFinal Processing Statistics:")
print("-" * 100)
print(f"{'City':<20} {'Original Files':<15} {'No Imagery':<15} {'Processed':<12} {'Augmented':<12} {'Errors':<12}")
print("-" * 100)
for city, stats in processed_stats.items():
    print(f"{city:<20} {stats['total']:<15} {stats['no_imagery']:<15} {stats['processed']:<12} {stats['augmented']:<12} {stats['errors']:<12}")

# Verify final counts
print("\nVerifying final image counts per city...")
for city in city_files.keys():
    city_folder = os.path.join(base_output_folder, city)
    if os.path.exists(city_folder):
        file_count = len(os.listdir(city_folder))
        print(f"{city}: {file_count} total processed files")
    else:
        print(f"Warning: No output folder found for {city}")

print("\nProcessing complete. All images converted, augmented, and saved to city-specific folders.")



Grouping files by city and checking for 'no imagery'...

Initial city statistics:
San Franciscoh: 193 usable images found (1 'no imagery' files skipped)
Chicago: 165 usable images found (25 'no imagery' files skipped)
Baltimorem: 101 usable images found (1 'no imagery' files skipped)
Sacramento: 176 usable images found (3 'no imagery' files skipped)
Jacksonvillef: 171 usable images found (7 'no imagery' files skipped)
Austin: 177 usable images found (8 'no imagery' files skipped)
Memphis: 179 usable images found (8 'no imagery' files skipped)
Phoenix: 165 usable images found (15 'no imagery' files skipped)
Detroit: 177 usable images found (6 'no imagery' files skipped)
Minneapolis: 193 usable images found (3 'no imagery' files skipped)
Omaha: 179 usable images found (14 'no imagery' files skipped)
Kansas City: 169 usable images found (8 'no imagery' files skipped)
Colorado Springs: 162 usable images found (6 'no imagery' files skipped)
Indianapolisg: 185 usable images found (13 'no im