In [1]:
#import kagglehub
import os
import random
from PIL import Image, ImageChops, ImageEnhance
import re
import numpy as np
from collections import defaultdict

# Download latest version (Set this to False the first time you are running the script to download the data)
is_downloaded = True

if is_downloaded:
    print("Data is already downloaded")
else:
    path = kagglehub.dataset_download("pinstripezebra/google-streetview-top-50-us-cities")
    print("Path to dataset files:", path)

# Set the directory containing your images (Change this to your directory)
image_folder = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Images'
base_output_folder = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Processed'

# Create the base output directory
os.makedirs(base_output_folder, exist_ok=True)

target_size = (227, 227) # Target size for resized images
max_images_per_city = 5000  # Maximum number of processed images per city
num_patches = 5  # Number of patches to extract per image

# Load the reference "no imagery" image
reference_image_path = r'/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/No imagery/No_photo.jpg'
reference_image = Image.open(reference_image_path).convert('L')  # Convert reference to grayscale too

# Statistics tracking with augmentation stats added
processed_stats = defaultdict(lambda: {
    'total': 0, 
    'processed': 0, 
    'augmented': 0,
    'errors': 0,
    'no_imagery': 0
})
# Function to check if an image is "no imagery"
def is_no_imagery_image(img_path, reference_image):
    try:
        img = Image.open(img_path).convert('L') # convert image to grayscale
        img = img.resize(reference_image.size) # resize to match the reference
        diff = ImageChops.difference(reference_image, img) # compute pixel differences
        if diff.getbbox() is None: 
            return True
        img_array = np.array(img)
        std = np.std(img_array)
        return std < 5 # low variance indicates uniform image
    except Exception as e:
        print(f"Error checking no imagery for {img_path}: {e}")
        return False

# Function to extract city name from filename
def get_city_name(filename):
    patterns = [
        r"([A-Za-z]+)(?:_\d+)?(?:_ \([-\d.]+, [-\d.]+\))?\.jpg", # CityName_XYZ.jpg format
        r"([A-Za-z]+[ A-Za-z]+)(?:_\d+)?(?:_ \([-\d.]+, [-\d.]+\))?\.jpg", # Multi-word city names
        r"([A-Za-z]+[.]?[ ]?[A-Za-z]+)(?:_\d+)?(?:_ \([-\d.]+, [-\d.]+\))?\.jpg" # edge cases
    ]
    
    for pattern in patterns:
        match = re.match(pattern, filename)
        if match:
            return match.group(1).replace(" ", "_") # replace spaces with underscores
    return None

# Function to apply augmentations
def apply_augmentations(img):
    augmented_images = []
    
    # List of augmentation functions
    augmentations = [
        ('flip_h', lambda x: x.transpose(Image.FLIP_LEFT_RIGHT)), # horizontal flip
        ('flip_v', lambda x: x.transpose(Image.FLIP_TOP_BOTTOM)), # Vertical flip
        ('rotate90', lambda x: x.rotate(90)), # rotate by 90 degress
        ('bright_up', lambda x: ImageEnhance.Brightness(x).enhance(1.2)), # increase brightness
        ('bright_down', lambda x: ImageEnhance.Brightness(x).enhance(0.8)), # decrease brightness
        ('contrast_up', lambda x: ImageEnhance.Contrast(x).enhance(1.2)), # increase contrast
        ('contrast_down', lambda x: ImageEnhance.Contrast(x).enhance(0.8)) # decrease contrast
    ]
    
    # Apply each augmentation
    for aug_name, aug_func in augmentations:
        try:
            aug_img = aug_func(img)
            augmented_images.append((aug_name, aug_img))
        except Exception as e:
            print(f"Error applying {aug_name} augmentation: {e}")
            continue
    
    return augmented_images

# Function to process and resize image
def process_image(img):
    """Convert image to grayscale and resize"""
    gray_img = img.convert('L')
    return gray_img.resize(target_size)

# Function to extract random crops
def random_crop(img):
    width, height = img.size
    if width < 227 or height < 227:
        raise ValueError("Image size must be at least 227x227 for cropping.")
    
    x = random.randint(0, width - 227)
    y = random.randint(0, height - 227)
    
    return img.crop((x, y, x + 227, y + 227))

# Function to extract random patches and apply augmentation to each patch
def extract_patches(img, num_patches=5):
    patches = []
    gray_img = img if img.mode == 'L' else img.convert('L')
    
    for i in range(num_patches):
        try:
            patch = random_crop(gray_img) # Extract original patch
            patches.append(('original', patch)) # Add original patch with identifier
            augmented_patches = apply_augmentations(patch) # Add augmented versions of the patch
            patches.extend(augmented_patches)
        except ValueError as e:
            print(f"Error extracting patch {i}: {e}")
            continue
            
    return patches

# Group files by city and remove no imagery files
print("\nGrouping files by city and checking for no imagery...")
city_files = defaultdict(list)
for filename in os.listdir(image_folder):
    file_path = os.path.join(image_folder, filename)
    
    if is_no_imagery_image(file_path, reference_image): # skip "no imagery" files
        city_name = get_city_name(filename)
        if city_name:
            processed_stats[city_name]['no_imagery'] += 1
        print(f"Found no imagery file: {filename}")
        continue
        
    city_name = get_city_name(filename)
    if city_name:
        city_files[city_name].append(filename)
        processed_stats[city_name]['total'] += 1

print("\nInitial city statistics:")
for city, stats in processed_stats.items():
    print(f"{city}: {stats['total']} usable images found ({stats['no_imagery']} no imagery files skipped)")

# Process images for each city
print("\nProcessing images with augmentations...")
for city, files in city_files.items():
    print(f"\nProcessing {city}...")
    
    city_output_folder = os.path.join(base_output_folder, city)
    os.makedirs(city_output_folder, exist_ok=True) # create city folder
    
    # Calculate how many original images we need to process
    # Each original image will produce multiple augmented versions
    augmentation_factor = 8  # original + 7 augmentations
    images_per_original = (1 + num_patches) * augmentation_factor
    originals_needed = min(len(files), max_images_per_city // images_per_original)
    
    for filename in files[:originals_needed]:
        file_path = os.path.join(image_folder, filename)
        
        try:
            with Image.open(file_path) as img:
                # Process original image
                processed_img = process_image(img) # resize
                
                # Save original resized version
                resized_output_path = os.path.join(city_output_folder, f"resized_original_{filename}")
                processed_img.save(resized_output_path)
                processed_stats[city]['processed'] += 1
                
                # Save augmented versions of full image
                augmented_images = apply_augmentations(processed_img) # augment resized pic
                for aug_name, aug_img in augmented_images:
                    aug_output_path = os.path.join(city_output_folder, f"resized_{aug_name}_{filename}")
                    aug_img.save(aug_output_path)
                    processed_stats[city]['augmented'] += 1
                
                # Extract and save patches with their augmentations
                try:
                    patches = extract_patches(img, num_patches=num_patches) # extract & augment patches
                    for i, (patch_type, patch) in enumerate(patches):
                        patch_output_path = os.path.join(
                            city_output_folder, 
                            f"patch_{i//augmentation_factor + 1}_{patch_type}_{filename}"
                        )
                        patch.save(patch_output_path)
                        if patch_type == 'original':
                            processed_stats[city]['processed'] += 1
                        else:
                            processed_stats[city]['augmented'] += 1
                            
                    print(f"Successfully processed {filename} for {city} with augmentations")
                except ValueError as e:
                    print(f"Error processing patches for {filename}: {e}")
                    processed_stats[city]['errors'] += 1
                    
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            processed_stats[city]['errors'] += 1

# Print final processing statistics
print("\nFinal Processing Statistics:")
print("-" * 110)
print(f"{'City':<20} {'Original Files':<15} {'No Imagery':<15} {'Processed':<12} {'Augmented':<12} {'Errors':<12}")
print("-" * 110)
for city, stats in processed_stats.items():
    print(f"{city:<20} {stats['total']:<15} {stats['no_imagery']:<15} {stats['processed']:<12} {stats['augmented']:<12} {stats['errors']:<12}")

# Verify final counts
print("\nVerifying final image counts per city...")
for city in city_files.keys():
    city_folder = os.path.join(base_output_folder, city)
    if os.path.exists(city_folder):
        file_count = len(os.listdir(city_folder))
        print(f"{city}: {file_count} total processed files")
    else:
        print(f"Warning: No output folder found for {city}")

print("\nProcessing complete. All images converted to grayscale, augmented, and saved to city-specific folders.")

Data is already downloaded

Grouping files by city and checking for no imagery...
Found no imagery file: Jacksonvillef_ (30.345588, -81.638109).jpg
Found no imagery file: Memphis_ (35.090783, -89.992134).jpg
Found no imagery file: El Paso_ (31.850921, -106.435131).jpg
Found no imagery file: Tulsa_ (36.121028, -95.875602).jpg
Found no imagery file: Oklahoma City_ (35.493458, -97.502302).jpg
Found no imagery file: Nashvillej_ (36.14538, -86.769295).jpg
Found no imagery file: Denveri_ (39.772808, -104.891637).jpg
Found no imagery file: El Paso_ (31.840606, -106.43546).jpg
Found no imagery file: Sacramento_ (38.58537, -121.478973).jpg
Found no imagery file: Mesa_ (33.417699, -111.743102).jpg
Found no imagery file: Detroit_ (42.403928, -83.076705).jpg
Found no imagery file: Tucson_ (32.173772, -110.882009).jpg
Found no imagery file: Las Vegas_ (36.251718, -115.270892).jpg
Found no imagery file: Las Vegas_ (36.210536, -115.270495).jpg
Found no imagery file: San Diego_ (32.697855, -117.1831).