In [1]:
# train test split

import tensorflow as tf
import numpy as np
import os
import shutil
from sklearn.model_selection import train_test_split

# Function that performs the per-city train/test split
def create_train_test_split(source_dir, train_dir, test_dir, test_size = 0.2,
                            extensions = (".jpg",), random_state = 42):
    """Copy each city's images from ``source_dir`` into per-city train/test folders.

    Every subdirectory of ``source_dir`` is treated as one city. Its image
    files are split with ``train_test_split`` and copied (the originals stay
    in place) to ``train_dir/<city>/`` and ``test_dir/<city>/``.

    Args:
        source_dir: Directory whose subdirectories (one per city) hold the images.
        train_dir: Destination root for training images; created if missing.
        test_dir: Destination root for test images; created if missing.
        test_size: Passed through to ``train_test_split``.
        extensions: File-name suffix or suffixes that identify image files.
            Matching is case-insensitive, so ``.JPG`` counts as ``.jpg``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        None. One line is printed per city (processed or skipped).
    """
    # ensuring that directories exist, if not they are created
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # accept a single suffix string as well as an iterable of suffixes
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # iterating through subdirectories (cities)
    for city in os.listdir(source_dir):
        city_dir = os.path.join(source_dir, city)

        # skipping non directories
        if not os.path.isdir(city_dir):
            continue

        # Collect regular image files only. The listing is sorted because
        # os.listdir returns names in arbitrary order; without a fixed input
        # order the seeded split would differ between machines and re-runs.
        images = sorted(
            f for f in os.listdir(city_dir)
            if f.lower().endswith(suffixes)
            and os.path.isfile(os.path.join(city_dir, f))
        )

        # train_test_split raises ValueError when one side would be empty;
        # skip such cities instead of aborting the whole run.
        if len(images) < 2:
            print(f'Skipped {city}: found {len(images)} image(s), need at least 2 to split')
            continue

        # Splitting images into Training and Testing Sets
        train_images, test_images = train_test_split(
            images,
            test_size = test_size,
            random_state = random_state
        )

        # creating subdirectories in train & test directories for each city
        city_train_dir = os.path.join(train_dir, city)
        city_test_dir = os.path.join(test_dir, city)
        os.makedirs(city_train_dir, exist_ok=True)
        os.makedirs(city_test_dir, exist_ok=True)

        # Copying Training images (copy2 also copies file metadata)
        for img in train_images:
            shutil.copy2(os.path.join(city_dir, img), os.path.join(city_train_dir, img))

        # Copying Testing images
        for img in test_images:
            shutil.copy2(os.path.join(city_dir, img), os.path.join(city_test_dir, img))

        print(f'Processed {city}: {len(train_images)} training images, {len(test_images)} test images')


def main(source_directory=None, train_directory=None, test_directory=None):
    """Run the train/test split and print how many files ended up on each side.

    Args:
        source_directory: Folder with one subdirectory of images per city.
        train_directory: Destination root for the training images.
        test_directory: Destination root for the test images.

    Any argument left as ``None`` falls back to the project path that was
    previously hard-coded, so calling ``main()`` behaves as before.

    Returns:
        None. The summary is printed.
    """
    # Fall back to the original project paths when no override is given
    if source_directory is None:
        source_directory = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Processed'
    if test_directory is None:
        test_directory = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Test'
    if train_directory is None:
        train_directory = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Train'

    # Calling Split function
    create_train_test_split(source_directory, train_directory, test_directory)

    # Count every file below each destination root (all files, not only images)
    train_count = sum(len(files) for _, _, files in os.walk(train_directory))
    test_count = sum(len(files) for _, _, files in os.walk(test_directory))

    # Little summary
    print("\nFinal split:")
    print(f"Training images: {train_count}")
    print(f"Test images: {test_count}")
    print(f"Total images: {train_count + test_count}")

# Run main() only when this code is executed directly (as a script or in a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Processed Mesa: 1243 training images, 311 test images
Processed Colorado Springs: 1814 training images, 454 test images
Processed Detroit: 1982 training images, 496 test images
Processed Fort Worth: 1523 training images, 381 test images
Processed Austin: 1982 training images, 496 test images
Processed Sacramento: 1971 training images, 493 test images
Processed Atlanta: 1960 training images, 490 test images
Processed New Yorkd: 2128 training images, 532 test images
Processed Columbus: 2083 training images, 521 test images
Processed Wichita: 1758 training images, 440 test images
Processed Bakersfield: 2016 training images, 504 test images
Processed Baltimorem: 1131 training images, 283 test images
Processed Minneapolis: 2161 training images, 541 test images
Processed Fresno: 1736 training images, 434 test images
Processed Louisvillel: 1568 training images, 392 test images
Processed Washingtonk: 918 training images, 230 test images
Processed Kansas City: 1892 training images, 474 test ima

In [2]:
# Train - Val Split

import os
import shutil
from sklearn.model_selection import train_test_split

# Source: the Train folder written by the train/test split
original_data_dir = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Train'

# New directories for the final training data and the validation data
train_dir = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Train1'
val_dir = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Val'

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# For every city
for city in os.listdir(original_data_dir):
    city_path = os.path.join(original_data_dir, city)

    # skip anything that is not a city folder
    if not os.path.isdir(city_path):
        continue

    # List the .jpg images of this class. Only regular files are kept, so
    # stray entries (e.g. .DS_Store, nested folders) are not split or copied.
    # The list is sorted because os.listdir returns names in arbitrary order,
    # which would make the seeded split differ between runs and machines.
    images = sorted(
        f for f in os.listdir(city_path)
        if f.lower().endswith(".jpg")
        and os.path.isfile(os.path.join(city_path, f))
    )

    # train_test_split raises ValueError when one side would be empty;
    # skip such cities instead of aborting the whole cell.
    if len(images) < 2:
        print(f"Skipped {city}: found {len(images)} image(s), need at least 2 to split")
        continue

    # Split: 80% Training, 20% Validation
    train_images, val_images = train_test_split(images, test_size=0.2, random_state=42)

    # path destination
    train_city_dir = os.path.join(train_dir, city)
    val_city_dir = os.path.join(val_dir, city)

    os.makedirs(train_city_dir, exist_ok=True)
    os.makedirs(val_city_dir, exist_ok=True)

    # copy images
    for img in train_images:
        shutil.copy(os.path.join(city_path, img), os.path.join(train_city_dir, img))

    for img in val_images:
        shutil.copy(os.path.join(city_path, img), os.path.join(val_city_dir, img))

    print(f"Processed {city}: {len(train_images)} training images, {len(val_images)} validation images")
