# In [None]:
import tensorflow as tf
import numpy as np
import os
import shutil
from sklearn.model_selection import train_test_split

# Function that splits each city's images into training and test folders
def create_train_test_split(source_dir, train_dir, test_dir, test_size=0.2,
                            extensions=(".jpg",)):
    """Copy each city's images into per-city training and test folders.

    Every sub-directory of ``source_dir`` is treated as one city. Its image
    files are split with ``train_test_split`` (fixed ``random_state=42``) and
    copied, metadata included, to ``train_dir/<city>`` and ``test_dir/<city>``.
    The source files are left untouched.

    Args:
        source_dir: Directory containing one sub-directory per city.
        train_dir: Destination root for the training images (created if missing).
        test_dir: Destination root for the test images (created if missing).
        test_size: Passed through to ``train_test_split``.
        extensions: File-name suffix, or tuple of suffixes, that marks a file
            as an image. Matching is case-insensitive, so ".JPG" also counts.

    Note:
        Files already present in the destination folders are never removed,
        so copies left over from an earlier run stay where they are.
    """
    # accept a bare string such as ".jpg" as well as a tuple of suffixes
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # ensuring that directories exist, if not they are created
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # iterating through subdirectories (cities); sorted so the log order is stable
    for city in sorted(os.listdir(source_dir)):
        city_dir = os.path.join(source_dir, city)

        # skipping non directories
        if not os.path.isdir(city_dir):
            continue

        # collecting image files; os.listdir returns names in arbitrary order,
        # so sort them - otherwise the fixed random_state would not yield the
        # same split on every machine
        images = sorted(
            f for f in os.listdir(city_dir) if f.lower().endswith(suffixes)
        )

        # fewer than two images cannot fill both a training and a test set;
        # train_test_split raises in that case, which would abort the whole run
        if len(images) < 2:
            print(f'Skipped {city}: found {len(images)} image(s), need at least 2 to split')
            continue

        # Splitting images into Training and Testing Sets
        train_images, test_images = train_test_split(
            images,
            test_size=test_size,
            random_state=42,
        )

        # creating subdirectories in train & test directories for each city
        city_train_dir = os.path.join(train_dir, city)
        city_test_dir = os.path.join(test_dir, city)
        os.makedirs(city_train_dir, exist_ok=True)
        os.makedirs(city_test_dir, exist_ok=True)

        # Copying Training images, then Testing images
        for target_dir, subset in ((city_train_dir, train_images),
                                   (city_test_dir, test_images)):
            for img in subset:
                shutil.copy2(os.path.join(city_dir, img),
                             os.path.join(target_dir, img))

        print(f'Processed {city}: {len(train_images)} training images, {len(test_images)} test images')


def main():
    """Split the project's processed images and print how many files ended up where."""
    # Output locations for the two subsets, followed by the folder holding the source images
    train_directory = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Train'
    test_directory = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Test'
    source_directory = '/Users/maximilianstumpf/Desktop/UCLA/Math156 - Machine Learning/Project/Data/Processed'

    # Perform the per-city split
    create_train_test_split(source_directory, train_directory, test_directory)

    # Tally every file found anywhere below the training folder ...
    train_count = 0
    for _root, _dirs, filenames in os.walk(train_directory):
        train_count += len(filenames)

    # ... and likewise below the test folder
    test_count = 0
    for _root, _dirs, filenames in os.walk(test_directory):
        test_count += len(filenames)

    # Short report of the resulting file counts
    total_count = train_count + test_count
    print("\nFinal split:")
    print(f"Training images: {train_count}")
    print(f"Test images: {test_count}")
    print(f"Total images: {total_count}")

# Run main() only when this file is executed directly as a script, not when it is imported as a module
if __name__ == "__main__":
    main()