In [3]:
import os
import shutil
import random

def _copy_pairs(image_ids, images_folder, masks_folder, dest_images_folder, dest_masks_folder):
    """Copy each image and its matching mask into the destination folders."""
    for image_id in image_ids:
        image_file = f'rgb_{image_id}.png'
        mask_file = f'semantic_segmentation_{image_id}.png'
        shutil.copy(os.path.join(images_folder, image_file), os.path.join(dest_images_folder, image_file))
        shutil.copy(os.path.join(masks_folder, mask_file), os.path.join(dest_masks_folder, mask_file))


def split_dataset(images_folder, masks_folder, train_images_folder, train_masks_folder, val_images_folder, val_masks_folder, train_size=900, seed=None):
    """Randomly split an image/mask dataset into training and validation sets.

    Images are expected to be named ``rgb_<id>.png`` and their masks
    ``semantic_segmentation_<id>.png``. Files are *copied* (the source folders
    are left untouched) into the given destination folders, which are created
    if they do not exist.

    Args:
        images_folder: Folder containing the ``rgb_<id>.png`` images.
        masks_folder: Folder containing the ``semantic_segmentation_<id>.png`` masks.
        train_images_folder: Destination for training images.
        train_masks_folder: Destination for training masks.
        val_images_folder: Destination for validation images.
        val_masks_folder: Destination for validation masks.
        train_size: Maximum number of image/mask pairs placed in the training
            set; the remaining pairs go to the validation set.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If ``train_size`` is negative.
        FileNotFoundError: If any image has no matching mask. This is checked
            before anything is copied, so a failure never leaves a partial split.
    """
    if train_size < 0:
        raise ValueError(f"train_size must be non-negative, got {train_size}")

    # Only consider files that follow the rgb_<id>.png convention; stray files
    # (e.g. Thumbs.db) are ignored rather than crashing the ID extraction.
    prefix, suffix = 'rgb_', '.png'
    image_ids = [
        name[len(prefix):-len(suffix)]
        for name in sorted(os.listdir(images_folder))
        if name.startswith(prefix) and name.endswith(suffix)
    ]

    # Validate up front so a missing mask cannot abort the copy halfway through.
    missing = [
        image_id for image_id in image_ids
        if not os.path.isfile(os.path.join(masks_folder, f'semantic_segmentation_{image_id}.png'))
    ]
    if missing:
        raise FileNotFoundError(
            f"No mask found in {masks_folder!r} for image IDs: {', '.join(missing)}"
        )

    # Create the train and validation directories if they don't exist
    for folder in (train_images_folder, train_masks_folder, val_images_folder, val_masks_folder):
        os.makedirs(folder, exist_ok=True)

    # Shuffle the image IDs (deterministically when a seed is supplied)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(image_ids)

    # Split the IDs into train and validation sets
    train_ids = image_ids[:train_size]
    val_ids = image_ids[train_size:]

    # Copy the files into the appropriate directories
    _copy_pairs(train_ids, images_folder, masks_folder, train_images_folder, train_masks_folder)
    _copy_pairs(val_ids, images_folder, masks_folder, val_images_folder, val_masks_folder)

    # Report the number of pairs actually copied, which may be below train_size.
    print(f"Dataset split complete: {len(train_ids)} images for training, {len(val_ids)} images for validation.")

# Example usage: split the Desktop dataset into Segmentation_data/{train,validation}.
# Raw strings keep the Windows backslashes readable without doubling them.
images_folder = r'C:\Users\georg\Desktop\Train'
masks_folder = r'C:\Users\georg\Desktop\Train_mask'
train_images_folder = r'C:\Users\georg\Desktop\Segmentation_data\train\images'
train_masks_folder = r'C:\Users\georg\Desktop\Segmentation_data\train\labels'
val_images_folder = r'C:\Users\georg\Desktop\Segmentation_data\validation\images'
# Fixed typo ("lables" -> "labels") so the validation tree mirrors the train tree.
val_masks_folder = r'C:\Users\georg\Desktop\Segmentation_data\validation\labels'

split_dataset(images_folder, masks_folder, train_images_folder, train_masks_folder, val_images_folder, val_masks_folder)


Dataset split complete: 900 images for training, 100 images for validation.
