In [2]:
# Import necessary libraries 
import os
import random
import glob
import shutil

## Step 1: Randomly select 10% of the masked images as validation images

In [14]:
# Step 1 picks a reproducible random subset of the mask files to hold out for validation.
# It defines: masked_pathways, base_names, zipped_masked, num_images_validation
# and validation_data_masked, which the later steps read.

# Glob pattern matching every masked image (ALL). Tailor to your data location.
masked_pattern = "/Volumes/LN_Images_D/Dina/ALL_DATA/MASKED/*"

# Fraction of the masks held out for validation, and the seed that fixes the split
VALIDATION_FRACTION = 0.1
SPLIT_SEED = 42

# glob returns paths in arbitrary order; sort them so the seeded sample below
# selects the same files on every run
masked_pathways = sorted(glob.glob(masked_pattern))

# An empty match (e.g. the volume is not mounted) would otherwise look like a
# successful run that simply copies nothing
if not masked_pathways:
    print("WARNING: no masked images found for pattern:", masked_pattern)

# Extract base names from the masked pathways
base_names = [os.path.basename(name) for name in masked_pathways]

# Create a list with the pathways and corresponding base name
zipped_masked = list(zip(masked_pathways, base_names))

# Calculate the number of validation images (fraction of total, rounded down)
num_images_validation = int(VALIDATION_FRACTION * len(masked_pathways))

# Checkpoint
print("Number of images to collect for validation:", num_images_validation)

# Randomly select the validation images. A dedicated Random instance keeps the
# split reproducible without reseeding the global random module.
validation_data_masked = random.Random(SPLIT_SEED).sample(masked_pathways, num_images_validation)

# Checkpoint
print("Step 1 Done")

Number of images to collect for validation: 0
Step 1 Done


## Step 2: Create two folders, one with training and one with validation masks

In [15]:
# Step 2 copies every mask into either the validation or the training mask folder,
# renaming it on the way. It reads zipped_masked and validation_data_masked from Step 1.

# Define output folders:

# Folder for the validation images masked
folder_validation_masked = "folder_for_validation_mask"

# Folder for the training images masked
folder_train_masked = "folder_for_train_mask"

# shutil.copy does not create missing directories, so make sure both exist
os.makedirs(folder_validation_masked, exist_ok=True)
os.makedirs(folder_train_masked, exist_ok=True)

# Set lookup instead of scanning the validation list once per mask
validation_paths = set(validation_data_masked)

# Loop through the zipped list with the masked pathways and corresponding base names
for path, name in zipped_masked:

    # Pick the destination folder and the tag inserted into the file name
    if path in validation_paths:
        target_folder, tag = folder_validation_masked, "VALIDATION"
    else:
        target_folder, tag = folder_train_masked, "TRAIN"

    # Modify the base name: replace spaces with underscores, then replace the
    # ending ('masked.png' -> '<TAG>_masked.png'). Tailor
    new_name = name.replace(" ", "_").replace("masked.png", tag + "_masked.png")

    # Copy the mask to its folder under the modified name
    shutil.copy(path, os.path.join(target_folder, new_name))

# Checkpoint
print("Step 2 Done")

Step 2 Done


## Step 3: Copy the image that corresponds to each mask into either the training or the validation image folder

In [16]:
# Step 3 copies every image into the validation image folder when a validation
# mask with the same base name exists, and into the training image folder otherwise.
# It reads folder_validation_masked (Step 2) and num_images_validation (Step 1).

# Folder with WSIs in PNG format
img_python_folder = "/Volumes/LN_Images_D/Dina/ALL_DATA/IMAGE/*"

# Store the image paths in a list
img_paths = glob.glob(img_python_folder)

# Checkpoint
print(len(img_paths))

# File-name endings stripped to obtain the shared base name. Tailor
copied_suffix = ".copied.png"
validation_mask_suffix = "_VALIDATION_masked.png"  # ending given to validation masks in Step 2

# Open empty list to store the basenames for the images
basename_img = []

# Store and modify the basenames for the images in a list
for img in img_paths:
    # Store the basename in a variable
    basename = os.path.basename(img)
    # Tailor this part to your needs. Make sure all basenames are the same "format"
    if basename.endswith(copied_suffix):
        # Remove the ".copied.png" ending
        mod_basename = basename[:-len(copied_suffix)]
    else:
        # Otherwise only remove the file extension (".png")
        mod_basename = os.path.splitext(basename)[0]

    # Step 2 replaces spaces with underscores in the mask names; apply the same
    # rule here so an image whose name contains spaces still matches its mask
    basename_img.append(mod_basename.replace(" ", "_"))

# Checkpoint
print(len(basename_img))

# Create list with image name and corresponding pathway
# (a list, not a bare zip iterator, so it can be iterated more than once)
zipped = list(zip(basename_img, img_paths))

# Pattern matching all validation masks created in Step 2. Built in a new
# variable: appending "/*" to folder_validation_masked itself would break the
# pattern when this cell is run a second time
validation_mask_pattern = os.path.join(folder_validation_masked, "*")

# Store all validation mask pathways in a list
mask_validation_paths = glob.glob(validation_mask_pattern)

# Open empty list to store validation mask list
basename_mask_validation = []

# Store the basenames for the validation binary masks in a list
for mask in mask_validation_paths:
    # Store the basename in a variable
    basename = os.path.basename(mask)
    # Skip files that do not carry the validation-mask ending; slicing them
    # blindly would produce a meaningless base name
    if not basename.endswith(validation_mask_suffix):
        continue
    # Remove the "_VALIDATION_masked.png" ending. Modify to your situation
    basename_mask_validation.append(basename[:-len(validation_mask_suffix)])

# Set lookup instead of scanning the list once per image
validation_basenames = set(basename_mask_validation)

# Define output folders for the images
# Validation image folder
folder_validation_img = "folder_to_validation_image"
# Training image folder
folder_train_img = "folder_to_train_image"

# shutil.copy does not create missing directories, so make sure both exist
os.makedirs(folder_validation_img, exist_ok=True)
os.makedirs(folder_train_img, exist_ok=True)

# Initiate variable to count the number of validation images
count = 0

# Loop through the list with all images
for img, path in zipped:
    if img in validation_basenames:
        # Validation image: count it and build its destination path
        count += 1
        destination_path = os.path.join(folder_validation_img, img + "_VALIDATION_image.png")
    else:
        # Training image: build its destination path
        destination_path = os.path.join(folder_train_img, img + "_TRAIN_image.png")

    # Optional checkpoint. Uncomment if needed
    #print(destination_path)

    # Copy the image to its output folder
    shutil.copy(path, destination_path)

# Check that the number of validation images is the same as the number of validation masks
print(count == num_images_validation)

# Final checkpoint
print("Step 3 Done")

0
0
True
Step 3 Done
