# Making Folds for K-Fold Cross Validation

Making $k = 4$ folds for K-Fold Cross Validation. This notebook can be used for both patch extraction and downsizing images.

### Importing Libraries

In [1]:
# Standard-library modules for filesystem access, copying files, and shuffling
import os
import shutil
import random

### Making Folders

Making folders for each fold.

In [2]:
def make_folders():
    """Create the ``downsize_folds`` directory tree for the four folds.

    Builds ``downsize_folds/<i>/c`` (carcinoma) and ``downsize_folds/<i>/s``
    (sarcoma) for ``i`` in 1..4, relative to the current working directory.
    Directories that already exist are left untouched, so the cell can be
    re-run without raising ``FileExistsError``.
    """

    # Creating each fold folder
    for i in range(1, 5):

        # Making the carcinoma and sarcoma folders; makedirs also creates the
        # overarching folder and the fold folder when they are missing
        for cancer_type in ("c", "s"):
            os.makedirs(
                os.path.join("downsize_folds", str(i), cancer_type),
                exist_ok=True,
            )

### Gathering All Images

Scraping all of the images from the train and validation folders.

In [3]:
# Lists of original image paths, one per split and cancer type
train_c = []
train_s = []
val_c = []
val_s = []


# Collecting the path of every image
def gather_images():
    """Append the path of every image under ``normalized_data`` to the lists.

    For each cancer type folder (``c`` then ``s``), the ``train`` and ``val``
    directories are listed and every entry is appended, as
    ``normalized_data/<split>/<type>/<name>``, to the matching module-level
    list (``train_c``, ``train_s``, ``val_c`` or ``val_s``). The lists are
    extended in place, so calling this twice adds every path twice.
    """

    # Overarching folder
    root = "normalized_data"

    # Which lists receive the training and validation paths of each type
    targets = {"c": (train_c, val_c), "s": (train_s, val_s)}

    # Crawling through the folder of each cancer type
    for kind in ["c", "s"]:
        train_list, val_list = targets[kind]

        train_dir = root + "/train/" + kind
        val_dir = root + "/val/" + kind

        # Gathering the training and validation sets; both directories are
        # listed before anything is appended
        train_names = os.listdir(train_dir)
        val_names = os.listdir(val_dir)

        # Adding all of the training images, then all of the validation images
        train_list.extend(train_dir + "/" + name for name in train_names)
        val_list.extend(val_dir + "/" + name for name in val_names)

### Creating arrays for each of the folds

In this case, there are $k = 4$ folds. To use a different number of folds $k$, change the divisor used to compute `fourth` from 4 to $k$, add or remove fold containers to match, and adjust the code that fills them accordingly.

In [4]:
# Containers to hold the folds
fold_1 = []
fold_2 = []
fold_3 = []
fold_4 = []


# Function to split a data set into the four folds
def create_folds(data_folder, seed=None):
    """Shuffle ``data_folder`` and append a quarter of it to each fold.

    The shuffled items are cut into four consecutive pieces of
    ``len(data_folder) // 4`` items each; any remainder (up to three items)
    goes into the last piece. The pieces are appended to the module-level
    lists ``fold_1`` .. ``fold_4``, so repeated calls accumulate.

    Args:
        data_folder: Iterable of items (image paths in this notebook) to
            distribute. It is not modified.
        seed: Optional seed for the shuffle. Pass a fixed value to get the
            same split on every run; ``None`` (the default) gives a different
            split each time.
    """

    # A private generator keeps the module-level `random` state untouched.
    # With seed=None it seeds itself from the system's randomness source,
    # which avoids restricting the shuffle to a small set of starting seeds.
    rng = random.Random(seed)

    # Shuffling a copy so the caller's list keeps its order
    shuffled = list(data_folder)
    rng.shuffle(shuffled)

    # Splitting the data set into fourths
    fourth = len(shuffled) // 4

    # Putting images into the first, second and third folds
    fold_1.extend(shuffled[:fourth])
    fold_2.extend(shuffled[fourth:2 * fourth])
    fold_3.extend(shuffled[2 * fourth:3 * fourth])

    # Putting the rest (including any remainder) into the fourth fold
    fold_4.extend(shuffled[3 * fourth:])

### Checking fold length

Checking the length of each fold.

In [5]:
def fold_check():
    """Print the number of items currently held in each of the four folds."""

    # Pairing each label with the fold it describes
    labelled_folds = (
        ("Fold 1 Length: ", fold_1),
        ("Fold 2 Length: ", fold_2),
        ("Fold 3 Length: ", fold_3),
        ("Fold 4 Length: ", fold_4),
    )

    # Checking the length of each fold
    for label, fold in labelled_folds:
        print(label + str(len(fold)))

### Copying images from arrays into folders

Taking the arrays of image names and copying the actual images into folders.

In [6]:
def copy_images(fold, target_folder):
    """Copy every image in ``fold`` into its class folder for one fold.

    Args:
        fold: Iterable of image paths. A path containing ``"/c/"`` is
            treated as carcinoma and copied into the ``c`` folder; every
            other path is copied into the ``s`` (sarcoma) folder.
        target_folder: Name of the fold directory under ``downsize_folds``
            (for example ``"1"``).

    Raises:
        OSError: If a source image or the destination class folder does not
            exist.
    """

    # Parses through each image in the fold array
    for image in fold:

        # Checking which class folder the image path came from
        cancer_type = "c" if "/c/" in image else "s"

        # Spelling out the full destination file path (rather than only the
        # folder) makes a missing class folder raise an error; given a bare
        # folder name that does not exist, shutil.copy would instead write a
        # single file with that name and overwrite it on every copy
        destination = os.path.join(
            "downsize_folds",
            target_folder,
            cancer_type,
            os.path.basename(image),
        )

        # Copies the image into the fold folder
        shutil.copy(image, destination)

### Testing all of the Functions

Making the folders, then gathering all of the images, creating the arrays, and then copying images from the arrays into their respective folders.

In [7]:
# Make the fold folders, then gather the image paths
make_folders()
gather_images()

# Create the folds, one source list at a time
for image_paths in (val_c, val_s, train_c, train_s):
    create_folds(image_paths)

# Check numbers in the folds (for troubleshooting)
fold_check()

# Copying the images of each fold into its folder
folds_by_name = (
    ("1", fold_1),
    ("2", fold_2),
    ("3", fold_3),
    ("4", fold_4),
)
for fold_name, fold_images in folds_by_name:
    copy_images(fold_images, fold_name)

Fold 1 Length: 324
Fold 2 Length: 324
Fold 3 Length: 324
Fold 4 Length: 332
