# Train-Val-Test split

Notebook for randomly splitting datasets into train, validation, and test subsets (to be used in the implementation of SwinTransformer models). Code adapted from Abdul Mukit's post on StackOverflow (2018): https://stackoverflow.com/questions/53074712/how-to-split-folder-of-images-into-test-training-validation-sets-with-stratified

In [None]:
from google.colab import drive
import os
import numpy as np
import shutil
import pandas as pd
import random

In [None]:
# Mount Google Drive at /content/gdrive so the dataset folders referenced
# by the later cells (all rooted under /content/gdrive/MyDrive/) are reachable.
drive.mount('/content/gdrive')

In [None]:
# Path configuration for the split.
#
# NOTE: every *_dir value below is a plain string that ends with '/', because
# the other cells build paths by string concatenation onto these values.

# Root folder on the mounted Drive where classification material is stored.
drive_classification_dir = '/content/gdrive/MyDrive/Classification_mirabilis/'

# Folder holding all image datasets.
img_datasets_dir = drive_classification_dir + 'img_datasets/temperature/'

# The specific dataset to split below.
this_dataset_name = 'cheW_37' # 'img_dataset_curated' for initial pre-training SwinT; 'cheW_37' for later fine-tuning
this_dataset_dir = img_datasets_dir + this_dataset_name + '/'

# Get the list of class folders. os.listdir() returns every entry, including
# stray files and hidden/checkpoint entries, so keep only real directories;
# anything else would be treated as a class and break os.listdir(<class path>)
# later. Sorting makes the order deterministic (os.listdir order is arbitrary).
class_folders = sorted(
    entry
    for entry in os.listdir(this_dataset_dir)
    if os.path.isdir(os.path.join(this_dataset_dir, entry))
)
print(class_folders)

# Where the train/val/test folders will be written (a sibling of the dataset).
dataset_split_name = this_dataset_name + '_split'
dataset_split_dir = img_datasets_dir + dataset_split_name + '/'

In [None]:
# First, report how many entries each class folder contains, so class
# imbalance is visible before the split is performed.
for cls in class_folders:
    cls_path = os.path.join(this_dataset_dir, cls)
    num_imgs = len(os.listdir(cls_path))
    print(f"{cls}: {num_imgs} images total")

In [None]:
# Perform a randomized, per-class train/val/test split and copy the images to
#   <dataset_split_dir>/{train,val,test}/<class>/
#
# Splitting is done class by class, so every class is divided with the same
# proportions (stratified split).
train_ratio = 0.8
val_test_ratio = 0.1  # fraction used for validation and, separately, for test

# Fixed seed (together with the sorting below) makes the split reproducible:
# re-running the cell on the same data yields exactly the same subsets.
split_seed = 42
rng = np.random.default_rng(split_seed)

split_names = ('train', 'val', 'test')

# train + val + test must cover the whole dataset; otherwise the test subset
# (which receives the remainder) would silently get a share different from
# val_test_ratio.
if not np.isclose(train_ratio + 2 * val_test_ratio, 1.0):
    raise ValueError(
        f"train_ratio + 2 * val_test_ratio must equal 1 "
        f"(got {train_ratio} + 2 * {val_test_ratio})"
    )

# Only real directories are classes; stray files next to the class folders
# are ignored. Sorted so classes are always processed in the same order.
class_dirs = sorted(
    name
    for name in class_folders
    if os.path.isdir(os.path.join(this_dataset_dir, name))
)

# Refuse to run BEFORE copying anything if a target folder already exists.
# Copying a fresh random split on top of an older one could put the same image
# into more than one subset (train/test leakage), and failing midway would
# leave a partial split behind.
existing_targets = [
    os.path.join(dataset_split_dir, split, name)
    for name in class_dirs
    for split in split_names
    if os.path.exists(os.path.join(dataset_split_dir, split, name))
]
if existing_targets:
    raise FileExistsError(
        "Split folders already exist (delete them before re-running): "
        + ", ".join(existing_targets)
    )

for cls in class_dirs:

    # Folder to copy images from.
    cls_path = os.path.join(this_dataset_dir, cls)

    # Regular files only (sub-folders cannot be copied with shutil.copy),
    # sorted so the seeded shuffle starts from a deterministic order.
    with os.scandir(cls_path) as entries:
        allFileNames = sorted(entry.name for entry in entries if entry.is_file())

    # Random shuffle (in place).
    rng.shuffle(allFileNames)

    # Split boundaries: [0, train_end) -> train, [train_end, val_end) -> val,
    # [val_end, n) -> test.
    num_files = len(allFileNames)
    train_end = int(num_files * train_ratio)
    val_end = int(num_files * (train_ratio + val_test_ratio))

    train_FileNames = [os.path.join(cls_path, name) for name in allFileNames[:train_end]]
    val_FileNames = [os.path.join(cls_path, name) for name in allFileNames[train_end:val_end]]
    test_FileNames = [os.path.join(cls_path, name) for name in allFileNames[val_end:]]

    print(f"{cls}")
    print(f"Total images: {num_files}")
    print(f"Training: {len(train_FileNames)}")
    print(f"Validation: {len(val_FileNames)}")
    print(f"Testing: {len(test_FileNames)}")

    # Create the train/val/test folder for this class and copy its images.
    for split, file_paths in zip(split_names,
                                 (train_FileNames, val_FileNames, test_FileNames)):
        target_dir = os.path.join(dataset_split_dir, split, cls)
        os.makedirs(target_dir)
        for file_path in file_paths:
            shutil.copy(file_path, target_dir)

