# Imports

In [None]:
import matplotlib.pyplot as plt
import random
import os
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import utils, optimizers
from PIL import Image
import shutil
from tensorflow.keras import models, layers
from tensorflow.keras.applications import vgg16
from tensorflow.keras.callbacks import EarlyStopping

# Creating subdirectories for train/test split

In [None]:
root_dir = '../raw_data'
# Each pose is a sub-folder of the root directory (raw_data folder).
# Plain files sitting next to the pose folders are skipped because they cannot
# be listed as image folders later on; sorting gives a stable order across runs.
poses_list = sorted(
    entry
    for entry in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, entry))
)
poses_list

In [None]:
# Creates Testing and Training directories
def create_train_val_dirs(root_path):
    """Create empty ``Training/<pose>`` and ``Testing/<pose>`` folders.

    Every sub-directory of ``root_path`` is treated as a pose, except the
    ``Training`` and ``Testing`` folders themselves. Plain files in
    ``root_path`` are ignored.

    Args:
        root_path: Directory that contains one folder per pose.

    Raises:
        FileExistsError: If one of the target folders already exists.
        FileNotFoundError: If ``root_path`` does not exist.
    """
    # Discover the poses from root_path itself instead of relying on a
    # module-level list, so the function works for any directory it is given.
    for pose in sorted(os.listdir(root_path)):
        if pose in ("Training", "Testing"):
            continue
        if not os.path.isdir(os.path.join(root_path, pose)):
            continue
        # No exist_ok here: callers rely on FileExistsError to detect a
        # left-over split from a previous run.
        os.makedirs(os.path.join(root_path, 'Training', pose))
        os.makedirs(os.path.join(root_path, 'Testing', pose))
            
            
# Start from a clean slate: remove any Training/Testing trees left over from a
# previous run. They are populated by copying files out of the pose folders,
# and splitting again into folders that already hold images would mix the old
# and new random splits (the same image could end up in both sets).
for split_name in ("Training", "Testing"):
    stale_dir = os.path.join(root_dir, split_name)
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

# Creates the new, empty directories
try:
    create_train_val_dirs(root_path=root_dir)
except FileExistsError:
    print("You should not be seeing this since the upper directory is removed beforehand")
    # Stop here rather than carrying on with a half-prepared directory tree.
    raise

In [None]:
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE, seed=None):
    """Randomly split the readable images of one folder into two folders.

    Every entry of ``SOURCE_DIR`` is opened with ``cv2.imread``; entries it
    cannot decode (it returns ``None`` for them) are left out. The remaining
    file names are shuffled, the first ``int(SPLIT_SIZE * n)`` are copied to
    ``TRAINING_DIR`` and the rest to ``VALIDATION_DIR``. The source files are
    not modified.

    Args:
        SOURCE_DIR: Folder holding the original images.
        TRAINING_DIR: Existing folder that receives the first part of the split.
        VALIDATION_DIR: Existing folder that receives the remaining images.
        SPLIT_SIZE: Fraction (0-1) of the readable images copied to
            ``TRAINING_DIR``.
        seed: Optional seed for a reproducible split. When ``None`` (default)
            the module-level ``random`` generator is used.
    """
    # Single pass over a sorted listing: sorting makes the outcome depend only
    # on the seed, not on the order in which the OS returns directory entries.
    source_images = [
        image
        for image in sorted(os.listdir(SOURCE_DIR))
        if cv2.imread(os.path.join(SOURCE_DIR, image)) is not None
    ]

    # Randomising list
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(source_images)

    split_index = int(SPLIT_SIZE * len(source_images))
    destinations = (
        (TRAINING_DIR, source_images[:split_index]),
        (VALIDATION_DIR, source_images[split_index:]),
    )
    for target_dir, images in destinations:
        for image in images:
            shutil.copyfile(os.path.join(SOURCE_DIR, image), os.path.join(target_dir, image))
        

In [None]:
split_size = 0.7

# For every pose folder (the generated Training/Testing folders are skipped):
#   1. build the source/train/test paths,
#   2. split the pose's images between train and test,
#   3. print the image counts of all three folders as a sanity check.

for pose in poses_list:
    if pose in ("Training", "Testing"):
        continue

    SOURCE_DIR = f"{root_dir}/{pose}"
    TRAINING_DIR = f"{root_dir}/Training/{pose}"
    TESTING_DIR = f"{root_dir}/Testing/{pose}"

    split_data(SOURCE_DIR, TRAINING_DIR, TESTING_DIR, split_size)

    print(f"There are {len(os.listdir(TRAINING_DIR))} images for {pose} in training")
    print(f"There are {len(os.listdir(TESTING_DIR))} images for {pose} in testing")

    print(f"Original {pose}'s directory has {len(os.listdir(SOURCE_DIR))} images\n")