Assumption: the images in the male and female folders are in no particular order.

In [1]:
import pandas as pd
import dlib
from tqdm import tqdm
import os
import itertools
import shutil

In [2]:
k = 5     # Number of folds

# Folder that holds the per-gender image folders. It defaults to the original
# machine-specific location; set the CV_DATA_DIR environment variable to point
# the notebook at another copy of the data without editing this cell.
data_dir = os.environ.get(
    "CV_DATA_DIR",
    '/home/monash/Desktop/fyp-work/fyp-ma-13/fyp-models/preprocessing/combined_filtered/full',
)
male_fn = os.path.join(data_dir, 'male')       # Folder of male images
female_fn = os.path.join(data_dir, 'female')   # Folder of female images

In [3]:
def split_into_batches(k, fn, target_size = 19354):
    """
    Splits the entries of a folder into k equally sized batches. Any
    remaining images (those that do not fit an even split) are ignored.

    k : int
        Number of batches (folds) to create. Must be at least 1.
    fn : str
        Path of the folder whose entries are split.
    target_size : int
        Maximum number of entries to use. A longer listing is truncated to
        its first target_size entries before it is split.

    Returns a list of k lists of file names, each holding len(folder)//k
    names. Raises ValueError when k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))

    # NOTE(review): os.listdir returns entries in arbitrary order, so the
    # contents of each batch depend on the file system's listing order.
    folder = os.listdir(fn)
    # Cap the dataset at target_size entries. Per the original author's note
    # this only trims the female folder; remove once that no longer applies.
    if len(folder) > target_size:
        folder = folder[:target_size]
    batch_size = len(folder)//k
    splits = [folder[i*batch_size:(i+1)*batch_size] for i in range(k)]
    print("Folder length: ", len(folder))
    print("Batch size: ", batch_size)
    # Entries left out of every batch. Computed by subtraction so it stays
    # correct (and does not divide by zero) when batch_size is 0 or small.
    print("Remaining: ", len(folder) - k*batch_size)
    return splits

In [4]:
# Split each gender's folder into k folds; each call prints its own size report
print("Male dataset:", "--------------", sep="\n")
male_splits = split_into_batches(k, male_fn)
print()
print("Female dataset:", "--------------", sep="\n")
female_splits = split_into_batches(k, female_fn)

Male dataset:
--------------
Folder length:  19354
Batch size:  3870
Remaining:  4

Female dataset:
--------------
Folder length:  19354
Batch size:  3870
Remaining:  4


In [5]:
def chain(lst):
    """
    Flattens a list of lists by one level.

    lst: list
        List of lists

    Returns a new list with the elements of every inner list, in order.
    """
    return [item for inner in lst for item in inner]

def cross_validation(k, splits):
    """
    Builds every train / val-test partition obtained by holding out two folds.

    Each pair of distinct folds is held out once as the combined val/test
    portion, while the remaining folds are merged into the training portion.

    k : int
        Number of k-folds
    splits: list
        Filenames split into k-folds

    Returns a list with one [train filenames, val/test filenames] entry per
    held-out pair, ordered by the first and then the second held-out index.
    """
    cv_sets = []
    # first < second are the indices of the two held-out val/test folds
    for first, second in itertools.combinations(range(k), 2):
        held_out = [splits[first], splits[second]]
        kept = [fold for idx, fold in enumerate(splits)
                if idx != first and idx != second]
        cv_sets.append([chain(kept), chain(held_out)])
    return cv_sets

In [6]:
# One list of [train, val/test] partitions per gender
male_cv_sets, female_cv_sets = (
    cross_validation(k, male_splits),
    cross_validation(k, female_splits),
)

In [7]:
def _copy_images(filenames, source_dir, target_dir, description):
    """Copies each named file from source_dir into target_dir, creating target_dir if needed."""
    os.makedirs(target_dir, exist_ok=True)
    for image_fn in tqdm(filenames, description):
        shutil.copyfile(os.path.join(source_dir, image_fn),
                        os.path.join(target_dir, image_fn))


def save_all_sets(cv_sets, original, target, gender):
    """
    Takes a list of CV sets, splits and saves them into folder accordingly

    For every CV set the training images are copied to
    {target}/{gender}/set{no.}/train/, and the val/test images are divided
    into val and test portions that are copied to .../val/ and .../test/.
    Destination folders are created when they do not exist yet.

    cv_sets : list
        List of CV sets, each of the form [training images, val/test images]
    original : str
        Original file path name (folder that holds the source images)
    target : str
        Target file path name (with or without a trailing separator)
    gender : str
        Gender
    """
    gender_dir = os.path.join(target, gender)  # Becomes cv_datasets/{gender}

    for set_no, cv_set in enumerate(tqdm(cv_sets, "Loading sets..."), start=1):
        # Becomes cv_datasets/{gender}/set{no.}
        set_dir = os.path.join(gender_dir, "set" + str(set_no))
        train, valTest = cv_set[0], cv_set[1]

        # Save training images into cv_datasets/{gender}/set{no.}/train/...
        _copy_images(train, original, os.path.join(set_dir, "train"),
                     "Loading {} train...".format(gender))

        # Split into val (30% of original total) and test (10% of original
        # total): the first three quarters of valTest become val and the
        # fourth quarter becomes test. Up to three leftover images that do not
        # fill a whole quarter are dropped.
        quarter = len(valTest)//4
        val = valTest[:3*quarter]
        test = valTest[3*quarter:4*quarter]

        # Save val images into cv_datasets/{gender}/set{no.}/val/...
        _copy_images(val, original, os.path.join(set_dir, "val"),
                     "Loading {} val...".format(gender))

        # Save test images into cv_datasets/{gender}/set{no.}/test/...
        _copy_images(test, original, os.path.join(set_dir, "test"),
                     "Loading {} test...".format(gender))

### NOTE: FOLDERS WERE CREATED MANUALLY

In [8]:
# UNCOMMENT TO GENERATE MALE DATASETS
#target = r'cv_datasets/'
#save_all_sets(male_cv_sets, male_fn, target, "male")

In [9]:
# UNCOMMENT TO GENERATE FEMALE DATASETS
# target = r'cv_datasets/'
# save_all_sets(female_cv_sets, female_fn, target, "female")

Loading sets...:   0%|          | 0/10 [00:00<?, ?it/s]
Loading female train...:   0%|          | 0/11610 [00:00<?, ?it/s][A
Loading female train...:   5%|▌         | 625/11610 [00:00<00:01, 6249.25it/s][A
Loading female train...:  11%|█         | 1230/11610 [00:00<00:01, 6186.28it/s][A
Loading female train...:  16%|█▌        | 1850/11610 [00:00<00:01, 6189.97it/s][A
Loading female train...:  21%|██▏       | 2480/11610 [00:00<00:01, 6221.63it/s][A
Loading female train...:  27%|██▋       | 3100/11610 [00:00<00:01, 6214.52it/s][A
Loading female train...:  32%|███▏      | 3709/11610 [00:00<00:01, 6175.26it/s][A
Loading female train...:  37%|███▋      | 4305/11610 [00:00<00:01, 6107.99it/s][A
Loading female train...:  42%|████▏     | 4916/11610 [00:00<00:01, 6106.17it/s][A
Loading female train...:  47%|████▋     | 5506/11610 [00:00<00:01, 6040.14it/s][A
Loading female train...:  53%|█████▎    | 6128/11610 [00:01<00:00, 6092.19it/s][A
Loading female train...:  58%|█████▊    | 676