<a href="https://colab.research.google.com/github/emcdona1/fmnh_scripts/blob/main/Test_train_split_an_image_set_with_metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from pathlib import Path
import shutil
import time
from random import randint

**In the cell below, adjust the values as necessary.**

`SPLIT` is the test split.

`image_dir` is the directory of the images to be split.  A new folder with the same name + `_split` will be generated in the same parent directory.


In [2]:
SPLIT = 0.1
image_dir = Path('./drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/' +\
                'image_sets/IAM_words')

In [3]:
copy_image_dir = Path(image_dir.parent, f'{image_dir.name}_split')
train_dir = Path(copy_image_dir, 'train')
test_dir = Path(copy_image_dir, 'test')
try:
    os.makedirs(train_dir)
    os.makedirs(test_dir)
except FileExistsError as e:
    print('WARNING: These directories already exist!')
    print(e)

[Errno 17] File exists: 'drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/image_sets/IAM_words_split/train'


## 1. Even sampling (one directory of images with a metadata file)

Only CSV and Markdown files will be copied to both train and test directories.

All other files will be evenly distributed (e.g. every 10th file for 10% train/test split)

In [None]:
images = os.listdir(image_dir)
metadata = [f for f in images if Path(f).suffix == '.csv' or Path(f).suffix == '.md']
for m in metadata:
  images.remove(m)

test_set_size = int(len(images) * SPLIT)
test_idx = sorted([int(i * (1 // SPLIT)) for i in list(range(test_set_size))], reverse=True)

test_images = list()
for idx, img in enumerate(test_idx):
    test_images.append(images.pop(idx))
train_images = images

In [None]:
for img in train_images:
    shutil.copyfile(Path(image_dir, img), Path(train_dir, img))
for img in test_images:
    shutil.copyfile(Path(image_dir, img), Path(test_dir, img))
for file in metadata:
    shutil.copyfile(Path(image_dir, file), Path(train_dir, file))
    shutil.copyfile(Path(image_dir, file), Path(test_dir, file))

## 2. Stratified sampling (multiple sub directories of images)

Any files in the root folder will be copied to BOTH train and test folders (e.g. metadata, readmes, any data preprocessing)

In [4]:
for parent, sub_folders, files in os.walk(image_dir):
    if sub_folders and os.path.exists(Path(parent, sub_folders[0])):
        # checks if the code has already been executed at this directory level -- and if so, it skips to the next one
        print(f'already processed: {parent}')
        pass
    else:
        if Path(parent).relative_to(image_dir) == Path('.'):
            for f in files:
                src = str(Path(parent, f)).replace(' ', r'\ ')
                dest = str(Path(train_dir, f)).replace(' ', r'\ ')
                !cp $src $dest -u
                dest = str(Path(test_dir, f)).replace(' ', r'\ ')
                !cp $src $dest -u
        else:
            test_image_indicies = sorted(list(range(int(len(files) * SPLIT))), reverse=True)
            test_image_indicies = [int(i * (1 // SPLIT)) for i in test_image_indicies]
            test_images = [files[i] for i in test_image_indicies]
            subdir = Path(parent).relative_to(image_dir)
            for idx in test_image_indicies:
                files.pop(idx)
            train_images = files
            if train_images:  # and not os.path.exists(Path(train_dir, subdir, train_images[0])):
                for image in train_images:
                    src = str(Path(parent, image)).replace(' ', r'\ ')
                    dest = str(Path(train_dir, subdir, image)).replace(' ', r'\ ')
                    !cp $src $dest -u
                    # shutil.copy(Path(parent, image), Path(train_dir, subdir, image))
            if test_images and not os.path.exists(Path(train_dir, subdir, test_images[0])):
                for image in test_images:
                    src = str(Path(parent, image)).replace(' ', r'\ ')
                    dest = str(Path(test_dir, subdir, image)).replace(' ', r'\ ')
                    !cp $src $dest -u
                    # shutil.copy(Path(parent, image), Path(test_dir, subdir, image))
        for child_folder in sub_folders:
            new_folder = Path(parent, child_folder).relative_to(image_dir)
            try:
                os.makedirs(Path(train_dir, new_folder))
                os.makedirs(Path(test_dir, new_folder))
            except:
                pass

drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/image_sets/IAM_words
drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/image_sets/IAM_words/a02
drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/image_sets/IAM_words/a02/a02-004
drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/image_sets/IAM_words/a02/a02-000
drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/image_sets/IAM_words/a02/a02-020
drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting recognition project - Beth McDonald + Keshab Panthi/image_sets/IAM_words/a02/a02-012
drive/MyDrive/Machine Learning projects - Grainger-funded/3. Handwriting