# Prepare Data

In [1]:
# Root directory of the sprite dataset; per-type sub-directories are resolved
# relative to it.
SPRITE_DIRECTORY = "sprites"

## Counting the dataset

In [2]:
import os

# One sub-directory per sprite type, each located directly under the sprite
# root directory.
FIRE_DIRECTORY, GRASS_DIRECTORY, WATER_DIRECTORY = (
    os.path.join(SPRITE_DIRECTORY, type_name)
    for type_name in ('fire', 'grass', 'water')
)
def get_file_count_in(directory):
    """Return the number of regular files directly inside ``directory``.

    Sub-directories are not counted and the directory is not searched
    recursively. Symbolic links that point to files are counted.

    Args:
        directory: Path of the directory to inspect.

    Returns:
        The number of file entries in ``directory``.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        NotADirectoryError: If ``directory`` is not a directory.
    """
    # The context manager closes the directory handle as soon as counting ends.
    with os.scandir(directory) as entries:
        return sum(1 for entry in entries if entry.is_file())
# Count the sprites of every type; map preserves the fire/grass/water order.
FIRE_SPRITE_COUNT, GRASS_SPRITE_COUNT, WATER_SPRITE_COUNT = map(
    get_file_count_in,
    (FIRE_DIRECTORY, GRASS_DIRECTORY, WATER_DIRECTORY),
)

# Report the size of each class so they can be compared.
print(f"Number of fire sprites: {FIRE_SPRITE_COUNT}")
print(f"Number of grass sprites: {GRASS_SPRITE_COUNT}")
print(f"Number of water sprites: {WATER_SPRITE_COUNT}")

Number of fire sprites: 64
Number of grass sprites: 97
Number of water sprites: 131


Fire is the smallest class (64 sprites, compared with 97 grass and 131 water sprites), so I will use only **grass** and **water** for classification. The fire sprites are therefore removed in the next cell; note that this permanently deletes them from disk.

In [3]:
import os
import shutil

# WARNING: this permanently deletes the fire sprites from disk.
# The existence check keeps the cell re-runnable: once the directory is gone,
# an unguarded rmtree would raise FileNotFoundError. Other failures (for
# example missing permissions) are still reported.
if os.path.isdir(FIRE_DIRECTORY):
    shutil.rmtree(FIRE_DIRECTORY)

## Splitting dataset

In [4]:
import os
import random

def create_directories_if_necessary(file):
    """Ensure that the parent directory of ``file`` exists.

    Missing intermediate directories are created as well. The file itself is
    neither created nor touched.

    Args:
        file: Path of a file; only its directory part is used.

    Raises:
        FileExistsError: If the directory part of ``file`` exists but is not
            a directory.
    """
    directory = os.path.dirname(file)
    # A bare file name has no directory part, and os.makedirs('') would raise,
    # so there is nothing to create in that case.
    if directory:
        # exist_ok makes the call safe to repeat and avoids the race between
        # checking for the directory and creating it.
        os.makedirs(directory, exist_ok=True)

def move_file(source_directory, target_directory, file_name):
    """Move ``file_name`` from ``source_directory`` into ``target_directory``.

    The file keeps its name. The target directory is created beforehand when
    it does not exist yet.
    """
    origin, destination = (
        os.path.join(directory, file_name)
        for directory in (source_directory, target_directory)
    )
    create_directories_if_necessary(destination)
    os.rename(origin, destination)

def split_files_into(
    source_directory,
    first_target_directory,
    second_target_directory,
    first_target_percentage,
    seed=None
):
    """Randomly distribute the entries of a directory over two directories.

    Every entry of ``source_directory`` is moved: a random selection goes to
    ``first_target_directory`` and everything else goes to
    ``second_target_directory``.

    Args:
        source_directory: Directory whose entries are moved.
        first_target_directory: Receives the random selection.
        second_target_directory: Receives all remaining entries.
        first_target_percentage: Share of the entries that goes to
            ``first_target_directory``, given as a fraction between 0 and 1
            (0.1 means 10 %). The resulting count is rounded with ``round``.
        seed: Optional seed for the random selection. Pass a fixed value to
            make the split reproducible; with ``None`` (the default) the
            selection differs from run to run.

    Raises:
        ValueError: If ``first_target_percentage`` is not within ``[0, 1]``.
    """
    # Reject invalid fractions before any file is moved; a value above 1 would
    # otherwise fail half-way through with the dataset partially moved.
    if not 0 <= first_target_percentage <= 1:
        raise ValueError(
            "first_target_percentage must be a fraction between 0 and 1, "
            f"got {first_target_percentage!r}"
        )

    # os.listdir returns entries in arbitrary order; sorting makes a seeded
    # selection independent of that order.
    file_names = sorted(os.listdir(source_directory))
    file_count = len(file_names)
    first_target_count = round(file_count * first_target_percentage)

    # A single sample draws without replacement and avoids the quadratic
    # choice-then-remove loop.
    first_target_names = set(
        random.Random(seed).sample(file_names, first_target_count)
    )

    print(f"Moving {first_target_count} files to {first_target_directory}")
    for file_name in file_names:
        if file_name in first_target_names:
            move_file(source_directory, first_target_directory, file_name)

    second_target_count = file_count - first_target_count
    print(f"Moving {second_target_count} files to {second_target_directory}")
    for file_name in file_names:
        if file_name not in first_target_names:
            move_file(source_directory, second_target_directory, file_name)

def split_into_training_and_test(data_directory, test_percentage):
    """Split every class directory into a test and a training set.

    Each sub-directory of ``data_directory`` is treated as one class. Its
    entries are moved to ``<data_directory>/test/<class>`` and
    ``<data_directory>/training/<class>``, and the emptied class directory is
    removed afterwards.

    Args:
        data_directory: Directory that contains one sub-directory per class.
        test_percentage: Share of each class that goes to the test set, given
            as a fraction between 0 and 1 (0.1 means 10 %).

    Raises:
        FileNotFoundError: If ``data_directory`` does not exist.
    """
    test_name = "test"
    training_name = "training"

    # The output directories live next to the class directories. Skipping
    # them prevents a repeated run from treating earlier results as classes.
    # The names are collected up front so directories created during the
    # split cannot influence the listing.
    with os.scandir(data_directory) as entries:
        class_names = sorted(
            entry.name
            for entry in entries
            if entry.is_dir() and entry.name not in (test_name, training_name)
        )

    for class_name in class_names:
        class_path = os.path.join(data_directory, class_name)
        test_path = os.path.join(data_directory, test_name, class_name)
        training_path = os.path.join(data_directory, training_name, class_name)
        split_files_into(
            class_path,
            test_path,
            training_path,
            test_percentage
        )
        # Everything has been moved out, so the class directory is empty now.
        os.rmdir(class_path)

# Hold out 10 % of every class for testing; the rest becomes training data.
split_into_training_and_test(
    data_directory=SPRITE_DIRECTORY,
    test_percentage=0.1,
)

Moving 10 files to sprites\test\grass
Moving 87 files to sprites\training\grass
Moving 13 files to sprites\test\water
Moving 118 files to sprites\training\water
