# [Galaxy Zoo - The Galaxy Challenge](https://www.kaggle.com/c/galaxy-zoo-the-galaxy-challenge)

Important note: Images provided by Kaggle must be unpacked in a folder called ``./data/images_training_rev1`` (at the root of the project).

Now we'll proceed to create an 80/20 split of the dataset. To do this, we'll randomly select 80% of the images and place them in a directory; any of these images may be used for training later on. The remaining 20% of the images will be placed in a different directory and will be used to validate our model.

In [5]:
import glob
import math
import random
import os
import shutil

# Locations of the original Kaggle images and of the derived dataset folders
data_dir = r'../data'
original_data_dir = '/'.join((data_dir, 'images_training_rev1'))
training_dir, validation_dir, test_dir = (
    '/'.join((data_dir, name)) for name in ('training', 'validation', 'test')
)

def load_img_paths():
    '''
    Collect the path of every ``.jpg`` file found directly inside ``original_data_dir``

    Returns the list produced by ``glob.glob``; it is empty when nothing matches.
    '''
    jpg_pattern = original_data_dir + '/*.jpg'
    matched_paths = glob.glob(jpg_pattern)
    return matched_paths

def create_temp_datasets(img_paths, train_size=4, validation_size=2, test_size=4):
    '''
    Randomly select images to be in the training, validation, and test datasets

    The selected images are copied (the originals are left in place) into
    ``training_dir``, ``validation_dir`` and ``test_dir``, which are created when
    they do not exist yet. The three selections are disjoint slices of one
    shuffled copy of ``img_paths``.

    Parameters:
        img_paths: list of image file paths to choose from; it is not modified
        train_size: number of images copied into the training directory
        validation_size: number of images copied into the validation directory
        test_size: number of images copied into the test directory

    Raises:
        AssertionError: if ``img_paths`` is empty, if any size is not positive, or
            if fewer images are available than the three sizes add up to
    '''
    requested = train_size + validation_size + test_size
    assert len(img_paths) > 0, 'img_paths must not be empty'
    assert train_size > 0, 'train_size must be positive'
    assert validation_size > 0, 'validation_size must be positive'
    assert test_size > 0, 'test_size must be positive'
    assert len(img_paths) >= requested, (
        'need at least %d images, got %d' % (requested, len(img_paths)))

    # Shuffle a copy so the caller's list keeps its original order
    shuffled = list(img_paths)
    random.shuffle(shuffled)

    # Consecutive, non-overlapping slices of the shuffled list
    validation_end = train_size + validation_size
    datasets = (
        (training_dir, shuffled[:train_size]),
        (validation_dir, shuffled[train_size:validation_end]),
        (test_dir, shuffled[validation_end:requested]),
    )

    # Create training, validation, and test directories
    for target_dir, _ in datasets:
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    # Place training, validation, and test images in their respective directories
    for target_dir, selected_paths in datasets:
        for src in selected_paths:
            # Build the destination from the file name rather than by string
            # replacement on the source path: a replace that matches nothing
            # would make the destination equal to the source.
            dst = os.path.join(target_dir, os.path.basename(src))
            shutil.copyfile(src, dst)
        
def remove_temp_datasets():
    '''
    Remove the directories that were temporarily created to hold the training,
    validation, and test datasets

    Each of ``training_dir``, ``validation_dir`` and ``test_dir`` is deleted
    together with its contents. A directory that does not exist is skipped, so
    the function can be called repeatedly or before the datasets were created.
    '''
    for temp_dir in (training_dir, validation_dir, test_dir):
        # rmtree raises on a missing path; checking first keeps one absent
        # directory from preventing the removal of the others
        if os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir)

Let's go ahead and execute the helper functions defined above:

In [6]:
# Number of images to place in each of the three datasets
train_size = 4000
validation_size = 500
test_size = 4000

# Build the datasets only when the original image directory yielded images;
# otherwise explain the likely causes
img_paths = load_img_paths()
if not img_paths:
    msg = """
        No images were found in the '%s' directory. Either training, validation and test
        directories have already been created, or the datasets structure is not correctly setup.
    """ % original_data_dir
    print(msg)
else:
    create_temp_datasets(
        img_paths,
        train_size=train_size,
        validation_size=validation_size,
        test_size=test_size,
    )