In [1]:
'''
The goal of this file is to import all of the samples and split them into
training, testing, and validation sets with an 80:10:10 split.

We export these splits because we want them to be consistent across all
training and testing.

We also create a combined training + validation set for cases where we want
the most training data possible.
'''
import pandas as pd
from sklearn.model_selection import train_test_split
# seed handed to every train_test_split call so the shuffling is repeatable
RANDOM_SEED = 42


In [2]:
'''
load every row of the captions file into a single dataframe
'''
# location of the captions file, relative to the notebook's working directory
captions_path = 'data/flickr_8k/captions.txt'
all_samples = pd.read_csv(captions_path)

In [3]:
'''
get the filenames for the training, validation, and test sets

split the train/val/test 80/10/10: 20% of the filenames are held out first,
and that held-out portion is then halved into the test and validation sets
'''
# Get all of the unique filenames from the dataset, in sorted order.
# The iteration order of a set of strings depends on Python's per-process hash
# seed, so list(set(...)) can come back in a different order after every kernel
# restart. train_test_split would then shuffle a different starting list, and
# RANDOM_SEED alone would not make the split repeatable. Sorting fixes the order.
# NOTE: this changes the split relative to any files that were exported from the
# earlier, unsorted version of this cell.
# Assumes every row has a string image filename -- sorted() raises on mixed types.
ALL_FILENAMES = sorted(set(all_samples['image']))
# hold out 20% of the filenames, leaving 80% for training
TRAIN_FILENAMES, HELD_OUT_FILENAMES = train_test_split(
    ALL_FILENAMES, test_size=0.2, random_state=RANDOM_SEED
)
# halve the held-out filenames: 10% of the total for test, 10% for validation
TEST_FILENAMES, VALIDATION_FILENAMES = train_test_split(
    HELD_OUT_FILENAMES, test_size=0.5, random_state=RANDOM_SEED
)
# define a combined training plus validation set (both are lists, so + concatenates)
TRAIN_AND_VAL_FILENAMES = TRAIN_FILENAMES + VALIDATION_FILENAMES

In [4]:
'''
select the rows of the dataframe that belong to each of the different sets
'''
# the image filename of every row, used to test membership in each set
image_column = all_samples['image']
# keep only the rows whose image is in the matching filename list
train_samples = all_samples.loc[image_column.isin(TRAIN_FILENAMES)]
validation_samples = all_samples.loc[image_column.isin(VALIDATION_FILENAMES)]
test_samples = all_samples.loc[image_column.isin(TEST_FILENAMES)]
train_and_val_samples = all_samples.loc[image_column.isin(TRAIN_AND_VAL_FILENAMES)]

In [5]:
'''
export the splits to files so that they stay consistent no matter what

the export calls are commented out so that running this notebook again does
not write the split files again
'''
# Each call would write one split to its own CSV without the index column.
# Uncomment only when the split files are meant to be regenerated.
# train_samples.to_csv('data/flickr_8k/train.csv', index=False)
# validation_samples.to_csv('data/flickr_8k/validation.csv', index=False)
# test_samples.to_csv('data/flickr_8k/test.csv', index=False)
# train_and_val_samples.to_csv('data/flickr_8k/train_and_val.csv', index=False)