# Create a machine learning dataset from image folder

Creates an image dataset suitable for machine learning. 
It expects as input a directory containing the images grouped in subfolders, one subfolder per category:

data/ <br>
&nbsp;&nbsp;&nbsp;category1/ <br>
&nbsp;&nbsp;&nbsp;category2/ <br>
&nbsp;&nbsp;&nbsp;category3/ <br>
&nbsp;&nbsp;&nbsp;.......... <br>

The script extracts the filenames and the categories from the directory and assigns a label to each image. 
After shuffling the data, it splits it into a train set and a test set; the parameter `TRAIN_TEST_SPLIT` sets the fraction of the data that goes into the test set.

The result will be 4 numpy arrays: <br>
&nbsp;&nbsp;&nbsp;train_data_files <br>
&nbsp;&nbsp;&nbsp;train_labels <br>
&nbsp;&nbsp;&nbsp;test_data_files <br>
&nbsp;&nbsp;&nbsp;test_labels <br>
which will be stored in the file specified by `data_file`. 

In [19]:
import tensorflow as tf
import os
import numpy as np 
from random import shuffle
import glob

# Input folder (one subfolder of images per category) and the .npz file
# that the resulting train/test arrays are written to.
directory, data_file = 'data/', 'data.npz'

# Fraction (between 0 and 1) of the data that will be held out as the test set
TRAIN_TEST_SPLIT = 0.2

In [None]:
# Get the labels from the directory: every immediate subfolder is one category.
# Only the top level is needed, so list it directly instead of walking the
# whole tree; os.listdir() also raises a descriptive FileNotFoundError when
# `directory` does not exist.
#
# Hidden folders are filtered out by name instead of blindly dropping the
# first sorted entry, which discarded a real category whenever no stray folder
# was present.
# NOTE(review): presumably the dropped entry was a hidden folder such as
# '.ipynb_checkpoints' -- confirm against the actual data directory.
labels = sorted(
    name
    for name in os.listdir(directory)
    if not name.startswith('.')
    and os.path.isdir(os.path.join(directory, name))
)

num_labels = len(labels)

# build dictionary for indexes: category name -> integer class id,
# following the sorted order of the category names
label_indexes = {name: index for index, name in enumerate(labels)}

label_indexes

In [16]:
# get the filepaths of every .jpg below `directory`
# os.path.join keeps the pattern valid whether or not `directory` ends with a
# path separator.
data_files = glob.glob(os.path.join(directory, '**', '*.jpg'), recursive=True)

# shuffle the data (in place)
shuffle(data_files)

num_data_files = len(data_files)

# build the labels
# The category is the first path component *below* `directory`, so this works
# for any depth of `directory` and for any OS path separator. A file whose
# first component is not a known category still raises KeyError.
data_labels = [
    label_indexes[os.path.relpath(path, directory).split(os.sep)[0]]
    for path in data_files
]

# just a check to see if everything is ok
# (slicing instead of range(10) so datasets with fewer than 10 images work)
for path, label in zip(data_files[:10], data_labels[:10]):
    print(path, label)

assert num_data_files == len(data_labels)

# convert the labels to one hot
# num_classes is pinned to the number of categories so the width of the
# one-hot matrix does not shrink when the highest-indexed categories happen
# to contain no .jpg files.
data_labels = np.array(data_labels)
data_labels_one_hot = tf.keras.utils.to_categorical(data_labels,
                                                    num_classes=num_labels)

data/u/u_292.jpg 21
data/b/b_108.jpg 2
data/u/u_672.jpg 21
data/i/i_670.jpg 9
data/r/r_733.jpg 18
data/q/q_692.jpg 17
data/l/l_612.jpg 12
data/u/u_417.jpg 21
data/x/x_282.jpg 24
data/a/a_338.jpg 1


In [17]:
# TRAIN/TEST split
# The data is already shuffled, so the first `nr_test_data` samples become the
# test set and everything after them the training set.
nr_test_data = int(num_data_files * TRAIN_TEST_SPLIT)

test_data_files, train_data_files = (data_files[:nr_test_data],
                                     data_files[nr_test_data:])
test_labels, train_labels = (data_labels_one_hot[:nr_test_data],
                             data_labels_one_hot[nr_test_data:])

# sanity check: the two parts together must cover every sample
assert len(train_labels) + len(test_labels) == num_data_files
assert len(test_data_files) + len(train_data_files) == num_data_files

In [18]:
# Write the four arrays into a single .npz archive at `data_file`;
# each dictionary key becomes the name of one array inside the archive.
np.savez(data_file, **{
    'train_data_files': train_data_files,
    'test_data_files': test_data_files,
    'train_labels': train_labels,
    'test_labels': test_labels,
})