In [1]:
import os
import re
import gc

import cv2
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

# Work from the cropped-image directory; the cells below use paths relative to
# it (e.g. "../labels" and "../hdf5").
# NOTE: the path is relative, so re-running this cell in the same kernel
# resolves it against the already-changed working directory.
os.chdir("../data/cropped")

In [2]:
# Notebook-wide configuration read by the cells below.

# Passed as random_state to both train_test_split and StratifiedKFold.
seed = 1 # Controls the randomness of set-splitting below.

# Number of columns in the label matrix y.
num_classes = 2 # The classes are "0" ... "num_classes - 1".
# Per-example dimensions used to allocate the example array X.
image_width = 128
image_height = 128
num_channels = 3 # RGB color images have three channels.

# Fraction of the examples held out as the test set (test_size of train_test_split).
test_prop = 0.1
num_splits = 3 # The "K" in K-fold cross validation.

In [3]:
# Clean all unwanted, hidden system files in the working directory and in each
# of its sub-directories. In OS X, one such file is ".DS_Store".
# Such files can interfere with the data processing below, which treats every
# entry of the working directory as an image directory and every entry inside
# one as an example.

unwanted_files = [".DS_Store"]

# Pass 1: the working directory itself (../data/cropped).
for file in unwanted_files:
    try:
        os.remove(file)
    except FileNotFoundError:
        print(f"No {file} in ../data/cropped.")
    except OSError as error:
        # Stay best-effort (e.g. on a permission problem), but say what happened
        # instead of claiming the file does not exist.
        print(f"Could not remove {file} in ../data/cropped: {error}")
    else:
        # Report success only after the removal actually succeeded.
        print(f"Removed {file} in ../data/cropped.")

# Pass 2: every entry of the working directory.
for file in unwanted_files:
    for directory in os.listdir():
        try:
            os.remove(os.path.join(directory, file))
        except (FileNotFoundError, NotADirectoryError):
            # NotADirectoryError: the entry is a plain file, so it cannot
            # contain the unwanted file either.
            print(f"No {file} in ../data/{directory}.")
        except OSError as error:
            print(f"Could not remove {file} in {directory}: {error}")
        else:
            print(f"Removed {file} in {directory}")

No .DS_Store in ../data/cropped.
No .DS_Store in ../data/cropped_2018-06-15.
No .DS_Store in ../data/cropped_2018-06-14.


In [4]:
# Verify that each file in every (sub)directory is an image file.
# A file counts as an image only if its name ends in ".jpg"; a plain suffix
# check avoids the unanchored regex wildcard match (e.g. "xjpg.txt").
for directory in os.listdir():
    total_non_image_files = 0
    for file in os.listdir(directory):
        if not file.endswith(".jpg"):
            total_non_image_files += 1
            print(f"Non-image file exists: {file}")
    print(f"Non-jpeg files in ../data/{directory}: {total_non_image_files}.")
print()

# Find total number of examples, "num_examples".
# Every directory entry is counted; the cell that builds X and y relies on this
# value to size its arrays.
num_examples = sum(len(os.listdir(directory)) for directory in os.listdir())
print(f"There are {num_examples} examples.",
      end="\n\n")

# Verify that each example has a label.
# The label file of a directory is named after the directory name with its
# first eight characters dropped, and is looked up in ../labels.
num_errors = 0
for directory in os.listdir():
    labels = pd.read_csv(f"../labels/{directory[8:]}.csv")
    # Build the lookup once per directory instead of scanning the column for
    # every picture.
    labelled_ids = set(labels["date_id"])
    for pict in os.listdir(directory):
        # An example's id is its file name without the extension.
        if os.path.splitext(pict)[0] not in labelled_ids:
            num_errors += 1  # Without this the summary below always reports 0.
            print(f"{pict} is without a label.")
print(f"There are {num_errors} unlabelled examples.")

Non-jpeg files in ../data/cropped_2018-06-15: 0.
Non-jpeg files in ../data/cropped_2018-06-14: 0.

There are 4750 examples.

There are 0 unlabelled examples.


In [5]:
# Create example and label matrices.
#
# X holds one float64 image per example and y holds one row per example with a
# 1 in the column of its class. An example whose label is missing or invalid is
# reported and keeps an all-zero row in y.

print("Filling in X and y...",
      end="")
y = np.zeros(shape=(num_examples,
                    num_classes))
# cv2.imread returns arrays laid out as (height, width, channels), so the
# buffer is allocated in that order (identical while width == height).
X = np.empty(shape=(num_examples,
                    image_height,
                    image_width,
                    num_channels))
index = 0
# os.listdir returns entries in arbitrary order; sorting fixes the example
# order so that the seeded splits made later are reproducible.
for directory in sorted(os.listdir()):
    labels = pd.read_csv(f"../labels/{directory[8:]}.csv")
    for pict in sorted(os.listdir(directory)):
        image_path = f"{directory}/{pict}"
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None rather
            # than raising; fail with a message that names the file.
            raise OSError(f"Could not read image {image_path}.")
        X[index] = image.astype("float64") # Note X[index] is BGR, not RGB.

        matches = labels.loc[labels["date_id"] == os.path.splitext(pict)[0], "label"]
        raw_label = matches.iloc[0] if len(matches) else None
        try:
            class_index = int(raw_label)
            # Reject negatives explicitly: they would otherwise index y from
            # the end instead of being reported.
            if not 0 <= class_index < num_classes:
                raise ValueError(f"label {class_index} is out of range")
            y[index][class_index] = 1
        except (TypeError, ValueError, OverflowError):
            print()
            print(f"Invalid label for {pict}, label: {raw_label}.")
        index += 1
print("done.")

Filling in X and y...done.


In [6]:
# Hold out the test set, standardize it with the statistics of the remaining
# (training) examples, and write it to disk.

print("Splitting data...", end="")
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=test_prop, random_state=seed
)
# The full arrays are no longer needed once the split copies exist.
del X, y
gc.collect()
print("done.", end="\n\n")

print("Perform mean-subtraction and normalization of test set...", end="")
# Scalar statistics taken over every value of the training examples.
train_mean = train_X.mean()
train_sd = train_X.std()
test_X = (test_X - train_mean) / train_sd
print("done.", end="\n\n")

print("Saving test set...", end="")
base_path = "../hdf5"
with h5py.File(f"{base_path}/test_set.hdf5", "w") as test_file:
    # Each object is released as soon as it has been written.
    test_file.create_dataset("test_X", data=test_X)
    del test_X
    gc.collect()

    test_file.create_dataset("test_y", data=test_y)
    del test_y
    gc.collect()

    # The statistics are stored alongside the data as a [mean, sd] pair.
    test_file.create_dataset("train_mean_sd", data=np.array([train_mean, train_sd]))
    del train_mean, train_sd
    gc.collect()
print("done.")

Splitting data...done.

Perform mean-subtraction and normalization of test set...done.

Saving test set...done.


In [7]:
# Obtain, perform mean-subtraction and normalization on,
# and save all train and validation sets.
#
# One HDF5 file is written per fold. Each fold is standardized with the scalar
# mean and standard deviation of that fold's own training part, and those two
# values are stored next to the data as "train_mean_sd".

print("Saving training and validation sets...",
      end="\n\n")

get_skf_indices = StratifiedKFold(n_splits=num_splits,
                                  shuffle=True,
                                  random_state=seed)

# Stratify on the class index of each row rather than on column 0 of y, which
# only distinguishes "class 0" from "everything else" and so stops describing
# the classes once num_classes > 2. For two classes the strata are the same.
# NOTE: a row left all-zero by an invalid label falls into stratum 0.
class_indices = train_y.argmax(axis=1)

for curr_split_num, (train_indices, validation_indices) in enumerate(
        get_skf_indices.split(train_X, class_indices), start=1):
    print(f"Saving split {curr_split_num}.")
    with h5py.File(f"{base_path}/train_validation_set_{curr_split_num}.hdf5", "w") as hf:
        # Indexing with an index array yields a copy, so it can be normalized
        # in place without touching train_X and without allocating further
        # full-size temporaries. Assumes train_X is a floating-point array.
        curr_train_X = train_X[train_indices]
        curr_train_mean, curr_train_sd = curr_train_X.mean(), curr_train_X.std()
        curr_train_X -= curr_train_mean
        curr_train_X /= curr_train_sd
        hf.create_dataset("train_X",
                          data=curr_train_X)
        del curr_train_X
        gc.collect()

        hf.create_dataset("train_y",
                          data=train_y[train_indices])

        # The validation part is scaled with the training statistics.
        curr_validation_X = train_X[validation_indices]
        curr_validation_X -= curr_train_mean
        curr_validation_X /= curr_train_sd
        hf.create_dataset("validation_X",
                          data=curr_validation_X)
        del curr_validation_X
        gc.collect()

        hf.create_dataset("validation_y",
                          data=train_y[validation_indices])
        hf.create_dataset("train_mean_sd",
                          data=np.array([curr_train_mean, curr_train_sd]))
    if curr_split_num < num_splits:
        # Manual pause between folds; this blocks unattended execution.
        input("Press enter when sufficient memory has been freed to proceed. ")
    else:
        print()

del train_X, train_y
gc.collect()
print("...done.")

Saving training and validation sets...

Saving split 1.
Press enter when sufficient memory has been freed to proceed. 
Saving split 2.
Press enter when sufficient memory has been freed to proceed. 
Saving split 3.

...done.
