In [None]:
import os
import re
import gc

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import pickle_wrapper

# Every relative path used below ("../labels/...", "../pickles/...") is
# resolved against this working directory.
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel resolves it against the already-changed directory and will most
# likely fail; restart the kernel before re-running.
os.chdir("../data/cropped")

In [None]:
seed = 1 # Random state passed to both the test split and the train/validation splits below.

num_classes = 2 # The classes are "0" ... "num_classes - 1"; also the width of the one-hot label matrix y.
image_width = 128
image_height = 128
num_channels = 3 # Color images have three channels (cv2 loads them in BGR order).

# Proportions of the whole data set. The validation share is what remains,
# i.e. 1 - train_prop - test_prop.
train_prop = 0.8
test_prop = 0.1
num_splits = 1 # Number of stratified shuffle splits (n_splits) of the non-test data; these are re-samplings, not K-fold folds.

In [None]:
# Clean all unwanted, hidden system files in the working directory.
# In OS X, one such file is ".DS_Store".
# Such files can interfere with the data processing below, which treats
# every entry it lists as either an image directory or an image file.

unwanted_files = [".DS_Store"]

# Pass 1: copies sitting directly in the working directory (../data/cropped).
for file in unwanted_files:
    try:
        os.remove(file)
    except FileNotFoundError:
        print(f"No {file} in ../data/cropped.")
    except OSError as error:
        # Anything other than "not there" (e.g. permissions) is a real
        # problem: report it instead of claiming the file is absent.
        print(f"Could not remove {file} in ../data/cropped: {error}")
    else:
        print(f"Removed {file} in ../data/cropped")

# Pass 2: copies inside each image directory. Only real directories are
# visited, so a stray top-level file is never mistaken for one.
subdirectories = [entry for entry in os.listdir() if os.path.isdir(entry)]
for file in unwanted_files:
    for directory in subdirectories:
        try:
            os.remove(os.path.join(directory, file))
        except FileNotFoundError:
            print(f"No {file} in ../data/cropped/{directory}.")
        except OSError as error:
            print(f"Could not remove {file} in ../data/cropped/{directory}: {error}")
        else:
            print(f"Removed {file} in {directory}")

In [None]:
# Sanity-check the data set before loading it:
#   1. every file in every image directory is a ".jpg" file,
#   2. count the examples ("num_examples", used to size X and y below),
#   3. every example has a row in its directory's label file.

# List the image directories once. Non-directory entries are skipped because
# os.listdir() on them would raise.
image_dirs = [entry for entry in os.listdir() if os.path.isdir(entry)]

# Verify that each file in every (sub)directory is an image file.
for directory in image_dirs:
    total_non_image_files = 0
    for file in os.listdir(directory):
        # Escaped dot and end anchor: only a real ".jpg" extension passes
        # (the unanchored ".jpg" pattern also accepted e.g. "xjpg" or "a.jpg.bak").
        if not re.search(r"\.jpg$", file):
            total_non_image_files += 1
            print(f"Non-image file exists: {file}")
    print(f"Non-jpeg files in ../data/cropped/{directory}: {total_non_image_files}.")
print()

# Find total number of examples, "num_examples".
num_examples = sum(len(os.listdir(directory)) for directory in image_dirs)
print(f"There are {num_examples} examples.",
      end="\n\n")

# Verify that each example has a label.
num_errors = 0
for directory in image_dirs:
    # The label file is named after the directory minus its first 8 characters.
    # NOTE(review): presumably a fixed 8-character prefix on directory names; confirm.
    labels = pd.read_csv(f"../labels/{directory[8:]}.csv")
    labelled_ids = set(labels["date_id"])  # O(1) membership per picture.
    for pict in os.listdir(directory):
        # Drop the 4-character extension (".jpg") to get the example's id.
        if pict[:-4] not in labelled_ids:
            num_errors += 1  # Previously never incremented, so the total was always 0.
            print(f"{pict} is without a label.")
print(f"There are {num_errors} unlabelled examples.")

In [None]:
# Create example and label matrices.
#   X[i] is the i-th image as float64, in the BGR channel order cv2 returns.
#   y[i] is the one-hot label of X[i]; it stays all-zero when the label is
#   missing or invalid (a message is printed for each such example).

print("Filling in X and y...",
      end="")
y = np.zeros(shape=(num_examples,
                    num_classes))
# cv2.imread returns (height, width, channels), so height comes first.
X = np.empty(shape=(num_examples,
                    image_height,
                    image_width,
                    num_channels))
index = 0
for directory in (entry for entry in os.listdir() if os.path.isdir(entry)):
    # The label file is named after the directory minus its first 8 characters.
    labels = pd.read_csv(f"../labels/{directory[8:]}.csv")
    for pict in os.listdir(directory):
        image = cv2.imread(f"{directory}/{pict}")
        if image is None:
            # imread signals an unreadable/non-image file by returning None.
            raise ValueError(f"Could not read {directory}/{pict} as an image.")
        X[index] = image.astype("float64") # Note X[index] is BGR, not RGB.

        # Drop the 4-character extension (".jpg") to get the example's id.
        matches = labels.loc[labels["date_id"] == pict[:-4], "label"]
        raw_label = matches.iloc[0] if len(matches) else None
        try:
            class_index = int(raw_label)
            # Reject out-of-range values explicitly; a negative index would
            # otherwise silently mark a class counted from the end of the row.
            if not 0 <= class_index < num_classes:
                raise IndexError(class_index)
            y[index][class_index] = 1
        except (TypeError, ValueError, OverflowError, IndexError):
            # Missing row (None), NaN / non-numeric text, or out-of-range class.
            print()
            print(f"Invalid label for {pict}, label: {raw_label}.")
        index += 1
print("done.")

In [None]:
# Hold out the test set, standardize it with the mean and standard deviation
# of the remaining (train + validation) data, and pickle the test arrays
# together with those two statistics.

print("Splitting data...", end="")
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=test_prop, random_state=seed
)
# The full matrices are no longer needed once the split copies exist.
del X, y
gc.collect()
print("done.", end="\n\n")

print("Perform mean-subtraction and normalization of test set...", end="")
train_mean = train_X.mean()
train_sd = train_X.std()
test_X = (test_X - train_mean) / train_sd
print("done.", end="\n\n")

print("Pickling test set...", end="\n\n")
base_path = "../pickles/"

# Each object is dumped and released before the next one is handled.
pickle_wrapper.pickle_dump(test_X, f"{base_path}test_X.npy")
del test_X
gc.collect()

pickle_wrapper.pickle_dump(test_y, f"{base_path}test_y.npy")
del test_y
gc.collect()

# Suffix 0 marks the statistics computed over all non-test data.
pickle_wrapper.pickle_dump((train_mean, train_sd), f"{base_path}train_mean_sd_0.tuple")
del train_mean, train_sd
gc.collect()

print()
print("...done.")

In [None]:
# Obtain, perform mean-subtraction and normalization on,
# and pickle all train and validation sets.
#
# For every stratified shuffle split of the non-test data:
#   * the training part is standardized with its own mean / standard deviation,
#   * the validation part is standardized with those SAME training statistics
#     (it was previously pickled raw, contradicting the description above and
#     the treatment of the test set),
#   * the statistics and all four arrays are pickled under the split number.
# NOTE(review): if a downstream consumer already normalizes validation_X_* using
# train_mean_sd_*.tuple, remove that step there to avoid normalizing twice.

print("Pickling training and validation sets...",
      end="\n\n")

# Validation proportion relative to the non-test data; the adjustment itself
# is delegated to pickle_wrapper.find_adjusted_val_prop.
val_prop = pickle_wrapper.find_adjusted_val_prop(1 - train_prop - test_prop,
                                                 test_prop,
                                                 num_examples)
get_sss_indices = StratifiedShuffleSplit(n_splits=num_splits,
                                         test_size=val_prop,
                                         random_state=seed)

split_num = 1
for train_indices, validation_indices in get_sss_indices.split(train_X, train_y):
    print(f"Pickling split {split_num}.")

    # Fancy indexing returns a copy, so the in-place arithmetic below never
    # touches train_X and avoids allocating a second full-size temporary.
    curr_train_X = train_X[train_indices]
    curr_train_mean, curr_train_sd = curr_train_X.mean(), curr_train_X.std()
    curr_train_X -= curr_train_mean
    curr_train_X /= curr_train_sd
    pickle_wrapper.pickle_dump((curr_train_mean, curr_train_sd),
                               base_path + f"train_mean_sd_{split_num}.tuple")
    pickle_wrapper.pickle_dump(curr_train_X,
                               base_path + f"train_X_{split_num}.npy")
    del curr_train_X
    gc.collect()

    # Validation data must be scaled with the training statistics only.
    curr_validation_X = train_X[validation_indices]
    curr_validation_X -= curr_train_mean
    curr_validation_X /= curr_train_sd
    pickle_wrapper.pickle_dump(curr_validation_X,
                               base_path + f"validation_X_{split_num}.npy")
    del curr_validation_X, curr_train_mean, curr_train_sd
    gc.collect()

    curr_train_y = train_y[train_indices]
    pickle_wrapper.pickle_dump(curr_train_y,
                               base_path + f"train_y_{split_num}.npy")
    del curr_train_y
    gc.collect()

    curr_validation_y = train_y[validation_indices]
    pickle_wrapper.pickle_dump(curr_validation_y,
                               base_path + f"validation_y_{split_num}.npy")
    del curr_validation_y
    gc.collect()

    split_num += 1
    # Manual pause so the operator can free memory before the next split.
    input("Press enter when memory has been freed for new pickles. ")
    print()

del train_X, train_y
gc.collect()

print("...done.")