In [1]:
import os
import re
import gc
import random

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

import data_manager

# Work from the cropped-image directory: the bare os.listdir() calls and the
# relative "../labels/" and "../pickles/" paths used later are all resolved
# against this working directory.
# NOTE(review): the target is itself a relative path, so re-running this cell
# without restarting the kernel resolves it against the new working directory.
os.chdir("../data/cropped")

In [2]:
# Clean all unwanted, hidden system files in the working directory and in
# each of its entries. In OS X, one such file is ".DS_Store".
# Such files would be picked up by the os.listdir() calls used by the data
# processing below.

unwanted_files = [".DS_Store"]

# Top level of the working directory.
for file in unwanted_files:
    try:
        os.remove(file)
    except FileNotFoundError:
        # Only "the file is absent" is expected here. Permission problems and
        # other OS errors propagate instead of being reported as "No file".
        print(f"No {file} in ../data/cropped.")

# One level down: every entry of the working directory.
for file in unwanted_files:
    for directory in os.listdir():
        try:
            os.remove(os.path.join(directory, file))
            print(f"Removed {file} in {directory}")
        except (FileNotFoundError, NotADirectoryError):
            # NotADirectoryError covers an entry that is a plain file rather
            # than a directory; there is nothing to clean inside it.
            print(f"No {file} in ../data/{directory}.")

No .DS_Store in ../data/cropped.
No .DS_Store in ../data/cropped_2018-06-14.


In [3]:
# Verify that each file in every (sub)directory is an image file.
for directory in os.listdir():
    total_non_image_files = 0
    for file in os.listdir(directory):
        # Test the extension literally at the end of the name. The regex
        # ".jpg" treated "." as a wildcard and matched anywhere in the name,
        # so names such as "notes_jpg.txt" passed as images.
        if not file.endswith(".jpg"):
            total_non_image_files += 1
            print(f"Non-image file exists: {file}")
    print(f"Non-image files in ../data/{directory}: {total_non_image_files}.")
print()

# Find total number of examples, "num_examples".
# Every entry of every sub-directory counts as one example.
num_examples = 0
for directory in os.listdir():
    num_examples += len(os.listdir(directory))
print(f"There are {num_examples} examples.")
print()

# Verify that each example has a label.
# The label file for a directory is named after the directory with its first
# eight characters removed; an example's id is its file name without extension.
num_errors = 0
for directory in os.listdir():
    labels = pd.read_csv(f"../labels/{directory[8:]}.csv")
    # Build the id lookup once per directory instead of scanning the column
    # for every picture.
    labelled_ids = set(labels["date_id"])
    for pict in os.listdir(directory):
        if os.path.splitext(pict)[0] not in labelled_ids:
            # Count the miss: without this the summary below always said 0.
            num_errors += 1
            print(f"{pict} is without a label.")
print(f"There are {num_errors} unlabelled examples.")

Non-image files in ../data/cropped_2018-06-14: 0.

There are 1390 examples.

There are 0 unlabelled examples.


In [4]:
# Dataset and split configuration used by the cells below.
num_classes = 2  # The classes are "0" and "1".
image_width = image_height = 128  # Sizes of the two image axes of X.
num_channels = 3
# Shares of the data for the train, test and validation sets.
train_prop, test_prop, val_prop = 0.8, 0.1, 0.1
seed = 1  # Passed as random_state to the splitting helpers.
num_splits = 1

In [6]:
# Create example and label matrices.
# X holds one float64 image per example; y holds the matching one-hot label.
print("Filling in X and y...",
      end="")
y = np.zeros(shape=(num_examples,
                    num_classes))
# cv2.imread returns arrays shaped (height, width, channels), so the height
# axis comes before the width axis.
X = np.empty(shape=(num_examples,
                    image_height,
                    image_width,
                    num_channels))
index = 0
# Iterate in sorted order: os.listdir() returns entries in arbitrary order, and
# the seeded splits later on are only reproducible if the examples are always
# loaded in the same order.
for directory in sorted(os.listdir()):
    labels = pd.read_csv(f"../labels/{directory[8:]}.csv")
    for pict in sorted(os.listdir(directory)):
        image = cv2.imread(f"{directory}/{pict}")  # Note the image is BGR, not RGB.
        if image is None:
            # imread reports an unreadable file by returning None rather than
            # raising; fail here with the offending path.
            raise ValueError(f"Could not read image {directory}/{pict}.")
        X[index] = image.astype("float64")
        # Look up the label by the file name without its extension.
        label = labels.loc[labels["date_id"] == os.path.splitext(pict)[0], "label"]
        y[index][label.iloc[0]] = 1
        index += 1
# X was allocated with np.empty, so any row not written above holds garbage.
assert index == num_examples, (
    f"Loaded {index} examples but num_examples is {num_examples}."
)
print("done.")

Filling in X and y... done.


In [43]:
# Exploratory: a stratified shuffle splitter that produces 10 splits, each
# holding out a val_prop share of the examples.
get_sss_indices = StratifiedShuffleSplit(
    n_splits=10, test_size=val_prop, random_state=seed
)

In [44]:
# Exploratory: split() is lazy, so this only creates the generator of
# (train_index, test_index) pairs; no indices are computed yet.
a = get_sss_indices.split(X, y)

In [45]:
# Exploratory: exhaust the split generator. After the loop, m holds the train
# indices of the last split that was produced.
for train_index, test_index in a:
    m = train_index

In [None]:
# Split the data into train, validation, and test sets.
print("Splitting data...",
      end="")
# First hold out test_prop of all examples as the test set.
train_X, test_X, train_y, test_y = train_test_split(X,
                                                    y,
                                                    test_size=test_prop,
                                                    random_state=seed)
# The full arrays are no longer needed; free them before the second split.
del X, y
gc.collect()
# The validation set is drawn from what remains after the test split, so its
# proportion is adjusted first (presumably rescaled to the smaller pool --
# confirm against data_manager.find_adjusted_val_prop).
# The result gets its own name: overwriting val_prop would feed an already
# adjusted value back into the helper whenever this cell is run again.
adjusted_val_prop = data_manager.find_adjusted_val_prop(val_prop,
                                                        test_prop,
                                                        num_examples)
# NOTE(review): this splitter is created but never used; both splits in this
# cell are plain, non-stratified train_test_split calls. Either use it (or pass
# stratify=) or remove it.
get_sss_indices = StratifiedShuffleSplit(n_splits=num_splits,
                                         test_size=adjusted_val_prop,
                                         random_state=seed)
train_X, validation_X, train_y, validation_y = train_test_split(train_X,
                                                                train_y,
                                                                test_size=adjusted_val_prop,
                                                                random_state=seed)
print("done.")

In [None]:
# Standardize every image set with a single scalar mean and standard
# deviation, both computed over the train set only.
print("Normalizing features...", end="")

train_mean = train_X.mean()
train_std = train_X.std()

# The validation and test sets reuse the train statistics.
train_X = (train_X - train_mean) / train_std
validation_X = (validation_X - train_mean) / train_std
test_X = (test_X - train_mean) / train_std

print("done.")

In [None]:
# Pickle all sets.
print("Pickling sets...")
# Prefix for every output file written by the cells below. It is a relative
# path, so it is resolved against the current working directory.
base_path = "../pickles/"

In [None]:
# Save the (mean, std) pair that was computed from the train set.
data_manager.pickle_dump((train_mean, train_std),
                         base_path + "train_mean_sd.tuple")
# Free the statistics. The variable is train_std; the previous "train_sd" was
# never defined and raised NameError before anything was deleted.
del train_mean, train_std
gc.collect()

In [None]:
# Write train_X out under base_path, then drop it from the kernel to free
# memory.
data_manager.pickle_dump(train_X, base_path + "train_X.npy")
del train_X
gc.collect()

In [None]:
# Write train_y out under base_path, then drop it from the kernel to free
# memory.
data_manager.pickle_dump(train_y, base_path + "train_y.npy")
del train_y
gc.collect()

In [None]:
# Write validation_X out under base_path, then drop it from the kernel to free
# memory.
data_manager.pickle_dump(validation_X, base_path + "validation_X.npy")
del validation_X
gc.collect()

In [None]:
# Write validation_y out under base_path, then drop it from the kernel to free
# memory.
data_manager.pickle_dump(validation_y, base_path + "validation_y.npy")
del validation_y
gc.collect()

In [None]:
# Write test_X out under base_path, then drop it from the kernel to free
# memory.
data_manager.pickle_dump(test_X, base_path + "test_X.npy")
del test_X
gc.collect()

In [None]:
# Write test_y out under base_path, drop it from the kernel, and report that
# the last set has been written.
data_manager.pickle_dump(test_y, base_path + "test_y.npy")
del test_y
gc.collect()
print("Done.")