In [4]:
import os
import re

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import data_manager

# Every relative path below (and in the following cell) is resolved against
# the cropped-image directory.
# NOTE: this makes the cell non-idempotent -- re-running it without restarting
# the kernel resolves "../data/cropped" from the already-changed cwd.
os.chdir("../data/cropped")

# Sub-directories are named "cropped_<date>" while the label files are named
# "<date>.csv", so this prefix is stripped to locate the matching CSV.
directory_prefix = "cropped_"

# Clean all ".DS_Store".
# Only a missing file is an expected outcome; any other OSError (permissions,
# I/O failure) propagates instead of being reported as "no .DS_Store".
try:
    os.remove(".DS_Store")
except FileNotFoundError:
    print("No .DS_Store here.")

# Only real sub-directories hold examples; a stray file at this level would
# otherwise make os.listdir(directory) fail. Sorted for a stable log order.
directories = sorted(d for d in os.listdir() if os.path.isdir(d))

for directory in directories:
    try:
        os.remove(os.path.join(directory, ".DS_Store"))
        print("Removed .DS_Store at " + directory)
    except FileNotFoundError:
        print("No .DS_Store at " + directory)

# List every sub-directory once (after the cleanup) and reuse the listing for
# all of the checks below.
files_by_directory = {d: os.listdir(d) for d in directories}

# Verify that each file in every (sub)directory is an image file.
# A suffix test is used because the pattern ".jpg" treats "." as a wildcard
# and matches anywhere in the name (e.g. "xjpg.txt").
for directory in directories:
    for file in files_by_directory[directory]:
        if not file.endswith(".jpg"):
            print("Non-image file exists: " + file)

# Find total number of examples, "num_examples".
num_examples = sum(len(files) for files in files_by_directory.values())
print("There are " + str(num_examples) + " examples.")

# Verify that each example has a label.
num_errors = 0
for directory in directories:
    labels = pd.read_csv("../labels/" + directory[len(directory_prefix):] + ".csv")
    # A set gives constant-time membership tests instead of scanning the
    # whole column for every picture.
    labelled_ids = set(labels["date_id"])
    for pict in files_by_directory[directory]:
        # The example id is the file name without its extension.
        if os.path.splitext(pict)[0] not in labelled_ids:
            print(pict + " is without a label.")
            num_errors += 1  # previously never counted, so the total was always 0
print("There are " + str(num_errors) + " unlabelled examples.")

No .DS_Store here.
No .DS_Store at cropped_2018-06-29
No .DS_Store at cropped_2018-06-20
No .DS_Store at cropped_2018-06-21
No .DS_Store at cropped_2018-06-19
No .DS_Store at cropped_2018-07-02
No .DS_Store at cropped_2018-07-04
No .DS_Store at cropped_2018-07-03
No .DS_Store at cropped_2018-06-15
No .DS_Store at cropped_2018-06-23
No .DS_Store at cropped_2018-06-22
No .DS_Store at cropped_2018-06-14
No .DS_Store at cropped_2018-07-06
There are 32637 examples.
There are 0 unlabelled examples.


In [None]:
# Create example and label matrices.
# Relies on `num_examples` and on the working directory set by the previous
# cell. `y` is one-hot: column 0 is set when the label equals 1, column 1
# otherwise.
y = np.empty(shape=(num_examples, 2), dtype="int32")
X = np.empty(shape=(num_examples, 128, 128, 3), dtype="float64")
print("Filling in X and y...")
index = 0
# os.listdir() order is arbitrary; sorting keeps the row order -- and therefore
# the seeded split below -- reproducible across runs and machines.
for directory in sorted(os.listdir()):
    # Directory names carry an 8-character "cropped_" prefix that the label
    # file names do not have.
    labels = pd.read_csv("../labels/" + directory[8:] + ".csv")
    for pict in sorted(os.listdir(directory)):
        image = cv2.imread(os.path.join(directory, pict))
        if image is None:
            # cv2.imread signals an unreadable file by returning None rather
            # than raising, so fail here with a message that names the file.
            raise ValueError("Could not read image: " + os.path.join(directory, pict))
        X[index] = image.astype("float64")  # Note X[index] is BGR, not RGB.

        # .loc returns a Series; extract the scalar before comparing, because
        # the truth value of a Series is ambiguous and raises ValueError.
        matches = labels.loc[labels["date_id"] == os.path.splitext(pict)[0], "label"]
        if matches.empty:
            raise KeyError(pict + " is without a label.")
        y[index] = (1, 0) if matches.iloc[0] == 1 else (0, 1)
        index += 1

# np.empty leaves unfilled rows as garbage, so make sure every row was written.
assert index == num_examples, "filled %d rows, expected %d" % (index, num_examples)

# Split the data into train, validation, and test sets.
# 20% is held out for test, then 20% of the remainder for validation.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1)
train_X, validation_X, train_y, validation_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=1)

# Perform mean-subtraction and normalization of all images, calculating
# mean and standard deviation from the train set.
train_mean, train_std = train_X.mean(), train_X.std()
train_X = (train_X - train_mean) / train_std
validation_X = (validation_X - train_mean) / train_std
test_X = (test_X - train_mean) / train_std

# Pickle all sets.
# Each file receives its own split; previously the train arrays were written
# under the validation and test file names as well.
print("Pickling sets...")
base_path = "../pickles/"
data_manager.pickle_dump(train_X, base_path + "train_X.npy")
data_manager.pickle_dump(train_y, base_path + "train_y.npy")
data_manager.pickle_dump(validation_X, base_path + "validation_X.npy")
data_manager.pickle_dump(validation_y, base_path + "validation_y.npy")
data_manager.pickle_dump(test_X, base_path + "test_X.npy")
data_manager.pickle_dump(test_y, base_path + "test_y.npy")
print("Done.")