In [1]:
import os
import re
import gc
import random

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import data_manager

# Seed both the stdlib and NumPy generators so later random draws are repeatable.
random.seed(1)
np.random.seed(1)

# Every relative path used later ("../labels/", "../pickles/") is resolved
# against this directory. NOTE: the path is relative, so re-running this cell
# without restarting the kernel resolves it from the already-changed working
# directory rather than from the notebook's folder.
os.chdir("../data/cropped")

# Clean all ".DS_Store".
# Only a missing file counts as "nothing to clean". Any other OS-level failure
# (permissions, a stray regular file where a directory was expected, ...) is
# reported as such instead of being mislabelled, and non-OS exceptions such as
# KeyboardInterrupt are no longer swallowed.
try:
    os.remove(".DS_Store")
except FileNotFoundError:
    print("No .DS_Store at ../data/cropped.")
except OSError as err:
    print("Could not remove .DS_Store at ../data/cropped: " + str(err))

for directory in os.listdir():
    try:
        os.remove(os.path.join(directory, ".DS_Store"))
        print("Removed .DS_Store at " + directory)
    except FileNotFoundError:
        print("No .DS_Store at " + directory)
    except OSError as err:
        print("Could not remove .DS_Store at " + directory + ": " + str(err))
        
# Verify that each file in every (sub)directory is an image file.
# A literal suffix test is used rather than re.search(".jpg", ...): in a regex
# the unescaped "." matches any character and the pattern is unanchored, so
# names such as "a.jpg.bak" or "xjpg.txt" would wrongly pass as images.
# The check is deliberately case-sensitive and limited to the 4-character
# ".jpg" suffix, because later cells strip exactly that suffix to look up labels.
for directory in os.listdir():
    for file in os.listdir(directory):
        if not file.endswith(".jpg"):
            print("Non-image file exists: " + file)
    
# Find total number of examples, "num_examples".
# Each entry inside each subdirectory of the working directory counts as one example.
num_examples = sum(len(os.listdir(subdir)) for subdir in os.listdir())
print("There are", num_examples, "examples.")

# Verify that each example has a label.
# For every image directory, load the matching label CSV and check that the
# file name (minus its extension) appears in the CSV's "date_id" column.
num_errors = 0
for directory in os.listdir():
    # directory[8:] drops the 8-character "cropped_" prefix seen in the
    # directory names, leaving the stem of the label file.
    labels = pd.read_csv("../labels/" + directory[8:] + ".csv")
    # Build the id set once per directory so each lookup is O(1).
    labelled_ids = set(labels["date_id"])
    for pict in os.listdir(directory):
        if os.path.splitext(pict)[0] not in labelled_ids:
            # Count the miss so the summary below reflects reality; previously
            # the counter was never incremented and always reported 0.
            num_errors += 1
            print(pict + " is without a label.")
print("There are " + str(num_errors) + " unlabelled examples.")

No .DS_Store here.
No .DS_Store at cropped_2018-06-29
No .DS_Store at cropped_2018-06-20
No .DS_Store at cropped_2018-06-21
No .DS_Store at cropped_2018-06-19
No .DS_Store at cropped_2018-07-02
No .DS_Store at cropped_2018-07-04
No .DS_Store at cropped_2018-07-03
No .DS_Store at cropped_2018-06-15
No .DS_Store at cropped_2018-06-23
No .DS_Store at cropped_2018-06-22
No .DS_Store at cropped_2018-06-14
No .DS_Store at cropped_2018-07-06
There are 32637 examples.
There are 0 unlabelled examples.


In [2]:
# Create example and label matrices.
# X holds one 128x128x3 image per row (float64, the np.empty default);
# y holds a two-column one-hot label: [1, 0] when the CSV label is 1, else [0, 1].
y = np.empty(shape=(num_examples, 2))
X = np.empty(shape=(num_examples, 128, 128, 3))
print("Filling in X and y...")
index = 0
# os.listdir returns entries in arbitrary order; sorting fixes the row order so
# that the seeded train/validation/test splits are reproducible across runs and
# machines.
for directory in sorted(os.listdir()):
    # directory[8:] drops the 8-character "cropped_" prefix to get the label file stem.
    labels = pd.read_csv("../labels/" + directory[8:] + ".csv")
    for pict in sorted(os.listdir(directory)):
        image_path = directory + "/" + pict
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable/missing file by returning None
            # rather than raising, so fail loudly with the offending path.
            raise ValueError("Could not read image: " + image_path)
        # Note X[index] is BGR, not RGB. Assignment into the float64 array
        # performs the cast, so no intermediate astype copy is needed.
        X[index] = image
        label = labels.loc[labels["date_id"] == os.path.splitext(pict)[0], "label"]
        if label.empty:
            raise ValueError(pict + " is without a label.")
        y[index] = (1, 0) if label.iloc[0] == 1 else (0, 1)
        index += 1

# np.empty leaves unfilled rows as garbage, so make sure every row was written.
if index != num_examples:
    raise ValueError(
        "Expected " + str(num_examples) + " examples but loaded " + str(index) + "."
    )
        
# Split the data into train, validation, and test sets.
# Two chained splits with identical settings: the first holds out 20% of all
# examples as the test set, the second holds out 20% of what remains as the
# validation set.
print("Splitting data...")
split_kwargs = {"test_size": 0.2, "random_state": 1}
train_X, test_X, train_y, test_y = train_test_split(X, y, **split_kwargs)

# Drop the full arrays before the second split.
del X, y
gc.collect()

train_X, validation_X, train_y, validation_y = train_test_split(
    train_X, train_y, **split_kwargs
)

# Perform mean-subtraction and normalization of all images, calculating
# mean and standard deviation from the train set.
# A single scalar mean and a single scalar std (over all pixels and channels of
# the training set) are applied to every split.
print("Normalizing features...")
train_mean, train_std = train_X.mean(), train_X.std()

# Normalize the training set in place: it is by far the largest array, and the
# out-of-place form allocates two more temporaries of the same size. The
# element-wise operations are the same, so the values are unchanged.
train_X -= train_mean
train_X /= train_std

# Validation and test are normalized out of place on purpose: the unnorm_*
# names must keep pointing at the original, un-normalized pixel data.
unnorm_validation_X = validation_X
validation_X = (validation_X - train_mean) / train_std
unnorm_test_X = test_X
test_X = (test_X - train_mean) / train_std

# Pickle all sets.
print("Pickling sets...")
# Relative to the working directory set by os.chdir in the first cell.
base_path = "../pickles/"

Filling in X and y...
Splitting data...
Normalizing features...
Pickling sets...


In [3]:
# Write the normalized training images under base_path, then drop the name and
# run the garbage collector before the next dump.
# NOTE(review): the file is named ".npy" but is written by
# data_manager.pickle_dump — presumably a pickle rather than np.save output;
# confirm before loading it with np.load.
data_manager.pickle_dump(train_X, base_path + "train_X.npy")
del train_X
gc.collect()

Writing total_bytes=8213102756...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2147483648)... Done.
Writing bytes [2147483648, 3221225472)... Done.
Writing bytes [3221225472, 4294967296)... Done.
Writing bytes [4294967296, 5368709120)... Done.
Writing bytes [5368709120, 6442450944)... Done.
Writing bytes [6442450944, 7516192768)... Done.
Writing bytes [7516192768, 8213102756)... Done.


0

In [4]:
# Write the training labels (two-column one-hot rows) under base_path, then
# drop the name and run the garbage collector.
data_manager.pickle_dump(train_y, base_path + "train_y.npy")
del train_y
gc.collect()

Writing total_bytes=334347...
Writing bytes [0, 334347)... Done.


0

In [5]:
# Write the normalized validation images under base_path, then drop the name
# and run the garbage collector. The un-normalized copy stays alive as
# unnorm_validation_X and is dumped in a later cell.
data_manager.pickle_dump(validation_X, base_path + "validation_X.npy")
del validation_X
gc.collect()

Writing total_bytes=2053374112...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2053374112)... Done.


0

In [6]:
# Write the validation labels under base_path, then drop the name and run the
# garbage collector.
data_manager.pickle_dump(validation_y, base_path + "validation_y.npy")
del validation_y
gc.collect()

Writing total_bytes=83707...
Writing bytes [0, 83707)... Done.


0

In [7]:
# Write the normalized test images under base_path, then drop the name and run
# the garbage collector. The un-normalized copy stays alive as unnorm_test_X
# and is dumped in a later cell.
data_manager.pickle_dump(test_X, base_path + "test_X.npy")
del test_X
gc.collect()

Writing total_bytes=2566914208...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2147483648)... Done.
Writing bytes [2147483648, 2566914208)... Done.


0

In [8]:
# Write the test labels under base_path, then drop the name and run the
# garbage collector.
data_manager.pickle_dump(test_y, base_path + "test_y.npy")
del test_y
gc.collect()

Writing total_bytes=104603...
Writing bytes [0, 104603)... Done.
Done.


In [None]:
# Write the un-normalized test images (the array test_X referred to before
# mean/std normalization) under base_path, then drop the name and run the
# garbage collector.
data_manager.pickle_dump(unnorm_test_X, base_path + "unnorm_test_X.npy")
del unnorm_test_X
gc.collect()

In [None]:
# Write the un-normalized validation images (the array validation_X referred to
# before mean/std normalization) under base_path, then drop the name and run
# the garbage collector. This is the last dump, hence the final "Done." message.
data_manager.pickle_dump(unnorm_validation_X, base_path + "unnorm_validation_X.npy")
del unnorm_validation_X
gc.collect()
print("Done.")