In [1]:
import os
import re
import gc
import random

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

import data_manager

# Seed both the standard-library and NumPy global random generators.
random.seed(1)
np.random.seed(1)

# Every relative path used below ("../labels/", "../pickles/", the
# sub-directories returned by os.listdir()) is resolved from this directory.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel resolves it against the *new* working directory.
os.chdir("../data/cropped")

In [11]:
# Clean all unwanted, hidden system files in the working directory.
# In OS X, one such file is ".DS_Store".
# Such files can interfere with the data processing below.
#
# Removal is best-effort. A missing file is the normal case and is only
# reported. Any other OS error (e.g. permissions) is reported together with
# its cause instead of being mislabelled as "No <file> in ...", and non-OS
# exceptions such as KeyboardInterrupt are no longer swallowed.

unwanted_files = [".DS_Store"]

# Pass 1: the working directory itself.
for file in unwanted_files:
    try:
        os.remove(file)
        print("Removed {0} in ../data/cropped.".format(file))
    except FileNotFoundError:
        print("No {0} in ../data/cropped.".format(file))
    except OSError as error:
        print("Could not remove {0} in ../data/cropped: {1}".format(file, error))

# Pass 2: every entry directly inside the working directory.
for file in unwanted_files:
    for directory in os.listdir():
        try:
            os.remove(os.path.join(directory, file))
            print("Removed {0} in {1}".format(file, directory))
        except (FileNotFoundError, NotADirectoryError):
            # NotADirectoryError: the entry is a plain file, not a folder,
            # so there is nothing inside it to clean.
            print("No {0} in ../data/{1}.".format(file, directory))
        except OSError as error:
            print("Could not remove {0} in ../data/{1}: {2}".format(file, directory, error))

No .DS_Store in ../data/cropped.
No .DS_Store in ../data/cropped_2018-07-19.
No .DS_Store in ../data/cropped_2018-07-17.
No .DS_Store in ../data/cropped_2018-07-16.
No .DS_Store in ../data/cropped_2018-07-11.
No .DS_Store in ../data/cropped_2018-07-18.
No .DS_Store in ../data/cropped_2018-06-29.
No .DS_Store in ../data/cropped_2018-06-20.
No .DS_Store in ../data/cropped_2018-06-21.
No .DS_Store in ../data/cropped_2018-06-19.
No .DS_Store in ../data/cropped_2018-07-02.
No .DS_Store in ../data/cropped_2018-07-04.
No .DS_Store in ../data/cropped_2018-07-03.
No .DS_Store in ../data/cropped_2018-07-14.
No .DS_Store in ../data/cropped_2018-07-23.
No .DS_Store in ../data/cropped_2018-06-15.
No .DS_Store in ../data/cropped_2018-06-23.
No .DS_Store in ../data/cropped_2018-06-22.
No .DS_Store in ../data/cropped_2018-06-14.
No .DS_Store in ../data/cropped_2018-07-06.
No .DS_Store in ../data/cropped_2018-07-09.

Total non-image files in ../data/cropped_2018-07-19: 0
Total non-image files in ../dat

In [14]:
# Verify that each file in every (sub)directory is an image file.
for directory in os.listdir():
    total_non_image_files = 0
    for file in os.listdir(directory):
        # Require a literal ".jpg" suffix. An unescaped, unanchored ".jpg"
        # regex would also accept names such as "xjpg_notes.txt".
        if not file.endswith(".jpg"):
            total_non_image_files += 1
            # str.format indices are zero-based; "{1}" with a single argument
            # raises IndexError as soon as a non-image file is found.
            print("Non-image file exists: {0}".format(file))
    print("Total non-image files in ../data/{0}: {1}.".format(directory, total_non_image_files))
print()

# Find total number of examples, "num_examples".
num_examples = sum(len(os.listdir(directory)) for directory in os.listdir())
print("There are {0} examples.".format(num_examples))
print()

# Verify that each example has a label.
num_errors = 0
for directory in os.listdir():
    # directory[8:] drops the first 8 characters of the folder name; the label
    # file for that folder is expected at ../labels/<remainder>.csv.
    labels = pd.read_csv("../labels/{0}.csv".format(directory[8:]))
    # Build the lookup once per directory instead of scanning the whole
    # column for every picture.
    labelled_ids = set(labels["date_id"])
    for pict in os.listdir(directory):
        # pict[:-4] strips the 4-character ".jpg" extension.
        if pict[:-4] not in labelled_ids:
            # Count the miss so the summary below reports the real total.
            num_errors += 1
            print("{0} is without a label.".format(pict))
print("There are {0} unlabelled examples.".format(num_errors))

Total non-image files in ../data/cropped_2018-07-19: 0.
Total non-image files in ../data/cropped_2018-07-17: 0.
Total non-image files in ../data/cropped_2018-07-16: 0.
Total non-image files in ../data/cropped_2018-07-11: 0.
Total non-image files in ../data/cropped_2018-07-18: 0.
Total non-image files in ../data/cropped_2018-06-29: 0.
Total non-image files in ../data/cropped_2018-06-20: 0.
Total non-image files in ../data/cropped_2018-06-21: 0.
Total non-image files in ../data/cropped_2018-06-19: 0.
Total non-image files in ../data/cropped_2018-07-02: 0.
Total non-image files in ../data/cropped_2018-07-04: 0.
Total non-image files in ../data/cropped_2018-07-03: 0.
Total non-image files in ../data/cropped_2018-07-14: 0.
Total non-image files in ../data/cropped_2018-07-23: 0.
Total non-image files in ../data/cropped_2018-06-15: 0.
Total non-image files in ../data/cropped_2018-06-23: 0.
Total non-image files in ../data/cropped_2018-06-22: 0.
Total non-image files in ../data/cropped_2018-06

In [15]:
# Create example and label matrices.
num_classes = 2 # The classes are "0" and "1".
image_width = 128
image_height = 128
num_channels = 3

# y is one-hot: row i has a 1 in the column given by example i's label.
y = np.zeros(shape=(num_examples, num_classes))
# cv2.imread returns arrays shaped (rows, cols, channels), i.e. height first.
X = np.empty(shape=(num_examples, image_height, image_width, num_channels))
print("Filling in X and y...")
index = 0
# os.listdir() returns entries in arbitrary order. Sorting makes the row order,
# and therefore the seeded splits below, the same on every run and machine.
for directory in sorted(os.listdir()):
    labels = pd.read_csv("../labels/" + directory[8:] + ".csv")
    for pict in sorted(os.listdir(directory)):
        image = cv2.imread(os.path.join(directory, pict))
        if image is None:
            # imread signals an unreadable file by returning None, not by raising.
            raise ValueError("Could not read image {0}/{1}.".format(directory, pict))
        X[index] = image.astype("float64") # Note X[index] is BGR, not RGB.
        # pict[:-4] strips the 4-character ".jpg" extension to get the date_id.
        label = labels.loc[labels["date_id"] == pict[:-4], "label"]
        if label.empty:
            raise KeyError("{0}/{1} is without a label.".format(directory, pict))
        y[index][label.iloc[0]] = 1
        index += 1
# X comes from np.empty, so any row that was not filled holds garbage.
assert index == num_examples, "Filled {0} rows but expected {1}.".format(index, num_examples)
print("done.", flush=True)

# Split the data into train, validation, and test sets.
# Two successive 80/20 splits: 64% train, 16% validation, 20% test.
print("Splitting data...")
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)
del X
del y
gc.collect()
train_X, validation_X, train_y, validation_y = train_test_split(train_X, train_y, test_size=0.2, random_state=1)
print("done.", flush=True)

# Keep references to the un-normalized validation and test images. Later cells
# pickle them as "unnorm_validation_X" / "unnorm_test_X", and without these
# assignments those cells fail with NameError on a fresh kernel. The
# normalization below builds new arrays, so these names keep the raw pixels.
unnorm_validation_X = validation_X
unnorm_test_X = test_X

# Perform mean-subtraction and normalization of all images,
# calculating mean and standard deviation from the train set.
# The mean and standard deviation are single scalars taken over every pixel
# and channel of the train set.
print("Normalizing features...")
train_mean, train_std = train_X.mean(), train_X.std()
train_X = (train_X - train_mean) / train_std
validation_X = (validation_X - train_mean) / train_std
test_X = (test_X - train_mean) / train_std
print("done.", flush=True)

# Pickle all sets.
# The cells below write one array each under this prefix.
print("Pickling sets...")
base_path = "../pickles/"

Filling in X and y...


KeyboardInterrupt: 

In [3]:
# Write the normalized training images to <base_path>train_X.npy using the
# project's data_manager.pickle_dump helper, then unbind the name so the
# array can be reclaimed.
data_manager.pickle_dump(train_X, base_path + "train_X.npy")
del train_X
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Writing total_bytes=12650938532...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2147483648)... Done.
Writing bytes [2147483648, 3221225472)... Done.
Writing bytes [3221225472, 4294967296)... Done.
Writing bytes [4294967296, 5368709120)... Done.
Writing bytes [5368709120, 6442450944)... Done.
Writing bytes [6442450944, 7516192768)... Done.
Writing bytes [7516192768, 8589934592)... Done.
Writing bytes [8589934592, 9663676416)... Done.
Writing bytes [9663676416, 10737418240)... Done.
Writing bytes [10737418240, 11811160064)... Done.
Writing bytes [11811160064, 12650938532)... Done.


0

In [4]:
# Write the one-hot training labels to <base_path>train_y.npy, then unbind
# the name so the array can be reclaimed.
data_manager.pickle_dump(train_y, base_path + "train_y.npy")
del train_y
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Writing total_bytes=514923...
Writing bytes [0, 514923)... Done.


0

In [5]:
# Write the normalized validation images to <base_path>validation_X.npy, then
# unbind the name so the array can be reclaimed.
data_manager.pickle_dump(validation_X, base_path + "validation_X.npy")
del validation_X
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Writing total_bytes=3163029664...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2147483648)... Done.
Writing bytes [2147483648, 3163029664)... Done.


0

In [6]:
# Write the one-hot validation labels to <base_path>validation_y.npy, then
# unbind the name so the array can be reclaimed.
data_manager.pickle_dump(validation_y, base_path + "validation_y.npy")
del validation_y
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Writing total_bytes=128859...
Writing bytes [0, 128859)... Done.


0

In [7]:
# Write the normalized test images to <base_path>test_X.npy, then unbind the
# name so the array can be reclaimed.
data_manager.pickle_dump(test_X, base_path + "test_X.npy")
del test_X
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Writing total_bytes=3953787040...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2147483648)... Done.
Writing bytes [2147483648, 3221225472)... Done.
Writing bytes [3221225472, 3953787040)... Done.


0

In [8]:
# Write the one-hot test labels to <base_path>test_y.npy, then unbind the
# name so the array can be reclaimed.
data_manager.pickle_dump(test_y, base_path + "test_y.npy")
del test_y
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Writing total_bytes=161035...
Writing bytes [0, 161035)... Done.


0

In [9]:
# Write `unnorm_test_X` to <base_path>unnorm_test_X.npy, then unbind the name.
# Judging by the name, this is presumably the test images before normalization.
# This cell does not create `unnorm_test_X`; it must already exist in the
# kernel when the cell runs.
data_manager.pickle_dump(unnorm_test_X, base_path + "unnorm_test_X.npy")
del unnorm_test_X
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Writing total_bytes=3953787040...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2147483648)... Done.
Writing bytes [2147483648, 3221225472)... Done.
Writing bytes [3221225472, 3953787040)... Done.


0

In [10]:
# Write `unnorm_validation_X` to <base_path>unnorm_validation_X.npy, then
# unbind the name. Judging by the name, this is presumably the validation
# images before normalization. This cell does not create
# `unnorm_validation_X`; it must already exist in the kernel when the cell runs.
data_manager.pickle_dump(unnorm_validation_X, base_path + "unnorm_validation_X.npy")
del unnorm_validation_X
gc.collect()
# The cell ends with a print, so the gc.collect() return value is not displayed.
print("Done.")

Writing total_bytes=3163029664...
Writing bytes [0, 1073741824)... Done.
Writing bytes [1073741824, 2147483648)... Done.
Writing bytes [2147483648, 3163029664)... Done.
Done.


In [11]:
# Save the scalar mean and standard deviation computed from the train set
# (the values used to normalize all three splits) as a (mean, std) tuple.
data_manager.pickle_dump((train_mean, train_std), base_path + "train_mean_sd.tuple")



Writing total_bytes=139...
Writing bytes [0, 139)... Done.


In [None]:
# Marker cell: prints a completion message once every cell above has run.
print("Done.")