# Imports

In [38]:
import tensorflow
from numpy.random import seed

# Seed NumPy's and TensorFlow's global random number generators before any
# other work, so random operations start from fixed seeds on every run.
seed(888)
tensorflow.random.set_seed(404)

In [39]:
import os
import numpy as np
import tensorflow as tf

# Constants

In [53]:
# Folder (relative to the working directory) that holds the four CSV files.
DATA_DIR = '11_MNIST'

# Feature (x) and label (y) files for the training and test splits.
X_TRAIN_PATH = f'{DATA_DIR}/digit_xtrain.csv'
X_TEST_PATH = f'{DATA_DIR}/digit_xtest.csv'
Y_TRAIN_PATH = f'{DATA_DIR}/digit_ytrain.csv'
Y_TEST_PATH = f'{DATA_DIR}/digit_ytest.csv'

NR_CLASSES = 10           # number of label classes; width of the one-hot encoding
VALIDATION_SIZE = 10_000  # leading training rows held out as the validation set

# Get the Data

In [41]:
%%time

y_train_all = np.loadtxt(Y_TRAIN_PATH, delimiter=',', dtype=int)
y_test = np.loadtxt(Y_TEST_PATH, delimiter=',', dtype=int)

CPU times: user 4.88 ms, sys: 175 µs, total: 5.05 ms
Wall time: 4.32 ms


In [42]:
%%time

x_train_all = np.loadtxt(X_TRAIN_PATH, delimiter=',', dtype=int)
x_test = np.loadtxt(X_TEST_PATH, delimiter=',', dtype=int)

CPU times: user 1.67 s, sys: 209 ms, total: 1.88 s
Wall time: 1.84 s


# Explore

The images are 28x28 pixels in grayscale, which means each pixel has a single value (an int between 0 and 255) that indicates how light or dark it is: 0 means totally white, 255 means totally black.

In [43]:
# Compare the label array shapes of the two splits. The test labels were loaded
# into `y_test`; no `y_test_all` is ever defined.
print(f'labels for training have the shape {y_train_all.shape}, for test they are {y_test.shape}')

labels for training have the shape (60000,), for test they are (10000,)


In [44]:
# Compare the feature array shapes of the two splits. The test features were
# loaded into `x_test`; no `x_test_all` is ever defined.
print(f'features for training have the shape {x_train_all.shape}, for test they are {x_test.shape}')

features for training have the shape (60000, 784), for test they are (10000, 784)


**Note** for future projects and tutorials (we might not take it into consideration in this one): Flattening the features into a flat list of 784 pixels (28x28) is not ideal when working with images. Why? Because we lose the position of each pixel: we no longer know which pixels surround which.

Let's take a look at one sample image:

In [45]:
# First training row: a single image as a flat vector of 784 raw pixel values.
x_train_all[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  18,  18,  18,
       126, 136, 175,  26, 166, 255, 247, 127,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,  30,  36,  94, 154, 17

And let's now look at the first 5 sparse (integer-encoded) labels, which correspond to the categories or classes of the first 5 images:

In [46]:
# Labels of the first five training images.
y_train_all[:5]

array([5, 0, 4, 1, 9])

# Data Preprocessing

## Re-scale our features
Why? Gradient-based training tends to behave better when the inputs are small and on a consistent scale. Our targets are small numbers as well (classes 0 to 9), so it helps if the input data is also small, e.g. numbers between 0 and 1 instead of 0 to 255.

In [47]:
# Dividing by 255.0 rescales the pixel values and converts them to floats.
# NOTE: this cell is not idempotent - running it again divides by 255 a second
# time, so reload the data before re-running it.
x_train_all = x_train_all / 255.0
x_test = x_test / 255.0

## Convert target values into one-hot encoding

We'll now convert `y_train_all` from sparse integer labels (one class index per image) into a full one-hot matrix (one row per image, one column per class):

In [48]:
values = y_train_all[:5]
# Row i of the identity matrix is the one-hot vector for class i, so indexing the
# identity matrix with the labels returns one one-hot row per label (see the
# np.eye() documentation). NR_CLASSES replaces the literal 10 so this demo stays
# in step with the real encoding cells.
np.eye(NR_CLASSES)[values]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [49]:
# One-hot encode the integer training labels: row i of the identity matrix is
# the one-hot vector for class i. The ndim guard makes the cell safe to re-run;
# without it, a second run would index the identity matrix with the
# already-encoded float matrix and raise an IndexError.
if y_train_all.ndim == 1:
    y_train_all = np.eye(NR_CLASSES)[y_train_all]

In [51]:
# After one-hot encoding: one row per training label, NR_CLASSES columns.
y_train_all.shape

(60000, 10)

In [52]:
# One-hot encode the integer test labels the same way as the training labels.
# The ndim guard keeps the cell re-runnable: once encoded, y_test is a 2-D float
# matrix and can no longer be used as an index into the identity matrix.
if y_test.ndim == 1:
    y_test = np.eye(NR_CLASSES)[y_test]
y_test.shape

(10000, 10)

## Create validation dataset from our training data

**Challenge:** Split the training dataset into a smaller training dataset and a validation dataset for the features and the labels. Create four arrays: `x_val`, `y_val`, `x_train`, and `y_train` from `x_train_all` and `y_train_all`. Use the validation size of 10,000 (defined in the constants).

In [55]:
# The first VALIDATION_SIZE rows of the features and labels form the validation set.
x_val, y_val = x_train_all[:VALIDATION_SIZE], y_train_all[:VALIDATION_SIZE]

In [56]:
# Every row after the first VALIDATION_SIZE stays in the (smaller) training set.
x_train, y_train = x_train_all[VALIDATION_SIZE:], y_train_all[VALIDATION_SIZE:]

In [57]:
# Rows left for training once the validation rows have been split off.
x_train.shape

(50000, 784)

In [58]:
# The validation set should contain VALIDATION_SIZE rows.
x_val.shape

(10000, 784)