In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
import time

In [None]:
# Fetch the MNIST image dataset and take a first look at what we got.

# Keras delivers this dataset already split into a training part and a testing part,
# so the train_test_split helper from scikit-learn (used in lesson-01) is not needed.
(train_images_, train_labels), (test_images_, test_labels) = datasets.mnist.load_data()

# 'Normalize' the inputs by rescaling the pixel values to lie between 0 and 1.
# This is not always required, but it is good practice and usually makes it
# easier to find a good model.
train_images_ = train_images_ / 255.0
test_images_ = test_images_ / 255.0

# Report the array shapes and a few labels so we know what we are working with.
print(
    f"train_images_.shape = {train_images_.shape}",
    f"train_labels.shape = {train_labels.shape}",
    f"train labels, first 20: {train_labels[0:20]}",
    "",
    f"test labels size: {len(test_labels)}",
    sep="\n",
)

In [None]:
# Each image is a 28 x 28 greyscale capture of a handwritten digit (0 through 9).

# 60,000 of the images are set aside for training and 10,000 for testing.
# All of it is what we call Labeled Data: data for which we know the right answer.

# Take a look at the very first training image. According to the train labels
# printed above, it should be a '5'.
img = train_images_[0]

# The image could also be displayed with PIL (aka pillow), e.g.:
#   pil_image = Image.fromarray(img)
#   pil_image.show()

# Here we use matplotlib instead. With the 'Greys' colour map the largest pixel
# value is drawn as the darkest grey.
implot = plt.imshow(img, cmap='Greys')
plt.colorbar()

In [None]:
# The same image once more, this time rendered with a more colourful colour map.
implot = plt.imshow(img, cmap='YlGnBu')
plt.colorbar()

In [None]:
from tensorflow.keras import initializers

# What model we want depends on many things, with the most important one being:
# (1) how accurate are the predictions.
#
# Other important metrics to consider are:
# (2) how big is the model,
# (3) how long does it take to predict new data, and
# (4) how long does it take to train.

# Picking up where we left off, let's first try using a deep neural network.

# Seed TensorFlow's random generator so the initial weights are reproducible and
# this model starts from the same state as the dropout variant built further
# down (which uses the same seed), making the two runs comparable.
tf.random.set_seed(0)

# In lesson-01 we used only Linear or no activation.
# For this model, we add relu to add some non-linear type behavior.
model = models.Sequential()

# Flatten each 28 x 28 image into a vector of 784 values for the dense layers.
model.add(layers.Reshape((784,), input_shape=(28, 28)))

# Three hidden layers of decreasing width. A fresh initializer instance is
# created for every layer so that no two layers share one.
for hidden_units in (200, 100, 50):
    model.add(layers.Dense(hidden_units,
                           kernel_initializer=initializers.RandomNormal(stddev=0.05),
                           activation='relu'))

# 10 neurons, one per output digit. Softmax is commonly used for the output in this categorical situation.
model.add(layers.Dense(10,
                       kernel_initializer=initializers.RandomNormal(stddev=0.05),
                       activation='softmax'))

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Note that there was no mention of the input data in this model creation.
# The model is independent of the training data (other than the shape of the input).

# Let's see how big the model is, and how many trainable parameters there are
model.summary()

In [None]:
# Train the model using only 10 epochs for starters and see how we did.
# To keep the training process from using too much memory we'll send in batches of
# 64 images, instead of all the images at once.
# The test split is also passed as validation_data; that is where the per-epoch
# 'val_accuracy' readout comes from.
# Durations are measured with time.perf_counter(), a monotonic clock intended for
# timing intervals (time.time() can jump if the system clock is adjusted).
start = time.perf_counter()
model.fit(train_images_, train_labels, epochs=10, batch_size=64, validation_data=(test_images_, test_labels))
end = time.perf_counter()
print(f"\nTraining time is {round(end - start, 3)} seconds\n")

# Evaluate the model on the test split and time how long the predictions take.
start = time.perf_counter()
test_loss, test_acc = model.evaluate(test_images_, test_labels)
end = time.perf_counter()
print("\nTest accuracy:", round(test_acc * 100, 2))
print(f"\nTesting time for 10,000 examples is {round(end - start, 3)} seconds\n")

In [None]:
# For this simple deep learning model we should get a test accuracy of 97.9 +/- 0.4 percent. 
# We also measured and reported the training time and the testing/prediction time.

# Note that the training accuracy - which is reported as 'accuracy' while fitting our model to the training data - and 
# our test accuracy, reported as 'val_accuracy' - are typically a little different, but hopefully not too much. 

# In fact the val_accuracy - should have stopped getting better in any substantial
# way after epoch 5 or 6, assuming this is the first time you ran it.

# This may be a case of overfitting - and we can take some additional measures to improve our model and reduce those effects.
# These measures are called 'regularization', and I'll discuss them later on.

In [None]:
# Let's try adding a dropout layer.
# Seed TensorFlow's random generator so the weight initialization is repeatable.
tf.random.set_seed(0)

model = models.Sequential()
dropout_fraction = 0.2

# Flatten each 28 x 28 image into a vector of 784 values.
model.add(layers.Reshape((784,), input_shape=(28, 28)))

# Dropout is applied directly to the flattened input. It is not the first layer,
# so it takes its input shape from the Reshape layer above and needs no
# input_shape argument of its own.
model.add(layers.Dropout(dropout_fraction))

# Same three hidden layers as in the model without dropout. A fresh initializer
# instance is created for every layer so that no two layers share one.
for hidden_units in (200, 100, 50):
    model.add(layers.Dense(hidden_units,
                           kernel_initializer=initializers.RandomNormal(stddev=0.05),
                           activation='relu'))

# 10 neurons, one per output digit. Softmax is commonly used for the output in this categorical situation.
model.add(layers.Dense(10,
                       kernel_initializer=initializers.RandomNormal(stddev=0.05),
                       activation='softmax'))

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Let's see how big the model is, and how many trainable parameters there are.
model.summary()

In [None]:
# We will again train the model using only 10 epochs for starters and see how we did.
# To keep the training process from using too much memory we'll send in batches of
# 64 images, instead of all the images at once.
# The test split is also passed as validation_data; that is where the per-epoch
# 'val_accuracy' readout comes from.
# Durations are measured with time.perf_counter(), a monotonic clock intended for
# timing intervals (time.time() can jump if the system clock is adjusted).
start = time.perf_counter()
model.fit(train_images_, train_labels, epochs=10, batch_size=64, validation_data=(test_images_, test_labels))
end = time.perf_counter()
print(f"\nTraining time is {round(end - start, 3)} seconds\n")

# Evaluate the model on the test split and time how long the predictions take.
start = time.perf_counter()
test_loss, test_acc = model.evaluate(test_images_, test_labels)
end = time.perf_counter()
print("\nTest accuracy:", round(test_acc * 100, 2))
print(f"\nTesting time for 10,000 examples is {round(end - start, 3)} seconds\n")

In [None]:
# Adding the dropout layer, which randomly sets dropout_fraction (20%) of the input values to zero on every
# training step (and is switched off when predicting), kept the accuracy and val_accuracy much more in step with each other.

# In all my test cases, the val_accuracy and training accuracy were more similar in the latter model with drop-out. 

In [None]:
# We can also build and fit a new and different model...

# Below we create a Convolutional Neural Network or CNN model in tensorflow/keras,
# to train and predict these MNIST data.

# We need to add one more dimension to the data to get this to feed into the models (and make the tensors flow).
print(f"Before reshaping, train_images_.shape = {train_images_.shape}")

# Add the extra (channel) dimension as a new trailing axis and store the result
# in a new set of variables. Indexing with np.newaxis takes the number of images
# and their size from the arrays themselves, so nothing is hard-coded and the
# cell keeps working if the dataset is subsampled.
train_images = train_images_[..., np.newaxis]
test_images = test_images_[..., np.newaxis]

print(f"After reshaping, train_images.shape = {train_images.shape}")

# Recall that we already normalized our data.

In [None]:
# Sequential is basically a shell that holds an ordered stack of layers: convolutional
# layers, pooling steps, flattening, and dense neural network connections.
# The layers and the way they are chained together define the network architecture.
#
# The type and size of the layers, how they change toward the end of the sequence,
# the size of the convolutional kernel (3x3 here), and the activation functions used
# on the inner layers (relu vs tanh vs sigmoid) are just some of the many meta-parameters
# one can experiment with to find the best model for a given set of input data.
model = models.Sequential([
    # First convolution + pooling stage; this layer also declares the input shape.
    layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Second convolution + pooling stage.
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Final convolution, then flatten the feature maps into a single vector.
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    layers.Flatten(),

    layers.Dense(units=64, activation='relu'),

    # 10 neurons, one per output digit. Softmax is commonly used for the output
    # in this categorical situation.
    layers.Dense(units=10, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# We can see that this CNN model has only 50% of the Trainable parameters as our deep neural network.

# Train the model (10 epochs, batches of 64 images). The test split is also passed as
# validation_data; that is where the per-epoch 'val_accuracy' readout comes from.
# Durations are measured with time.perf_counter(), a monotonic clock intended for
# timing intervals (time.time() can jump if the system clock is adjusted).
start = time.perf_counter()
model.fit(train_images, train_labels, epochs=10, batch_size=64, validation_data=(test_images, test_labels))
end = time.perf_counter()
print(f"\nTraining time is {round(end - start, 3)} seconds\n")

# Evaluate the model on the test split and time how long the predictions take.
start = time.perf_counter()
test_loss, test_acc = model.evaluate(test_images, test_labels)
end = time.perf_counter()
print("\nTest accuracy:", round(test_acc * 100, 2))
print(f"\nTesting time for 10,000 examples is {round(end - start, 3)} seconds\n")

In [None]:
# Using this smaller model we obtained a higher test accuracy, although it did cost us by taking
# longer to train, and longer to predict results. These are actually relatively small models.
# Some production models can take many hours or even days to train.

# So depending on your cost structure and how often you need to retrain and incorporate the latest data, there may
# be a reason to trade off a little accuracy for faster data incorporation (feedback).

# The CNN architecture - being more complex - requires a greater number of calculations
# for each trainable parameter.

# CNNs are designed to yield more accurate predictions and a better analysis of images.

In [None]:
# This should have resulted in about 99% accuracy.
# Whatever your result (99% or otherwise), it's good to ask: is it good enough? ...and to dig a bit more into the details.

# I always like to look at a sample of the failed cases.
# Are they really that bad, or are we missing something in our model?

# So how do we find out which ones were bad?
# The first step is to get the model's predictions for every test image.
pred_labels_ = model.predict(x=test_images)

In [None]:
# The model was trained with plain integer labels (e.g. 5). The
# 'sparse_categorical_crossentropy' loss treats each integer as a class index, which
# is equivalent to comparing the network output against a one-hot vector such as:
#
#     [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]     <- the label 5
#
# The predictions come out of the 10-neuron softmax layer in a matching layout:
# one row per image holding 10 values, one per digit. They are probabilities
# rather than exact zeros and ones, and the largest value marks the predicted digit.

print(f"pred_labels_.shape = {pred_labels_.shape}")
print(f"test_labels.shape = {test_labels.shape}\n")
print(f"pred_labels_[0] = {np.round(pred_labels_[0], 3)}\ntest_labels[0] = {test_labels[0]}")

In [None]:
# So we'll need to do some manipulations to make them more easily comparable.

# Collapse each row of 10 probabilities to the single digit with the highest
# probability. np.argmax with axis=1 does this for all rows in one vectorized
# call; the result is stored as int16, one value per test image.
pred_labels = np.argmax(pred_labels_, axis=1).astype(np.int16)

# Now we can compare them and get an array indicating where there is agreement and where there is not.
truth = (pred_labels == test_labels)

# np.where returns a tuple holding one array of indexes; the mean of the boolean
# failure mask is the fraction of wrong predictions.
failure_indexes = np.where(~truth)
print(f"Error rate = {np.mean(~truth)}")

print(f"Indexes of failed predictions:\n{failure_indexes}")
print()
print(f"Labels:      {test_labels[failure_indexes]}")
print(f"Predictions: {pred_labels[failure_indexes]}")

# Looking at the first example where there was disagreement (failure).
idx = failure_indexes[0][0]

# Recall that we rescaled (normalized) the original train and test images, so we'll
# have to undo that to see them in their original form.
image_array = (test_images_[idx] * 255)

implot = plt.imshow(image_array, cmap='Greys')
plt.colorbar()


In [None]:
# OK, can you see why the computer might get this one confused?
# Look at the first value in the lists above to see what is predicted and what is the label.

# Let's randomly pick one more failure from the remaining ones (everything after the first).
remaining_failures = failure_indexes[0][1:]
idx_value = np.random.choice(remaining_failures)
print(f"Index Value = {idx_value}")

# Show what the label says and what the model predicted for that image.
print('Label:', test_labels[idx_value])
print('prediction:', pred_labels[idx_value])

# Undo the normalization again before plotting.
image_array2 = (test_images_[idx_value] * 255)
implot = plt.imshow(image_array2, cmap='Greys')
plt.colorbar()

# Does it make sense that this one is challenging to predict?

# After looking at many of these failure cases, I can often see where the confusion comes from; very
# infrequently I think the label is wrong, and sometimes I just can't figure out what I'm looking at.

In [None]:
# It would also be good to see if one type of number failed more than another, etc.

# Use one bin per digit, centred on the integer values 0..9. The default binning
# spreads 10 bins between the smallest and largest label present, so the bin
# edges do not line up with the digits.
digit_bin_edges = np.arange(11) - 0.5
plt.hist(test_labels[failure_indexes], bins=digit_bin_edges, rwidth=0.9)
plt.xticks(range(10))
plt.title("Distribution of Failed Handwriting Predictions")
plt.xlabel("Labeled Value")
plt.ylabel("Number of Failed Predictions")
plt.show()

# If it looks like it is not doing a good job on an obvious number,
# there are surely other tricks we can try to implement to bump up our accuracy.
#  - Other architectures
#  - More training
#  - Different types of regularization
#  etc.

In [None]:
# On to Lesson-03