In [40]:
# Parameters
# The batch size, number of training epochs and locations of the data files are defined here.
# Data files are hosted in a Google Cloud Storage (GCS) bucket which is why their address starts with gs://.

# Number of examples per training mini-batch (passed to get_training_dataset).
BATCH_SIZE = 128
# Number of training epochs passed to model.fit.
EPOCHS = 10

# Image and label files for the training and validation splits, read straight from GCS.
training_images_file   = 'gs://mnist-public/train-images-idx3-ubyte'
training_labels_file   = 'gs://mnist-public/train-labels-idx1-ubyte'
validation_images_file = 'gs://mnist-public/t10k-images-idx3-ubyte'
validation_labels_file = 'gs://mnist-public/t10k-labels-idx1-ubyte'

In [41]:
# Imports
# All the necessary Python libraries are imported here, including TensorFlow.

import os, re, math, json, shutil, pprint
import numpy as np
import tensorflow as tf
# Record the TensorFlow version in the cell output so a run can be tied to the library version used.
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.9.2


In [42]:
# Parse files and prepare training and validation datasets.

# Short alias for tf.data's autotune sentinel; handed to .prefetch() below so the
# prefetch buffer size is chosen by tf.data at runtime instead of being hard-coded.
AUTO = tf.data.experimental.AUTOTUNE

def read_label(tf_bytestring):
    """Turn one raw label record into a one-hot vector of depth 10.

    Args:
        tf_bytestring: scalar string tensor holding a single label record.
            The reshape to a scalar below only succeeds if the record decodes
            to exactly one uint8 value.

    Returns:
        A one-hot tensor with 10 entries (tf.one_hot's default float dtype).
    """
    raw_byte = tf.io.decode_raw(tf_bytestring, tf.uint8)
    # Collapse the length-1 vector produced by decode_raw into a scalar class index.
    class_index = tf.reshape(raw_byte, [])
    return tf.one_hot(class_index, 10)
  
def read_image(tf_bytestring):
    """Turn one raw image record into a flat float32 vector of 28*28 values.

    Args:
        tf_bytestring: scalar string tensor holding one image record of
            28*28 bytes.

    Returns:
        A float32 tensor of shape [28*28]; each byte is divided by 256.0, so
        values lie in [0, 1).
    """
    pixels = tf.io.decode_raw(tf_bytestring, tf.uint8)
    scaled = tf.cast(pixels, tf.float32) / 256.0
    return tf.reshape(scaled, [28*28])

In [43]:
# Apply this function to the dataset using .map and obtain a dataset of images.
# The same kind of reading and decoding is done for the labels, and .zip then pairs each image with its label.

def load_dataset(image_file, label_file):
    """Build a dataset of (image, label) pairs from two fixed-length record files.

    Args:
        image_file: path of the image file; records are 28*28 bytes, after a
            16-byte header that is skipped.
        label_file: path of the label file; records are 1 byte, after an
            8-byte header that is skipped.

    Returns:
        A tf.data.Dataset that yields (image, label) tuples, decoded by
        read_image and read_label respectively.
    """
    images = tf.data.FixedLengthRecordDataset(
        image_file, record_bytes=28*28, header_bytes=16
    ).map(read_image, num_parallel_calls=16)

    labels = tf.data.FixedLengthRecordDataset(
        label_file, record_bytes=1, header_bytes=8
    ).map(read_label, num_parallel_calls=16)

    # Pair the i-th image with the i-th label.
    return tf.data.Dataset.zip((images, labels))

In [44]:
# Training dataset
# The tf.data.Dataset API has all the necessary utility functions for preparing datasets.
# .cache caches the dataset in RAM
# .shuffle shuffles it with a buffer of 5000 elements
# .repeat loops the dataset
# .batch pulls multiple images and labels together into a mini-batch
# .prefetch can use the CPU to prepare the next batch while the current batch is being trained on the GPU
# The validation dataset is prepared in a similar way.

def get_training_dataset(image_file, label_file, batch_size):
    """Build the shuffled, repeating, batched training input pipeline.

    Args:
        image_file: path of the training image file.
        label_file: path of the training label file.
        batch_size: number of examples per mini-batch.

    Returns:
        A tf.data.Dataset of (images, labels) mini-batches that never ends,
        because of .repeat().
    """
    return (
        load_dataset(image_file, label_file)
        # The dataset is small enough to be cached whole in RAM; on TPU this matters for throughput.
        .cache()
        .shuffle(5000, reshuffle_each_iteration=True)
        # Keras needs a repeating dataset here (for now).
        .repeat()
        # Fixed batch size: partial batches are dropped, which TPUs require.
        .batch(batch_size, drop_remainder=True)
        # Prepare upcoming batches while the current one trains; buffer size is autotuned.
        .prefetch(AUTO)
    )
  
def get_validation_dataset(image_file, label_file):
    """Build the validation input pipeline: one repeating batch of 10000 items.

    Args:
        image_file: path of the validation image file.
        label_file: path of the validation label file.

    Returns:
        A tf.data.Dataset of (images, labels) batches of 10000 items each,
        repeated indefinitely.
    """
    return (
        load_dataset(image_file, label_file)
        # The dataset is small enough to be cached whole in RAM; on TPU this matters for throughput.
        .cache()
        # The eval set has 10000 items, so this is the whole set in a single batch.
        .batch(10000, drop_remainder=True)
        # Keras needs a repeating dataset here (for now).
        .repeat()
    )

# Concrete dataset objects consumed by model.fit.
training_dataset = get_training_dataset(
    image_file=training_images_file,
    label_file=training_labels_file,
    batch_size=BATCH_SIZE,
)
validation_dataset = get_validation_dataset(
    image_file=validation_images_file,
    label_file=validation_labels_file,
)

# TPU code paths need a zero-argument callable that builds the dataset on demand.
training_input_fn = lambda: get_training_dataset(
    image_file=training_images_file,
    label_file=training_labels_file,
    batch_size=BATCH_SIZE,
)
validation_input_fn = lambda: get_validation_dataset(
    image_file=validation_images_file,
    label_file=validation_labels_file,
)

In [45]:
# Keras Model
# The models will be straight sequences of layers so tf.keras.Sequential is used to create them. 
# The final layer has 10 neurons because we are classifying handwritten digits into 10 classes.
# A Keras model also needs to know the shape of its inputs, tf.keras.layers.Input can be used to define it.
# Configuring the model is done in Keras using the model.compile function. 
# A classification model requires a cross-entropy loss function, called 'categorical_crossentropy' in Keras. 
# The model computes the 'accuracy' metric, which is the percentage of correctly classified images.

# Convolutional classifier: three conv stages, then a dense stage, then a softmax over 10 classes.
# The conv and hidden dense layers omit their bias (use_bias=False); the batch-norm layer that
# follows each of them keeps its own offset (center=True).
model = tf.keras.Sequential([
    # Flat 784-value vectors back to 28x28 single-channel images.
    tf.keras.layers.Reshape(target_shape=(28, 28, 1), input_shape=(28*28,)),

    # Stage 1: 3x3 convolution, 12 filters, resolution unchanged.
    tf.keras.layers.Conv2D(filters=12, kernel_size=3, padding='same', use_bias=False),
    tf.keras.layers.BatchNormalization(scale=False, center=True),
    tf.keras.layers.Activation('relu'),

    # Stage 2: 6x6 convolution, 24 filters, stride 2 halves the resolution.
    tf.keras.layers.Conv2D(filters=24, kernel_size=6, strides=2, padding='same', use_bias=False),
    tf.keras.layers.BatchNormalization(scale=False, center=True),
    tf.keras.layers.Activation('relu'),

    # Stage 3: 6x6 convolution, 32 filters, stride 2 halves the resolution again.
    tf.keras.layers.Conv2D(filters=32, kernel_size=6, strides=2, padding='same', use_bias=False),
    tf.keras.layers.BatchNormalization(scale=False, center=True),
    tf.keras.layers.Activation('relu'),

    tf.keras.layers.Flatten(),

    # Dense stage with 200 units.
    tf.keras.layers.Dense(200, use_bias=False),
    tf.keras.layers.BatchNormalization(scale=False, center=True),
    tf.keras.layers.Activation('relu'),

    # Dropout against overfitting, then one probability per digit class.
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(10, activation='softmax'),
])

# Handwritten digits are made of shapes and this shape information is lost when the pixels are flattened to make a single vector.
# Convolutional networks can leverage the shape information.
# Convolutional neural networks apply a series of learnable filters to the input image. 
# A convolutional layer is defined by the filter size, the number of filters applied and the stride. 
# The input and the output of a convolutional layer each have three dimensions (width, height, number of channels), starting with the input image (width, height, RGB channels). 
# To fix signs of overfitting a dropout layer is added.
# Batch normalization helps neural networks converge and usually allows you to train faster by allowing the network to decide how much centering and re-scaling to apply at each neuron.

# Configure training: Adam optimizer, cross-entropy loss over the one-hot labels, accuracy metric.
# `learning_rate` is the supported argument name; the old `lr` alias is deprecated in
# TF 2.x and rejected by newer Keras releases. The initial rate of 0.01 matches the
# epoch-0 value of the lr_decay schedule defined below.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Implementing a learning rate schedule that decays the learning rate exponentially. 
# lr decay function
def lr_decay(epoch):
  """Return the learning rate for `epoch`: 0.01 multiplied by 0.6 once per elapsed epoch.

  Args:
    epoch: zero-based epoch index.

  Returns:
    The learning rate as a float (0.01 for epoch 0, 0.006 for epoch 1, ...).
  """
  initial_rate = 0.01
  decay_factor = 0.6
  return initial_rate * math.pow(decay_factor, epoch)

# Keras callback that applies lr_decay at the start of every epoch and logs the chosen rate.
lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(
    lr_decay,
    verbose=True,
)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         
                                                                 
 conv2d (Conv2D)             (None, 28, 28, 12)        108       
                                                                 
 batch_normalization (BatchN  (None, 28, 28, 12)       36        
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 28, 28, 12)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 14, 14, 24)        10368     
                                                                 
 batch_normalization_1 (Batc  (None, 14, 14, 24)       72        
 hNormalization)                                      

In [46]:
# Train and validate the model
# The training happens by calling model.fit and passing in both the training and validation datasets.

# The training set has 60,000 items. The training dataset repeats forever, so Keras
# must be told how many batches make up one epoch.
steps_per_epoch = 60000 // BATCH_SIZE
print("Steps per epoch: ", steps_per_epoch)

# validation_steps=1 because the validation dataset yields the whole eval set as one batch.
history = model.fit(
    training_dataset,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=validation_dataset,
    validation_steps=1,
    callbacks=[lr_decay_callback],
)

Steps per epoch:  468

Epoch 1: LearningRateScheduler setting learning rate to 0.01.
Epoch 1/10

Epoch 2: LearningRateScheduler setting learning rate to 0.006.
Epoch 2/10

Epoch 3: LearningRateScheduler setting learning rate to 0.0036.
Epoch 3/10

Epoch 4: LearningRateScheduler setting learning rate to 0.0021599999999999996.
Epoch 4/10

Epoch 5: LearningRateScheduler setting learning rate to 0.001296.
Epoch 5/10

Epoch 6: LearningRateScheduler setting learning rate to 0.0007775999999999998.
Epoch 6/10

Epoch 7: LearningRateScheduler setting learning rate to 0.0004665599999999999.
Epoch 7/10

Epoch 8: LearningRateScheduler setting learning rate to 0.00027993599999999994.
Epoch 8/10

Epoch 9: LearningRateScheduler setting learning rate to 0.00016796159999999993.
Epoch 9/10

Epoch 10: LearningRateScheduler setting learning rate to 0.00010077695999999997.
Epoch 10/10


In [47]:
# The shape of the output tensor is [128, 10] because 128 images are processed at a time; the argmax is computed across the 10 probabilities returned for each image, hence axis=1.
# This model recognises more than 99% of the digits.