## Import Packages, Environment Setting

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.losses as losses
import tensorflow.keras.metrics as metrics
import tensorflow.keras.optimizers as optimizers

import tensorflow_datasets as tfds

import numpy as np
import itertools as it

from bokeh.plotting import figure, show
from bokeh.layouts import row
from bokeh.io import output_notebook, reset_output

# Clear any previous Bokeh output configuration, then render plots inline.
reset_output()
output_notebook()

# Enable on-demand GPU memory growth for every detected GPU.
# Iterating (instead of indexing element 0) keeps this cell working on
# CPU-only machines, where the device list is empty, and applies the same
# setting to all GPUs, which TensorFlow requires on multi-GPU hosts.
physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

## Multilayer Perceptron
### Forward Propagation

The multilayer perceptron consists of several layers of operations. The output of each **neuron** in the layer is the **linear combination** of the input tensor followed by an **activation function**. For instance, in $Layer^1$, the first neuron $\mathbf{a}^1_1$ is:

$\mathbf{a}^1_1 = f^1(\mathbf{X^0w^1_1}+b^1)$

The output of every neuron in $Layer^1$ will then be stacked into a new Tensor:

$\mathbf{A}^1=\begin{bmatrix}    \mathbf{a}^1_1 & \mathbf{a}^1_2 & \cdots & \mathbf{a}^1_{n_1}\end{bmatrix}=\begin{bmatrix}    a_{11}^1 & a_{21}^1 & \cdots & a_{n_11}^1\\    a_{12}^1 & a_{22}^1 & \cdots & a_{n_12}^1\\    \vdots & \vdots & \ddots & \vdots\\    a_{1m}^1 & a_{2m}^1 & \cdots & a_{n_1m}^1\\\end{bmatrix}$

where $n_1$ is the number of neurons in $Layer^1$ and $m$ is the number of samples.

We can denote the output of each layer with $a_{ij}^l$, where $l$ is the index of the layer, $i$ is the index of the neuron in $Layer^l$, and $j$ is the index of the sample. The output of $Layer^{l}$ is then fed to the next layer $Layer^{l+1}$ as its input. This process continues until the last layer $Layer^{k}$ of the network, which generates the prediction $\mathbf{\hat{Y}}=\mathbf{A}^k$. Note that the number of neurons in $Layer^k$ has to match the shape of $\mathbf{Y}$ (the number of classes in classification, or the number of targets in regression). Note also that the weights of different neurons are initialized differently, and therefore the neurons in the same layer generate slightly different outputs (this will be discussed in Weight Initialization).

![First Layer](assets/multilayer_perceptron_first_layer.png)

### Back Propagation

The goal to train this multilayer perceptron network is to find the optimal weight $\mathbf{W}^*$ in every layer that optimizes the objective function between the output of the network $\mathbf{\hat{Y}}$ and the ground truth target $\mathbf{Y}$. To be more specific, we first need to compute the gradient of the objective function with respect to the output of the network $\mathbf{\hat{Y}}$: 

$\nabla_\mathbf{\hat{Y}}L(\mathbf{Y}, \mathbf{\hat{Y}})$

Next, we need the Jacobian matrix of the output of every neuron in $Layer^k$ with respect to the linear combination $\mathbf{X}^k$:

$J_{f^k}(\mathbf{x}_i^k)=\frac{\partial\mathbf{a}^k_i}{\partial\mathbf{x}_i^k}$

And the Jacobian matrix of the output of linear combination with respect to the weight $\mathbf{w}$:

$J_{\mathbf{A}^{k-1}\mathbf{w}^k_i}(\mathbf{w}_i^k)=\frac{\partial\mathbf{x}_i^k}{\partial\mathbf{w}_i^k}=\mathbf{A}^{k-1}$

where $k$ denotes the layer in the multilayer perceptron network. The gradient of the objective function with respect to the weight in the last layer $\mathbf{w}^k$ is:

$\nabla_{\mathbf{w}_i^k}L(\mathbf{Y}, \mathbf{\hat{Y}})=(\frac{\partial\mathbf{x}_i^{k}}{\partial\mathbf{w}_i^k})^T(\frac{\partial\mathbf{a}_i^k}{\partial\mathbf{x}_i^k})^T\nabla_\mathbf{\hat{Y}}L(\mathbf{Y}, \mathbf{\hat{Y}})$

The process continues from the output layer back to the input layer; therefore, it is called back propagation. The gradient of the objective function with respect to the weight $\mathbf{w}_i$ (contributing to neuron $i$) in layer $l$ can be written as:

$\nabla_{w_i^l}L(\mathbf{Y}, \mathbf{\hat{Y}})=(\frac{\partial\mathbf{x}_i^l}{\partial\mathbf{w}_i^l})^T(\frac{\partial\mathbf{a}_i^l}{\partial\mathbf{x}_i^l})^T\sum\limits_{j=1}^{n_{l+1}}\nabla_\mathbf{w_j^{l+1}}L(\mathbf{Y}, \mathbf{\hat{Y}})$

After computing the gradient, we can apply gradient descent in combination with optimizers to optimize the weights in each layer $\mathbf{W}^l$.

![Multilayer Perceptron](assets/multilayer_perceptron.png)

## Visualize Dataset

In [2]:
# Build the MNIST dataset. download_and_prepare() must run before as_dataset();
# without it this cell only works when the data already happens to be on disk.
# It is safe to re-run: already-prepared data is reused.
mnist = tfds.image.MNIST()
mnist.download_and_prepare()

# batch_size=-1 loads every split as one full batch, so each split is a
# dict with indexable 'image' and 'label' entries.
mnist_data = mnist.as_dataset(batch_size=-1, shuffle_files=True)
mnist_train, mnist_test = mnist_data["train"], mnist_data["test"]

In [3]:
# Display the first training sample together with its label.
image = mnist_train['image'][0]
label = mnist_train['label'][0]

# Drop the channel axis and flip vertically so the digit is drawn upright.
pixels = np.flipud(tf.reshape(image, (28, 28)).numpy())

fig = figure(title=f'Label: {label}', plot_height=250, plot_width=250, tools=[])
fig.x_range.range_padding = 0
fig.y_range.range_padding = 0
fig.image(image=[pixels], x=0, y=0, dw=10, dh=10)
show(fig)

## Define Network Structure

In [4]:
class MLP(tf.keras.Model):
    """Multilayer perceptron with a single hidden layer for 28x28x1 images.

    The input is flattened, passed through one dense hidden layer and then
    through a 10-unit dense output layer that has no activation.
    """

    def __init__(self, encoding_dim, activation='relu'):
        """Build the network.

        Args:
            encoding_dim: Number of units in the hidden dense layer.
            activation: Activation used by the hidden dense layer.
        """
        super().__init__()
        self.encoding_dim = encoding_dim
        self.activation = activation

        # Layers are created in forward order: flatten -> hidden -> output.
        stack = [
            keras.layers.Flatten(input_shape=(28, 28, 1)),
            keras.layers.Dense(encoding_dim, activation=activation),
            keras.layers.Dense(10),
        ]
        self.network = keras.Sequential(stack)

    def call(self, x):
        """Run the forward pass and return the output of the final layer."""
        outputs = self.network(x)
        return outputs

## Initialize Network and Define Network Properties

In [5]:
# Create a network with a 128-unit hidden layer.
model = MLP(encoding_dim=128)

# The loss is configured with from_logits=True because the model's last
# layer has no activation.
adam = optimizers.Adam()
cross_entropy = losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = metrics.SparseCategoricalAccuracy()
model.compile(optimizer=adam, loss=cross_entropy, metrics=[accuracy])

In [6]:
# Train for 10 epochs, holding out 25% of the training samples for validation.
train_images, train_labels = mnist_train['image'], mnist_train['label']
history = model.fit(train_images, train_labels, epochs=10, validation_split=0.25)

Train on 45000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# Pull the per-epoch curves out of the training history.
epoch_logs = history.history
train_loss = epoch_logs['loss']
val_loss = epoch_logs['val_loss']
train_accuracy = epoch_logs['sparse_categorical_accuracy']
val_accuracy = epoch_logs['val_sparse_categorical_accuracy']

In [8]:
# Epoch numbers 1..N used as the x axis of both panels.
num_epochs = np.arange(1, len(train_loss) + 1)

# Loss panel: training and validation curves.
fig_loss = figure(title=f'Loss', plot_height=500, plot_width=500, tools=[])
for curve, curve_color, curve_name in (
    (train_loss, 'salmon', 'train'),
    (val_loss, 'limegreen', 'validation'),
):
    fig_loss.line(num_epochs, curve, color=curve_color, line_width=5, legend_label=curve_name)

# Accuracy panel: same colors and legend labels as the loss panel.
fig_acc = figure(title=f'Accuracy', plot_height=500, plot_width=500, tools=[])
for curve, curve_color, curve_name in (
    (train_accuracy, 'salmon', 'train'),
    (val_accuracy, 'limegreen', 'validation'),
):
    fig_acc.line(num_epochs, curve, color=curve_color, line_width=5, legend_label=curve_name)

# Accuracy on the left, loss on the right.
show(row(fig_acc, fig_loss))

In [9]:
# Run the trained model on the test images; the result has one row per sample.
test_images = mnist_test['image']
y_pred = model.predict(test_images)

In [10]:
# Show the first test sample with its true label and the predicted class.
sample_index = 0
image = mnist_test['image'][sample_index]
label = mnist_test['label'][sample_index]
pred = np.argmax(y_pred, axis=1)[sample_index]

# Drop the channel axis and flip vertically so the digit is drawn upright.
pixels = np.flipud(tf.reshape(image, (28, 28)).numpy())

fig = figure(title=f'Label: {label}, Prediction: {pred}', plot_height=250, plot_width=250, tools=[])
fig.x_range.range_padding = 0
fig.y_range.range_padding = 0
fig.image(image=[pixels], x=0, y=0, dw=10, dh=10)
show(fig)

In [11]:
# Show the test sample at index 16 with its true label and the predicted class.
sample_index = 16
image = mnist_test['image'][sample_index]
label = mnist_test['label'][sample_index]
pred = np.argmax(y_pred, axis=1)[sample_index]

# Drop the channel axis and flip vertically so the digit is drawn upright.
pixels = np.flipud(tf.reshape(image, (28, 28)).numpy())

fig = figure(title=f'Label: {label}, Prediction: {pred}', plot_height=250, plot_width=250, tools=[])
fig.x_range.range_padding = 0
fig.y_range.range_padding = 0
fig.image(image=[pixels], x=0, y=0, dw=10, dh=10)
show(fig)