## Import Packages, Environment Setting

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.losses as losses
import tensorflow.keras.metrics as metrics
import tensorflow.keras.optimizers as optimizers

import tensorflow_datasets as tfds

import numpy as np
import itertools as it

from bokeh.plotting import figure, show
from bokeh.layouts import row
from bokeh.io import output_notebook, reset_output

# Seed NumPy and TensorFlow so that weight initialisation and shuffling are
# more repeatable across "Restart Kernel -> Run All" (some GPU kernels may
# still introduce small run-to-run differences).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Clear any Bokeh output mode left over from a previous run, then route
# subsequent show() calls to render inline in the notebook.
reset_output()
output_notebook()

## Multilayer Perceptron
### Forward Pass
The multilayer perceptron consists of multiple layers of operations. The output of each __neuron__ in a layer is a linear combination of the columns of the input matrix followed by an activation function. For instance:

$$
\mathbf{a}^1_1 = f^1(\mathbf{X^0w^1_1}+b^1)
$$

The outputs of all neurons in the same layer are then concatenated column-wise into a matrix:

$$
\mathbf{A}=[a_{i,j}]=
\begin{bmatrix}
    \mathbf{a}_1 & \mathbf{a}_2 & \cdots & \mathbf{a}_n
\end{bmatrix}=
\begin{bmatrix}
    a_{1,1} & a_{1,2} & \cdots & a_{1,n}\\
    a_{2,1} & a_{2,2} & \cdots & a_{2,n}\\
    \vdots & \vdots & \ddots & \vdots\\
    a_{m,1} & a_{m,2} & \cdots & a_{m,n}\\
\end{bmatrix}
$$

and fed to the next layer as its input. This process continues until the last layer of the network, which generates the prediction $\mathbf{\hat{Y}}$.

### Backpropagation
The objective of training this network is to find the optimal weights $\mathbf{W}^*$ that minimize the objective function between the output of the network $\mathbf{\hat{Y}}$ and the ground truth target $\mathbf{Y}$. To be more specific, we first compute the gradient of the objective function with respect to the output of the network $\mathbf{\hat{Y}}$, the Jacobian matrix of every activation function in the last layer with respect to its linear transformation, and the Jacobian matrix of the linear transformation with respect to the weights:
$$
\nabla_{\mathbf{\hat{Y}}}L(\mathbf{Y}, \mathbf{\hat{Y}}),\;\;\;J_{f^k}(\mathbf{x}_i^k)=\frac{\partial\mathbf{a}_i^k}{\partial\mathbf{x}_i^k},\;\;\;J_{\mathbf{A}^{k-1}\mathbf{w}^k_i}(\mathbf{w}_i^k)=\frac{\partial\mathbf{x}_i^k}{\partial\mathbf{w}_i^k}=\mathbf{A}^{k-1}
$$
The gradient of the objective function with respect to the weight in the last layer is:
$$
\nabla_{\mathbf{w}_i^k}L(\mathbf{Y}, \mathbf{\hat{Y}})=(\frac{\partial\mathbf{x}_i^{k}}{\partial\mathbf{w}_i^k})^T(\frac{\partial\mathbf{a}_i^k}{\partial\mathbf{x}_i^k})^T\nabla_{\mathbf{\hat{Y}}}L(\mathbf{Y}, \mathbf{\hat{Y}})
$$
The process continues until the input layer is reached. The gradient of the objective function with respect to the weight $\mathbf{w}_i$ in layer $l$ can be written as:
$$
\nabla_{w_i^l}L(\mathbf{Y}, \mathbf{\hat{Y}})=(\frac{\partial\mathbf{x}_i^l}{\partial\mathbf{w}_i^l})^T(\frac{\partial\mathbf{a}_i^l}{\partial\mathbf{x}_i^l})^T\sum\limits_{j=1}^{n_{l+1}}\nabla_\mathbf{w_j^{l+1}}L(\mathbf{Y}, \mathbf{\hat{Y}})
$$
After computing the gradients, we can apply gradient descent, in combination with an optimizer, to update the weights $\mathbf{W}^l$ of each layer.

![multilayer_perceptron](assets/multilayer_perceptron.png)

In [2]:
# Look the builder up by name: the registered name 'mnist' is stable, whereas
# the module path of the MNIST class depends on the installed TFDS version.
mnist = tfds.builder('mnist')

# `as_dataset` only reads data that has already been prepared on disk, so
# download/prepare it first. The call returns quickly when the prepared data
# is already present.
mnist.download_and_prepare()

# batch_size=-1 returns every split as one full batch (a dict with 'image'
# and 'label' entries) rather than an iterable dataset.
mnist_data = mnist.as_dataset(batch_size=-1, shuffle_files=True)
mnist_train, mnist_test = mnist_data["train"], mnist_data["test"]

In [3]:
# Preview one training sample together with its ground-truth label.
sample_index = 0
image = mnist_train['image'][sample_index]
label = mnist_train['label'][sample_index]

# Drop the channel axis and flip vertically so the digit is drawn upright
# (Bokeh's image glyph places the first row at the bottom by default).
pixels = np.flipud(tf.reshape(image, (28, 28)).numpy())

fig = figure(title=f'Label: {label}', plot_height=250, plot_width=250, tools=[])
fig.x_range.range_padding = 0
fig.y_range.range_padding = 0
fig.image(image=[pixels], x=0, y=0, dw=10, dh=10)
show(fig)

## Define Network Structure

In [4]:
# A small MLP classifier: 28x28x1 image -> 128 ReLU units -> 10 class logits.
model = keras.Sequential([
    # MNIST images are stored as 8-bit intensities (0..255). Scale them to
    # [0, 1] inside the model so that fit() and predict() can keep receiving
    # the raw images while the Dense layers see well-conditioned inputs.
    keras.layers.Lambda(lambda pixels: tf.cast(pixels, tf.float32) / 255.0,
                        input_shape=(28, 28, 1)),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation='relu'),
    # No activation here: the network outputs raw logits, one per digit class.
    keras.layers.Dense(10)
])

## Define Network Properties

In [5]:
# The last Dense layer has no activation, so the loss is told to expect
# logits rather than probabilities.
optimizer = optimizers.Adam()
loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = metrics.SparseCategoricalAccuracy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

In [6]:
# Train for 10 epochs, holding out 25% of the training samples for validation.
history = model.fit(
    x=mnist_train['image'],
    y=mnist_train['label'],
    epochs=10,
    validation_split=0.25,
)

Train on 45000 samples, validate on 15000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# Pull the per-epoch curves recorded by fit() out of the History object.
metric_log = history.history
train_loss = metric_log['loss']
val_loss = metric_log['val_loss']
train_accuracy = metric_log['sparse_categorical_accuracy']
val_accuracy = metric_log['val_sparse_categorical_accuracy']

In [8]:
def _metric_figure(title, y_label, epochs, train_values, val_values):
    """Return a Bokeh figure comparing a training and a validation curve.

    Args:
        title: Figure title.
        y_label: Label for the y axis.
        epochs: Sequence of epoch numbers used as x coordinates.
        train_values: Per-epoch metric values on the training data.
        val_values: Per-epoch metric values on the validation data.
    """
    curve_fig = figure(title=title, plot_height=500, plot_width=500, tools=[],
                       x_axis_label='epoch', y_axis_label=y_label)
    curve_fig.line(epochs, train_values, color='salmon', line_width=5, legend_label='train')
    curve_fig.line(epochs, val_values, color='limegreen', line_width=5, legend_label='validation')
    return curve_fig


# Epochs are numbered from 1 on the x axis.
num_epochs = np.arange(len(train_loss)) + 1
fig_loss = _metric_figure('Loss', 'loss', num_epochs, train_loss, val_loss)
fig_acc = _metric_figure('Accuracy', 'accuracy', num_epochs, train_accuracy, val_accuracy)
show(row(fig_acc, fig_loss))

In [9]:
# Score every test image; each row of y_pred holds the 10 class logits
# produced by the final Dense layer.
test_images = mnist_test['image']
y_pred = model.predict(test_images)

In [10]:
# Show one test sample with its true label and the predicted class
# (the index of the largest logit for that sample).
sample_index = 0
image = mnist_test['image'][sample_index]
label = mnist_test['label'][sample_index]
pred = y_pred[sample_index].argmax()

# Drop the channel axis and flip vertically so the digit is drawn upright.
pixels = np.flipud(tf.reshape(image, (28, 28)).numpy())

fig = figure(title=f'Label: {label}, Prediction: {pred}', plot_height=250, plot_width=250, tools=[])
fig.x_range.range_padding = 0
fig.y_range.range_padding = 0
fig.image(image=[pixels], x=0, y=0, dw=10, dh=10)
show(fig)

In [11]:
# Show a second test sample (index 16) with its true label and the predicted
# class (the index of the largest logit for that sample).
sample_index = 16
image = mnist_test['image'][sample_index]
label = mnist_test['label'][sample_index]
pred = y_pred[sample_index].argmax()

# Drop the channel axis and flip vertically so the digit is drawn upright.
pixels = np.flipud(tf.reshape(image, (28, 28)).numpy())

fig = figure(title=f'Label: {label}, Prediction: {pred}', plot_height=250, plot_width=250, tools=[])
fig.x_range.range_padding = 0
fig.y_range.range_padding = 0
fig.image(image=[pixels], x=0, y=0, dw=10, dh=10)
show(fig)