## Import Packages, Environment Setting

In [1]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf

from bokeh.io import output_notebook, export_png, reset_output
from bokeh.layouts import row, column
from bokeh.palettes import linear_palette, Magma256
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import Range1d, CustomJS, Slider, ColorBar, LinearColorMapper
from tensorflow.keras import models
from tensorflow.keras import activations
from selenium.webdriver import Chrome, ChromeOptions

# Clear any previously configured Bokeh output mode, then direct subsequent
# show() calls to render inline in the notebook.
reset_output()
output_notebook()

In [3]:
# Build a headless Chrome session (no visible browser window).
options = ChromeOptions()

options.add_argument('--headless')
# Emulate a device pixel ratio of 1.0 — presumably so exported PNGs are not
# up-scaled on HiDPI displays; TODO confirm.
metrics = { "deviceMetrics": { "pixelRatio": 1.0 } }
options.add_experimental_option("mobileEmulation", metrics)
# NOTE(review): none of the visible cells passes web_driver to export_png, and
# it is never quit — confirm whether it is meant to be used for the PNG export.
web_driver = Chrome(options=options)

In [20]:
def visualize(name, X, y, gradient, ymin=None, ymax=None, color='black', webdriver=None):
    """Plot a function next to its derivative and save the pair as a PNG.

    Two line charts are drawn side by side (left: ``y`` against ``X``; right:
    ``gradient`` against ``X``), shown via ``show`` and written to
    ``{name}.png``.

    Args:
        name: Title of the left chart and stem of the PNG file name.
        X: Input values; must expose ``.numpy()``.
        y: Function values for ``X``; must expose ``.numpy()``.
        gradient: Derivative values for ``X``; must expose ``.numpy()``.
        ymin: Lower y-limit of the left chart. ``None`` uses ``min(y)``.
        ymax: Upper y-limit of the left chart. ``None`` uses ``max(y)``.
        color: Line colour used in both charts.
        webdriver: Optional Selenium webdriver handed to ``export_png``. When
            ``None`` (the default) ``export_png`` is called exactly as before.
    """
    X = X.numpy()
    y = y.numpy()
    gradient = gradient.numpy()

    xmin, xmax = np.min(X), np.max(X)
    # Compare against None explicitly: a caller-supplied bound of 0 is falsy
    # and would otherwise be silently replaced by the data min/max.
    if ymin is None:
        ymin = np.min(y)
    if ymax is None:
        ymax = np.max(y)

    fig1 = figure(title=name, tools=[], x_axis_label='X', y_axis_label='f(X)')
    fig1.line(X, y, line_width=5, color=color)
    fig1.x_range = Range1d(xmin, xmax)
    fig1.y_range = Range1d(ymin, ymax)

    fig2 = figure(
        title=f'{name} (Derivative)',
        tools=[],
        x_axis_label='X',
        y_axis_label='gradient(X)'
    )
    fig2.line(X, gradient, line_width=5, color=color)
    fig2.x_range = Range1d(xmin, xmax)
    # The derivative chart always uses the same fixed window so that charts
    # produced by different calls share one vertical scale.
    fig2.y_range = Range1d(-1.25, 1.25)

    # Identical typography for both charts.
    for fig in (fig1, fig2):
        fig.title.text_font_size = '16pt'
        fig.xaxis.axis_label_text_font_size = '16pt'
        fig.yaxis.axis_label_text_font_size = '16pt'
        fig.title.align = 'center'

    show(row(fig1, fig2))
    # Forward the webdriver only when one was supplied, so the default call is
    # unchanged from the original behaviour.
    export_kwargs = {} if webdriver is None else {'webdriver': webdriver}
    export_png(row(fig1, fig2), filename=f'{name}.png', **export_kwargs)
    
def visualize2D(name, y, gradient, webdriver=None):
    """Render a two-column output and its two gradients as a grid of heat maps.

    Six images are arranged in three rows of two: row 1 holds columns 0 and 1
    of ``y``, row 2 those of ``gradient[0]`` and row 3 those of
    ``gradient[1]``. Every image is drawn over the fixed square
    ``[-1, 1] x [-1, 1]`` with axes labelled ``X1`` and ``X2``. The grid is
    shown via ``show`` and written to ``{name}.png``.

    Args:
        name: Prefix of every panel title and stem of the PNG file name.
        y: Object exposing ``.numpy()`` that yields a 2-D array with at least
            two columns; each column is reshaped into a square image, so the
            number of rows must be a perfect square.
        gradient: Sequence whose first two items have the same layout as ``y``.
        webdriver: Optional Selenium webdriver handed to ``export_png``. When
            ``None`` (the default) ``export_png`` is called exactly as before.

    Raises:
        ValueError: If a target's row count is not a perfect square.
    """
    labels = (
        f'{name} y',
        f'{name} y1 derivative with respect to X',
        f'{name} y2 derivative with respect to X'
    )

    figure_list = []
    for target, label in zip((y, gradient[0], gradient[1]), labels):
        # Convert once per target instead of once per use inside the panel loop.
        values = target.numpy()

        # Derive the image side from the data rather than assuming 50 x 50.
        n_points = values.shape[0]
        side = int(round(float(np.sqrt(n_points))))
        if side * side != n_points:
            raise ValueError(
                f'cannot arrange {n_points} points into a square image'
            )

        # Both panels of a target share one colour scale spanning all its values.
        low, high = values.min(), values.max()

        for i in range(2):
            fig = figure(
                title=f'{label}{i+1}',
                tools=[],
                x_axis_label='X1',
                y_axis_label='X2',
                x_range=(-1, 1),
                y_range=(-1, 1)
            )

            color_mapper = LinearColorMapper(Magma256, low=low, high=high)
            color_bar = ColorBar(
                color_mapper=color_mapper,
                border_line_color=None,
                location=(0, 0),
                label_standoff=8,
                scale_alpha=0.5
            )

            fig.add_layout(color_bar, 'right')
            fig.image(
                image=[values[:, i].reshape(side, side)],
                name='y1',
                x=-1, y=-1, dw=2, dh=2,
                alpha=0.5,
                color_mapper=color_mapper)

            figure_list.append(fig)

    def build_grid():
        # A fresh layout is built for each consumer (show / export), mirroring
        # the original code, which constructed the layout twice.
        return column(*(
            row(figure_list[k], figure_list[k + 1])
            for k in range(0, len(figure_list), 2)
        ))

    show(build_grid())

    # Forward the webdriver only when one was supplied, so the default call is
    # unchanged from the original behaviour.
    export_kwargs = {} if webdriver is None else {'webdriver': webdriver}
    export_png(build_grid(), filename=f'{name}.png', **export_kwargs)

In [5]:
# 100 evenly spaced inputs on [-10, 10]; a Variable so GradientTape tracks it automatically.
X = tf.Variable(tf.linspace(start=-10., stop=10., num=100))

## Identity Function
* Equation
$$f(x)=x$$
* Derivative
$$\begin{align}\frac{\partial f(x)}{\partial x}=1\end{align}$$
* Properties:
    * Mathematically __does not__ affect the network at all. 
    * Practically, in order to unify the design structure of the neural network (as a linear transformation followed by an activation function), most deep learning frameworks implement the identity function as one of the activation functions.

In [8]:
# Record the identity activation on the tape, then differentiate it w.r.t. X.
with tf.GradientTape() as g:
    y = activations.linear(X)
gradient = g.gradient(target=y, sources=X)
visualize(name='Identity', X=X, y=y, gradient=gradient)

## Sigmoid
* Equation
$$\sigma(x)=\frac{1}{1+\exp(-x)}$$
* [Derivative](https://math.stackexchange.com/questions/78575/derivative-of-sigmoid-function-sigma-x-frac11e-x)
$$\frac{\partial\sigma(x)}{\partial x}=\sigma(x)(1-\sigma(x))$$
* Properties:
    * A neural network with no hidden layer and a single sigmoid output unit (a linear transformation followed by the sigmoid function) is equivalent to __logistic regression__.
    * One very useful property of sigmoid function is that the range is $0$ to $1$ (bounded range); therefore, it is commonly used when we want to model __probability__ (e.g. output layer of binary classification problem). 
    * In binary classification problem, sigmoid function is a special case of __softmax function__.
    * In multi-class classification, sigmoid is preferable over softmax when we assume __a sample can belong to multiple classes__, since the sigmoid function treats the probability of each class separately.
    * Could lead to __gradient vanishing__ problem.

In [9]:
# Record the sigmoid activation on the tape, then differentiate it w.r.t. X.
with tf.GradientTape() as g:
    y = activations.sigmoid(X)
gradient = g.gradient(target=y, sources=X)
visualize(
    name='Sigmoid',
    X=X,
    y=y,
    gradient=gradient,
    ymin=-1.25,
    ymax=1.25,
    color='royalblue',
)

## Hyperbolic Tangent
* Equation
$$\tanh(x)=\frac{\exp(x)-\exp(-x)}{\exp(x)+\exp(-x)}$$

* Derivative
$$\frac{\partial(\tanh(x))}{\partial x}=\frac{1}{\cosh^2(x)}$$

* Description:
    * Commonly used in the gates of __long short-term memory (LSTM)__ and __gated recurrent unit (GRU)__.

In [10]:
# Record the hyperbolic tangent on the tape, then differentiate it w.r.t. X.
with tf.GradientTape() as g:
    y = activations.tanh(X)
gradient = g.gradient(target=y, sources=X)
visualize(
    name='Hyperbolic Tangent',
    X=X,
    y=y,
    gradient=gradient,
    ymin=-1.25,
    ymax=1.25,
    color='deepskyblue',
)

## Rectified Linear Unit (ReLU)
* Equation
$$\text{ReLU}(x)=\max(0, x)$$

* Derivative
$$\begin{align}\frac{\partial(\text{ReLU}(x))}{\partial x}=\begin{cases}1\;&\text{if } x > 0\\0\;&\text{otherwise}\end{cases}\end{align}$$

* Properties:
    * The most commonly used activation function in the hidden layers of multilayer perceptron networks.
    * The output of the activation function is a sparse tensor (it has many zeros). Therefore, it requires less computational time and memory to reach convergence.
    * Computationally efficient to compute the derivative since it does not require exponential computation (about 1.5x faster than sigmoid and 2x faster than hyperbolic tangent).
    * The sparsity could potentially __prevent overfitting__.
    * However, the sparsity might also lead to the __dying ReLU__ problem (some weights will never be updated).
    * Might cause __gradient exploding__ problem (will discuss in recurrent neural network) when the gradient of the linear combination before applying ReLU is large.
    * Alleviate gradient vanishing problem.

In [11]:
# Record ReLU (zero slope for negative inputs) on the tape, then differentiate it w.r.t. X.
with tf.GradientTape() as g:
    y = activations.relu(X, alpha=0.00)
gradient = g.gradient(target=y, sources=X)
visualize(
    name='ReLU',
    X=X,
    y=y,
    gradient=gradient,
    ymin=-10,
    color='lightcoral',
)

## Leaky ReLU
* Equation
$$
\begin{align}f(x)=\begin{cases}
x\;\;\;\;\;\;\;\;\;&\text{if }x > 0\\
ax &\text{otherwise}\end{cases}\end{align}
$$

* Derivative
$$\begin{align}\frac{\partial f(x)}{\partial x}=\begin{cases}1\;&\text{if } x > 0\\a\;&\text{otherwise}\end{cases}\end{align}$$

* Parameters
    * $a$ is a hyperparameter controlling the slope for negative values (default in Tensorflow: $0.2$)
* Properties:
    * Computationally more efficient than ELU.
    * Might cause gradient exploding problem when the gradient of the linear combination before applying Leaky ReLU is large.
    * Alleviate gradient vanishing problem.
    * Alleviate the dying ReLU problem.

In [12]:
# Leaky ReLU is ReLU with a non-zero slope (0.2) for negative inputs.
with tf.GradientTape() as g:
    y = activations.relu(X, alpha=0.2)
gradient = g.gradient(target=y, sources=X)
visualize(
    name='Leaky ReLU',
    X=X,
    y=y,
    gradient=gradient,
    ymin=-10,
    color='mediumvioletred',
)

## Exponential Linear Unit (ELU)
* Equation
\begin{align}f(x)=\begin{cases}x\;\;\;\;\;\;\;\;\;&\text{if }x > 0\\a(\exp(x)-1) &\text{otherwise}\end{cases}\end{align}where $a$ is a predetermined hyper parameter (default in Tensorflow: $1$).
* Derivative
\begin{align}\frac{\partial f(x)}{\partial x}=\begin{cases}1\;&\text{if }x > 0\\a\exp(x)&\text{otherwise}\end{cases}\end{align}
* Parameters
    * $a$ is a coefficient of exponential transformation on negative values (default in Tensorflow: $1$)
* Properties:
    * $f(x)\to-a$ when $x \to -\infty$.
    * Converge faster than Leaky ReLU.
    * Might cause gradient exploding problem when the gradient of the linear combination before applying ELU is large.
    * Alleviate gradient vanishing problem.
    * Alleviate the dying ReLU problem.

In [13]:
# Record ELU on the tape, then differentiate it w.r.t. X.
with tf.GradientTape() as g:
    y = activations.elu(X)
gradient = g.gradient(target=y, sources=X)
visualize(
    name='ELU',
    X=X,
    y=y,
    gradient=gradient,
    ymin=-10,
    color='violet',
)

## Swish
* Equation
$$f(x)=\frac{x}{1+\exp(-x)}$$
* Derivative
$$\frac{\partial f(x)}{\partial x}=f(x)+\sigma(x)(1-f(x))$$
* Properties:
    * Generally outperform ReLU with similar computational cost in many image related applications.
    * Might cause gradient exploding problem when the gradient of the linear combination before applying Swish is large.
    * Alleviate gradient vanishing problem.
    * Alleviate the dying ReLU problem.

In [None]:
# Swish is built by hand as x * sigmoid(x); the tape differentiates the product w.r.t. X.
with tf.GradientTape() as g:
    y = X * activations.sigmoid(X)
gradient = g.gradient(target=y, sources=X)
visualize(
    name='Swish',
    X=X,
    y=y,
    gradient=gradient,
    ymin=-10,
    ymax=10,
    color='limegreen',
)

### Softmax
* Equation
\begin{align}\mathbf{\hat{y}}=\text{softmax}(\mathbf{x})=\frac{\exp(\mathbf{x})}{\sum\limits_{i=1}^{n} \exp(x_i)}=\begin{bmatrix}
    \frac{\exp(x_1)}{\sum\limits_{i=1}^{n} \exp(x_i)} \\
    \frac{\exp(x_2)}{\sum\limits_{i=1}^{n} \exp(x_i)} \\
    \vdots\\ 
    \frac{\exp(x_n)}{\sum\limits_{i=1}^{n} \exp(x_i)} \\
\end{bmatrix}\end{align}
where $\mathbf{x}=[x_i] \in \mathbb{R}^n$

* [Derivative](https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/)
$$
\frac{\partial\mathbf{\hat{y}}}{\partial\mathbf{x}}=\begin{bmatrix}
    \frac{\partial\hat{y}_1}{\partial x_1} & \frac{\partial\hat{y}_1}{\partial x_2} & \cdots & \frac{\partial\hat{y}_1}{\partial x_n} \\
    \frac{\partial\hat{y}_2}{\partial x_1} & \frac{\partial\hat{y}_2}{\partial x_2} & \cdots & \frac{\partial\hat{y}_2}{\partial x_n} \\
    \vdots & \vdots & \ddots & \vdots \\ 
    \frac{\partial\hat{y}_n}{\partial x_1} & \frac{\partial\hat{y}_n}{\partial x_2} & \cdots & \frac{\partial\hat{y}_n}{\partial x_n}
\end{bmatrix} = \begin{bmatrix}
    \hat{y}_1(1-\hat{y}_1) & \hat{y}_1(0-\hat{y}_2) & \cdots & \hat{y}_1(0-\hat{y}_n) \\
    \hat{y}_2(0-\hat{y}_1) & \hat{y}_2(1-\hat{y}_2) & \cdots & \hat{y}_2(0-\hat{y}_n) \\
    \vdots & \vdots & \ddots & \vdots \\ 
    \hat{y}_n(0-\hat{y}_1) & \hat{y}_n(0-\hat{y}_2) & \cdots & \hat{y}_n(1-\hat{y}_n) 
\end{bmatrix} = \mathbf{1}\hat{y}^T\odot(\mathbf{I}-\hat{y}\mathbf{1}^T)
$$
where $\mathbf{1} = \{1\}^n$ is the all-ones vector and $\odot$ denotes the element-wise (Hadamard) product.
* Properties:
    * Note that the input for softmax should be a __vector__ instead of a scalar.
    * Similar to the sigmoid function, the range of the softmax function is $0$ to $1$ (bounded range); therefore, it is also commonly used when we want to model __probability__. 
    * In multi-class classification, softmax is preferable over sigmoid when we assume that __a sample can only belong to one class__.
* Range: $0$ to $1$

In [21]:
# 50 x 50 grid over [-1, 1] x [-1, 1]; every grid point is one two-element softmax input.
X = tf.Variable(tf.linspace(-1., 1., 50))
X = tf.meshgrid(X, X)

# Flatten each coordinate plane into a (2500, 1) column.
X1, X2 = (tf.reshape(plane, (2500, 1)) for plane in X)

# X1 and X2 are reshaped tensors rather than Variables, so they are watched
# explicitly; the tape is persistent because gradient() is called four times.
with tf.GradientTape(persistent=True) as g:
    g.watch([X1, X2])
    y = activations.softmax(tf.concat([X1, X2], axis=1))
    y1, y2 = tf.split(y, [1, 1], axis=1)

# One entry per softmax output: its partial derivatives w.r.t. X1 and X2, side by side.
gradient = [
    tf.concat([g.gradient(out, X1), g.gradient(out, X2)], axis=1)
    for out in (y1, y2)
]
visualize2D('Softmax', y, gradient)