In [1]:
import numpy as np
import tensorflow as tf

from bokeh.io import output_notebook, reset_output
from bokeh.plotting import figure, show
from bokeh.models import Range1d

# Reset Bokeh's output configuration before (re-)enabling notebook output, so that
# re-running this cell starts from a clean state.
reset_output()
output_notebook()

In [2]:
def visualize(name, X, y, ymin=None, ymax=None, fig=None, display=True, x_axis_label='y - ŷ', classification=False, **kwargs):
    """Draw one loss curve as a line on a Bokeh figure.

    Parameters
    ----------
    name : str
        Title of the figure. Only used when a new figure is created.
    X, y : tensor or array-like
        x / y coordinates of the curve. Objects exposing ``.numpy()`` are
        converted through it; anything else goes through ``np.asarray``.
    ymin, ymax : float, optional
        Explicit y-axis limits for a newly created figure. When omitted,
        ``ymin`` is 0 and ``ymax`` is ``max(y)`` if ``classification`` else
        ``max(X ** 2)``.
    fig : optional
        Existing figure to add the line to. When ``None`` a new figure is
        created and its axis ranges are set from the data.
    display : bool
        Call ``show(fig)`` after drawing.
    x_axis_label : str
        x-axis label of a newly created figure.
    classification : bool
        Selects the default ``ymax`` (see above).
    **kwargs
        Forwarded unchanged to ``fig.line`` (e.g. ``color``, ``legend_label``).

    Returns
    -------
    The figure that was drawn on, so further curves can be overlaid.
    """
    X = X.numpy() if hasattr(X, 'numpy') else np.asarray(X)
    y = y.numpy() if hasattr(y, 'numpy') else np.asarray(y)

    # Axis ranges are only configured for a fresh figure; an existing figure
    # keeps whatever ranges it was created with.
    if fig is None:
        if ymin is None:
            ymin = 0
        if ymax is None:
            ymax = np.max(y) if classification else np.max(np.power(X, 2))
        fig = figure(title=name, tools=[], x_axis_label=x_axis_label, y_axis_label='Loss')
        fig.x_range = Range1d(np.min(X), np.max(X))
        fig.y_range = Range1d(ymin, ymax)

    fig.line(X, y, line_width=5, **kwargs)

    if display:
        show(fig)

    return fig

In [3]:
# Shared grid of residuals (y - ŷ) used by the regression-loss plots below.
X = tf.linspace(start=-10., stop=10., num=100)

## Regression Objective Functions
Suppose $\mathbf{y} \in \mathbb{R}^m$ holds the actual values we want to predict, and $\mathbf{\hat{y}} \in \mathbb{R}^m$ is the prediction made by our model. In other words, $\mathbf{\hat{y}}=f(\mathbf{X}, \mathbf{w})$. We can use several objective functions to evaluate our model. In this section, we will discuss the most commonly used ones: `MeanAbsoluteError`, `MeanSquaredError` and `HuberError`.


### Mean Absolute Error (L1 Loss)
* Equation
$$
L(\mathbf{y}, \mathbf{\hat{y}})=\frac{1}{m}\sum\limits_{i=1}^m|y_i - \hat{y}_i|
$$

* Derivative
$$
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\mathbf{\hat{y}}}=[d_i]=\begin{cases}
\frac{1}{m}\;&\text{if } \hat{y}_i > y_i\\
-\frac{1}{m}&\text{otherwise}\end{cases}
$$

* Properties
    * Less sensitive to samples with large residual between prediction and actual value.

In [4]:
# L1 loss of a single sample as a function of its residual.
y = tf.math.abs(X)
_ = visualize('Mean Absolute Error', X, y, color='deepskyblue')

### Mean Squared Error (L2 Loss)
* Equation
$$
L(\mathbf{y}, \mathbf{\hat{y}})=\frac{1}{m}\sum\limits_{i=1}^m(y_i - \hat{y}_i)^2
$$
* Derivative
$$
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\mathbf{\hat{y}}}=-\frac{2}{m}(\mathbf{y}-\mathbf{\hat{y}})
$$

* Properties
    * More sensitive to samples with large residual between prediction and actual value.


In [5]:
# L2 loss of a single sample as a function of its residual.
y = X ** 2
_ = visualize('Mean Squared Error', X, y, color='lightcoral')

### Huber Error
* Equation
$$
\begin{align}
L(\mathbf{y}, \mathbf{\hat{y}})=\sum\limits_{i=1}^m
\begin{cases}\frac{1}{2}(y_i-\hat{y}_i)^2\;\;\;&\text{if}\;|y_i-\hat{y}_i|\leq\delta\\
\delta(|y_i-\hat{y}_i|-\frac{1}{2}\delta)&\text{otherwise}\end{cases}
\end{align}
$$

* Derivative
$$
\begin{align}
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial \mathbf{\hat{y}}}=[d_i]=
\begin{cases}\hat{y}_i-y_i\;\;\;&\text{if}\;|y_i-\hat{y}_i|\leq\delta\\
-\delta&\text{if}\;|y_i-\hat{y}_i|>\delta\text{ and }y_i-\hat{y}_i>0\\
\delta&\text{otherwise}\end{cases}
\end{align}
$$
* Properties
    * Hyperparameter $\delta$ can be used to control how much penalty should be given to samples with large residual.

In [6]:
# Overlay the Huber loss for several thresholds on one figure. The loss is
# quadratic while |residual| <= delta and linear beyond it.
deltas = (1, 5, 10)
colors = ('thistle', 'plum', 'mediumorchid')

abs_residual = tf.abs(X)
fig = None  # the first call creates the figure, later calls draw onto it
for i, (delta, color) in enumerate(zip(deltas, colors)):
    y = tf.where(abs_residual <= delta, 0.5 * tf.pow(X, 2), delta * (abs_residual - 0.5 * delta))
    fig = visualize(
        'Huber Error',
        X,
        y,
        fig=fig,
        display=(i == len(deltas) - 1),  # render once, after the last curve is added
        color=color,
        legend_label=f'delta:{delta:>3}',
    )

## Binary Classification
### Cross Entropy
Suppose $y_i=1$ denotes the sample $i$ of class $A$, $y_i=0$ denotes the sample of class $B$, $\hat{y}_i$ being the network prediction of the probability of sample $i$ being classified as class $A$. 

* Equation
$$
H(\mathbf{y}, \mathbf{\hat{y}})=-[\sum\limits_{i=1}^my_i\ln \hat{y}_i+(1-y_i)\ln(1-\hat{y}_i)]
$$

* Derivative
$$
\frac{\partial H(\mathbf{y}, \mathbf{\hat{y}})}{\partial \mathbf{\hat{y}}}=-\frac{\mathbf{y}}{\mathbf{\hat{y}}}+\frac{1-\mathbf{y}}{1-\mathbf{\hat{y}}}
$$

* Properties
    * The inferred probability $\mathbf{\hat{y}}$ is more accurate compared to hinge loss.
    * __Sigmoid__ or __softmax__ activation functions are preferred in the output layer.
    

In [7]:
# Predicted probability of class A, kept away from 0 and 1 where the log diverges.
y_pred = tf.linspace(0.01, 0.99, 100)

# Sample truly belongs to class A: only the -ln(ŷ) term is active.
loss = -tf.math.log(y_pred)
fig = visualize(
    'Binary Cross Entropy',
    y_pred,
    loss,
    display=False,
    x_axis_label='ŷ (Class A)',
    classification=True,
    color='limegreen',
    legend_label='Truth (Class A)',
)

# Sample truly belongs to class B: only the -ln(1 - ŷ) term is active.
loss = -tf.math.log(1 - y_pred)
fig = visualize(
    'Binary Cross Entropy',
    y_pred,
    loss,
    fig=fig,
    classification=True,
    color='palegreen',
    legend_label='Truth (Class B)',
)

### Hinge Loss
Suppose $y_i=1$ for a sample labeled as class $A$ and $y_i=-1$ for a sample labeled as class $B$. $\hat{y}_i>0$ means the network predicts class $A$ for the sample, and $\hat{y}_i<0$ means it predicts class $B$:

* Equation
$$
L(\mathbf{y}, \mathbf{\hat{y}})=\sum\limits_{i=1}^m\max(0, 1-y_i\hat{y}_i)
$$

* Derivative
$$
\begin{align}\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial \mathbf{\hat{y}}}=[d_i]=
\begin{cases}-y_i\;&\text{if }1-y_i\hat{y}_i>0\\0&\text{otherwise}\end{cases}\end{align}
$$

* Properties
    * Data points far away from the decision boundary (on the correct side) do not contribute to the loss function.
    * Needs additional methods (e.g. Platt scaling) to estimate the probability for each class.
    * __Linear__ or __hyperbolic tangent__ activation functions are preferred in the output layer.

In [8]:
y_pred = tf.linspace(-3., 3., 100)

# Hinge loss is max(0, 1 - y * ŷ) with y = +1 for class A and y = -1 for class B.
# Class A (y = +1): the loss vanishes once ŷ >= 1 and grows as ŷ becomes negative.
loss = tf.maximum(tf.zeros_like(y_pred), 1. - y_pred)
fig = visualize(
    'Hinge Loss',
    y_pred,
    loss,
    x_axis_label='ŷ',
    classification=True,
    display=False,
    color='blueviolet',
    legend_label='Truth (Class A)',
)

# Class B (y = -1): mirror image, the loss vanishes once ŷ <= -1.
loss = tf.maximum(tf.zeros_like(y_pred), 1. + y_pred)
fig = visualize(
    'Hinge Loss',
    y_pred,
    loss,
    fig=fig,
    classification=True,
    color='plum',
    legend_label='Truth (Class B)',
)

## Multiclass Classification
### Cross Entropy
Suppose the samples are from $K$ classes. $y_{ij}=1$ and $y_{ik}=0\;(j \ne k)$ for the sample $i$ being labeled as class $j$, $\hat{y}_{ij}$ being the network prediction of the probability the sample $i$ being classified as class $j$. Note that $\sum\limits_{j=1}^K\hat{y}_{ij}=1$ (which can be achieved using the __softmax__ function):

* Equation
$$
H(\mathbf{y}, \mathbf{\hat{y}})=-\sum\limits_{i=1}^m\sum\limits_{j=1}^Ky_{ij}\ln\hat{y}_{ij}
$$

* Derivative
$$
\frac{\partial H(\mathbf{y}, \mathbf{\hat{y}})}{\partial\mathbf{\hat{y}}}=[d_{ij}]=-\frac{y_{ij}}{\hat{y}_{ij}}
$$

### Hinge Loss (_Weston and Watkins_)
Suppose the samples are from $K$ classes. $y_{ij}=1$ and $y_{ik}=-1\;(k \ne j)$ for a sample $i$ labeled as class $j$; $\hat{y}_{ij}>0$ means the network predicts class $j$ for sample $i$:
* Equation
$$
L(\mathbf{y}, \mathbf{\hat{y}})=\sum\limits_{i=1}^m\sum\limits_{j=1}^K\max(0, 1-y_{ij}\hat{y}_{ij})
$$

* Derivative
$$
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial \mathbf{\hat{y}}}=[d_{ij}]=
\begin{cases}-y_{ij}\;&\text{if }1-y_{ij}\hat{y}_{ij}>0\\0&\text{otherwise}\end{cases}
$$

## Why are MSE and MAE not suitable for classification
### Sigmoid Output
Let us first decompose a classification network with sigmoid activation function:
$$
\mathbf{\hat{y}}=\sigma(\mathbf{\hat{x}}),\;\;\;\mathbf{\hat{x}}=\mathbf{X}\mathbf{w}+b
$$
Now, consider the gradient of square error with respect to $\mathbf{\hat{y}}$:
$$
\begin{align}
\nabla_{\mathbf{\hat{y}}}L(\mathbf{y}, \mathbf{\hat{y}})&=
\begin{bmatrix}
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_1} \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_2} \\
\vdots \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_m}
\end{bmatrix}\\&=\begin{bmatrix}
-\frac{2}{m}(y_1 - \hat{y}_1) \\
-\frac{2}{m}(y_2 - \hat{y}_2) \\
\vdots \\
-\frac{2}{m}(y_m - \hat{y}_m)
\end{bmatrix}\\&=-\frac{2}{m}(\mathbf{y}-\mathbf{\hat{y}})
\end{align}
$$
The Jacobian matrix of the sigmoid function with respect to $\mathbf{\hat{x}}$:
$$
\begin{align}
\frac{\partial\mathbf{\hat{y}}}{\partial\mathbf{\mathbf{\hat{x}}}}&=\begin{bmatrix}
    \frac{\partial\hat{y}_1}{\partial\hat{x}_1} & \frac{\partial\hat{y}_1}{\partial\hat{x}_2} & \cdots & \frac{\partial\hat{y}_1}{\partial\hat{x}_m} \\
    \frac{\partial\hat{y}_2}{\partial\hat{x}_1} & \frac{\partial\hat{y}_2}{\partial\hat{x}_2} & \cdots & \frac{\partial\hat{y}_2}{\partial\hat{x}_m} \\
    \vdots & \vdots & \ddots & \vdots \\ 
    \frac{\partial\hat{y}_m}{\partial\hat{x}_1} & \frac{\partial\hat{y}_m}{\partial\hat{x}_2} & \cdots & \frac{\partial\hat{y}_m}{\partial\hat{x}_m}
\end{bmatrix}\\&=
\begin{bmatrix}
\sigma(\hat{x}_1)(1-\sigma(\hat{x}_1)) & 0 & \cdots & 0 \\
0 & \sigma(\hat{x}_2)(1-\sigma(\hat{x}_2))  & \cdots & 0 \\
\vdots & \vdots & \ddots & \vdots \\ 
0 & 0 & \cdots & \sigma(\hat{x}_m)(1-\sigma(\hat{x}_m)) \\
\end{bmatrix}\\&=
\operatorname{diag}\big(\sigma(\mathbf{\hat{x}})\odot(\mathbf{1}-\sigma(\mathbf{\hat{x}}))\big)
\end{align}$$

and the Jacobian matrix of $\mathbf{\hat{x}}$ with respect to $\mathbf{w}$:
$$
\mathbf{\frac{\partial \hat{x}}{\partial w}}=\begin{bmatrix}
    \frac{\partial\hat{x}_1}{\partial w_1} & \frac{\partial\hat{x}_1}{\partial w_2} & \cdots & \frac{\partial\hat{x}_1}{\partial w_n} \\
    \frac{\partial\hat{x}_2}{\partial w_1} & \frac{\partial\hat{x}_2}{\partial w_2} & \cdots & \frac{\partial\hat{x}_2}{\partial w_n} \\
    \vdots & \vdots & \ddots & \vdots \\ 
    \frac{\partial\hat{x}_m}{\partial w_1} & \frac{\partial\hat{x}_m}{\partial w_2} & \cdots & \frac{\partial\hat{x}_m}{\partial w_n}
\end{bmatrix}=\mathbf{X}
$$

We can then compute the gradient of $L(\mathbf{y}, \mathbf{\hat{y}})$ with respect to ${\mathbf{\hat{x}}}$:
$$
\begin{align}
\nabla_{\mathbf{\hat{x}}}L(\mathbf{y}, \mathbf{\hat{y}})&=\begin{bmatrix}
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_1} \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_2} \\
\vdots \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_m} \\
\end{bmatrix}\\&=
\begin{bmatrix}
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_1}\frac{\partial\hat{y}_1}{\partial\hat{x}_1} + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_2}\frac{\partial\hat{y}_2}{\partial\hat{x}_1}+\cdots+\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_m}\frac{\partial\hat{y}_m}{\partial\hat{x}_1}\\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_1}\frac{\partial\hat{y}_1}{\partial\hat{x}_2} + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_2}\frac{\partial\hat{y}_2}{\partial\hat{x}_2}+\cdots+\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_m}\frac{\partial\hat{y}_m}{\partial\hat{x}_2}\\
\vdots \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_1}\frac{\partial\hat{y}_1}{\partial\hat{x}_m} + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_2}\frac{\partial\hat{y}_2}{\partial\hat{x}_m}+\cdots + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{y}_m}\frac{\partial\hat{y}_m}{\partial\hat{x}_m}\\
\end{bmatrix}\\&=
(\frac{\partial\mathbf{\hat{y}}}{\partial\mathbf{\mathbf{\hat{x}}}})^T\nabla_{\mathbf{\hat{y}}}L(\mathbf{y}, \mathbf{\hat{y}})
\end{align}
$$

And finally, we can derive the gradient of $L(\mathbf{y}, \mathbf{\hat{y}})$ with respect to ${\mathbf{w}}$:
$$
\begin{align}
\nabla_{\mathbf{w}}L(\mathbf{y}, \mathbf{\hat{y}})&=\begin{bmatrix}
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial w_1} \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial w_2} \\
\vdots \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial w_n} \\
\end{bmatrix}\\&=
\begin{bmatrix}
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_1}\frac{\partial\hat{x}_1}{\partial w_1} + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_2}\frac{\partial\hat{x}_2}{\partial w_1}+\cdots+\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_m}\frac{\partial\hat{x}_m}{\partial w_1}\\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_1}\frac{\partial\hat{x}_1}{\partial w_2} + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_2}\frac{\partial\hat{x}_2}{\partial w_2}+\cdots+\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_m}\frac{\partial\hat{x}_m}{\partial w_2}\\
\vdots \\
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_1}\frac{\partial\hat{x}_1}{\partial w_n} + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_2}\frac{\partial\hat{x}_2}{\partial w_n}+\cdots + \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial\hat{x}_m}\frac{\partial\hat{x}_m}{\partial w_n}\\
\end{bmatrix}\\&=
(\mathbf{\frac{\partial \hat{x}}{\partial w}})^T(\frac{\partial\mathbf{\hat{y}}}{\partial\mathbf{\mathbf{\hat{x}}}})^T\nabla_{\mathbf{\hat{y}}}L(\mathbf{y}, \mathbf{\hat{y}})\\&=
-\frac{2}{m}\mathbf{X}^T\operatorname{diag}\big(\sigma(\mathbf{\hat{x}})\odot(\mathbf{1}-\sigma(\mathbf{\hat{x}}))\big)(\mathbf{y}-\mathbf{\hat{y}})
\end{align}
$$


Therefore $\nabla_{\mathbf{w}}L(\mathbf{y}, \mathbf{\hat{y}})\to0$ when $\sigma(\mathbf{\hat{x}})\to1$ or $\sigma(\mathbf{\hat{x}})\to0$, because every term is scaled by $\sigma(\hat{x}_i)(1-\sigma(\hat{x}_i))$ (__when the model is certain about the prediction, the weights will not update, even if that prediction is wrong__). We can observe a similar result for a network using __softmax__ as the output layer and for the MAE objective function. On the other hand, one can easily see that cross entropy does not have this problem (substitute $\nabla_{\mathbf{\hat{y}}}L(\mathbf{y}, \mathbf{\hat{y}})$ with the derivative of cross entropy).

### Hyperbolic Tangent, Linear Output
Suppose the activation function of the output layer is __linear__ or __hyperbolic tangent__. In this case, $y_i=1$ for a sample labeled as class A, $y_i=-1$ for class B, and $\hat{y}_i>0$ means the network __predicts__ class A. The predictor makes a correct prediction if $y_i\hat{y}_i>0$. However, MSE and MAE increase when $y_i\hat{y}_i>1$, so they penalize predictions that are correct by a large margin.

In [9]:
# Compare several losses as a function of the margin y·ŷ for a positive sample (y = +1).
y_pred = tf.linspace(-3., 3., 500)
y_true = tf.ones(500)
margin = y_true * y_pred  # shared x-axis of every curve below

# The hinge curve is shifted up by this amount when plotted. For ŷ <= 1 the hinge
# loss max(0, 1 - ŷ) coincides with the MAE curve |1 - ŷ|.
# NOTE(review): presumably the shift keeps both lines visible — confirm.
HINGE_PLOT_OFFSET = 0.025

loss = tf.abs(y_true - y_pred)
fig = visualize(
    'y·ŷ and loss',
    margin,
    loss,
    display=False,
    x_axis_label='y·ŷ',
    color='deepskyblue',
    legend_label='Mean Absolute Error',
)

loss = tf.pow(y_true - y_pred, 2)
fig = visualize(
    'Mean Squared Error',
    margin,
    loss,
    fig=fig,
    display=False,
    color='lightcoral',
    legend_label='Mean Squared Error',
)

# Cross entropy on a sigmoid output, written in terms of the margin and expressed
# in bits: log2(1 + exp(-y·ŷ)). softplus(z) = ln(1 + exp(z)).
loss = tf.math.softplus(-margin) / tf.math.log(2.)
fig = visualize(
    'Sigmoid with Cross Entropy',
    margin,
    loss,
    fig=fig,
    display=False,
    color='limegreen',
    legend_label='Sigmoid with Cross Entropy',
    line_dash='dotted',
)

loss = tf.maximum(tf.zeros_like(margin), tf.ones_like(margin) - margin) + HINGE_PLOT_OFFSET
fig = visualize(
    'Hinge Loss',
    margin,
    loss,
    fig=fig,
    display=False,
    color='blueviolet',
    legend_label='Hinge Loss',
    line_dash='dashed',
)

# 0-1 loss: 1 when the sign of the prediction is wrong, 0 otherwise.
loss = tf.where(margin < 0, tf.ones_like(margin), tf.zeros_like(margin))
fig = visualize(
    'Ideal Loss',
    margin,
    loss,
    fig=fig,
    color='black',
    legend_label='Ideal Loss',
)