In [15]:
import re
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from IPython.core.display import HTML
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation
from collections import OrderedDict


def plot_3d(ax, func, xrange, yrange, cmap='viridis', elev=None, azim=None):
    """Draw the surface ``z = func(x, y)`` on ``ax`` with grid and axes hidden.

    Args:
        ax: Axes to draw on; it must provide ``plot_surface``, ``view_init``
            and ``set_zlim`` (i.e. a 3D axes).
        func: Callable applied to the two meshgrid arrays; its result is
            plotted as the surface height.
        xrange: ``(low, high)`` pair. Sampled from ``low`` up to (excluding)
            ``high`` in steps of 0.01, and also used as the x-axis limits.
        yrange: Same as ``xrange`` for the y-axis.
        cmap: Colormap forwarded to ``plot_surface``.
        elev: Elevation angle forwarded to ``view_init``.
        azim: Azimuth angle forwarded to ``view_init``.
    """
    step = 0.01
    grid_x, grid_y = np.meshgrid(
        np.arange(xrange[0], xrange[1], step),
        np.arange(yrange[0], yrange[1], step),
    )
    heights = func(grid_x, grid_y)

    ax.plot_surface(grid_x, grid_y, heights, cmap=cmap)
    ax.view_init(elev=elev, azim=azim)

    # Hide every decoration so that only the surface itself is visible.
    ax.grid(False)
    ax.dist = 7.5
    ax.axis('off')

    # Clamp all three axes to exactly the region that was sampled.
    ax.set_xlim(xrange)
    ax.set_ylim(yrange)
    ax.set_zlim(np.min(heights), np.max(heights))
    
    
def plot_2d(ax, func, xrange=(-4.5, 4.5), yrange=(-4.5, 4.5), logz=False, num_lines=60, **kwargs):
    """Draw contour lines of ``func(x, y)`` on ``ax`` and hide both axes.

    Args:
        ax: Axes to draw on (must provide ``contour``, ``set_xlim``,
            ``set_ylim``, ``get_xaxis`` and ``get_yaxis``).
        func: Callable applied to the two meshgrid arrays; its result is the
            quantity that gets contoured.
        xrange: ``(low, high)`` pair. Sampled from ``low`` up to (excluding)
            ``high`` in steps of 0.01, and also used as the x-axis limits.
            The default is an immutable tuple, matching ``plot_path`` and
            ``make_animation``.
        yrange: Same as ``xrange`` for the y-axis.
        logz: If true, contour ``np.log`` of the function values instead of
            the raw values.
        num_lines: Passed to ``contour`` as its fourth positional argument
            (the number of contour levels).
        **kwargs: Forwarded unchanged to ``ax.contour``.
    """
    X = np.arange(xrange[0], xrange[1], 0.01)
    Y = np.arange(yrange[0], yrange[1], 0.01)
    X, Y = np.meshgrid(X, Y)
    Z = func(X, Y)
    if logz:
        # Log scale spreads the contour lines out when values span several
        # orders of magnitude.
        Z = np.log(Z)

    ax.contour(X, Y, Z, num_lines, **kwargs)
    ax.set_xlim(xrange)
    ax.set_ylim(yrange)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    
    
def train(func, init_value, optimizer, num_steps=100):
    """Minimize ``func`` with ``optimizer`` and record the path that is taken.

    Args:
        func: Callable taking the two components of the variable and
            returning the loss to minimize.
        init_value: Starting value of the float32 variable; components 0 and
            1 are passed to ``func``.
        optimizer: Object whose ``minimize(loss)`` returns the update op.
        num_steps: Number of record-then-update iterations to run.

    Returns:
        Tuple ``(x_values, loss_values)`` of NumPy arrays holding the variable
        value and the loss as they were *before* each of the ``num_steps``
        updates, so the first entry is the starting point.
    """
    tf.reset_default_graph()
    params = tf.Variable(init_value, dtype='float32')
    objective = func(params[0], params[1])
    update_op = optimizer.minimize(objective)
    # Build the initializer only after minimize() so it is created once all
    # variables of the graph (including any added by the optimizer) exist.
    init_op = tf.global_variables_initializer()

    history = []
    with tf.Session() as session:
        session.run(init_op)
        for _ in range(num_steps):
            # Snapshot the current state first, then take one step.
            history.append(session.run([params, objective]))
            session.run(update_op)

    points = np.array([point for point, _ in history])
    losses = np.array([loss for _, loss in history])
    return points, losses


def plot_path(
    func, optimizers, init_value, min_point=None, num_steps=200, xrange=(-4.5, 4.5), yrange=(-4.5, 4.5),
    azim=None, elev=None, cmap='viridis', num_lines=60, **kwargs
):
    """Plot the full optimization paths on a 3D surface and a 2D contour map.

    Args:
        func: Loss function of two arguments; used for training and plotting.
        optimizers: Mapping from a display name to an optimizer; each one is
            run through ``train`` from ``init_value``.
        init_value: Starting point handed to ``train``.
        min_point: Optional ``(x, y)`` location marked with a star on both
            plots. Skipped when ``None``.
        num_steps: Number of optimization steps per optimizer.
        xrange: ``(low, high)`` x-extent of both plots.
        yrange: ``(low, high)`` y-extent of both plots.
        azim: Azimuth of the 3D view.
        elev: Elevation of the 3D view.
        cmap: Colormap used for the surface and the contours.
        num_lines: Number of contour levels for the 2D plot.
        **kwargs: Forwarded to ``plot_2d`` (for example ``logz``).
    """
    x_values, loss_values = OrderedDict(), OrderedDict()
    for name, optimizer in optimizers.items():
        x_values[name], loss_values[name] = train(func, init_value, optimizer, num_steps)

    fig = plt.figure(figsize=(10, 3))

    # Left panel: loss surface with one 3D trajectory per optimizer.
    ax1 = fig.add_subplot(121, projection='3d')
    plot_3d(ax1, func, xrange, yrange, elev=elev, azim=azim, cmap=cmap)
    for i, name in enumerate(x_values):
        path = x_values[name]
        ax1.plot(path[:, 0], path[:, 1], loss_values[name], zorder=10, linewidth=1.5, color='C%d' % i, label=name)
    # Explicit None check: a truthiness test would raise for a NumPy array.
    if min_point is not None:
        ax1.plot(
            [min_point[0]], [min_point[1]], [func(min_point[0], min_point[1])],
            zorder=10, marker=(5, 1), markersize=20, alpha=0.4, color='C3',
        )

    # Right panel: contour map with the same trajectories, colour-matched.
    ax2 = fig.add_subplot(122)
    plot_2d(ax2, func, xrange, yrange, num_lines=num_lines, cmap=cmap, **kwargs)
    for i, (name, path) in enumerate(x_values.items()):
        ax2.plot(path[:, 0], path[:, 1], '-o', markersize=1.5, linewidth=1.5, color='C%d' % i, label=name)
    if min_point is not None:
        ax2.plot([min_point[0]], [min_point[1]], marker=(5, 1), markersize=20, alpha=0.4, color='C3')
    ax2.legend()
    

def make_animation(
    func, optimizers, init_value, min_point=None, num_steps=200, xrange=(-4.5, 4.5), yrange=(-4.5, 4.5),
    azim=None, elev=None, cmap='viridis', num_lines=60, dirname=None, dpi=200, **kwargs
):
    """Animate the optimization paths on a 3D surface and a 2D contour map.

    Args:
        func: Loss function of two arguments; used for training and plotting.
        optimizers: Mapping from a display name to an optimizer; each one is
            run through ``train`` from ``init_value``.
        init_value: Starting point handed to ``train``.
        min_point: Optional ``(x, y)`` location marked with a star on both
            plots. Skipped when ``None``.
        num_steps: Number of optimization steps, and of animation frames.
        xrange: ``(low, high)`` x-extent of both plots.
        yrange: ``(low, high)`` y-extent of both plots.
        azim: Azimuth of the 3D view.
        elev: Elevation of the 3D view.
        cmap: Colormap used for the surface and the contours.
        num_lines: Number of contour levels for the 2D plot.
        dirname: Accepted for interface compatibility; currently unused.
        dpi: Accepted for interface compatibility; currently unused.
        **kwargs: Forwarded to ``plot_2d`` (for example ``logz``).

    Returns:
        An IPython ``HTML`` object wrapping the animation's HTML5 video, with
        its ``width``/``height`` attributes rewritten to 1000x300.
    """
    # Calculate optimization paths
    x_values, loss_values = OrderedDict(), OrderedDict()
    for name, optimizer in optimizers.items():
        x_values[name], loss_values[name] = train(func, init_value, optimizer, num_steps)

    # Plot 3D background
    fig = plt.figure(figsize=(14, 4))
    ax1 = fig.add_subplot(121, projection='3d')
    plot_3d(ax1, func, xrange, yrange, elev=elev, azim=azim, cmap=cmap)
    # Explicit None check: a truthiness test would raise for a NumPy array.
    if min_point is not None:
        ax1.plot(
            [min_point[0]], [min_point[1]], [func(min_point[0], min_point[1])],
            marker=(5, 1), markersize=20, alpha=0.4, color='C3',
        )

    # Obtain placeholders for lines and points
    dt1_lines, dt1_points = {}, {}
    for i, name in enumerate(x_values):
        dt1_lines[name] = ax1.plot([], [], [], zorder=10, linewidth=1.5, color='C%d' % i, label=name)[0]
        dt1_points[name] = ax1.plot([], [], [], zorder=10, marker='o', markersize=5, color='C%d' % i)[0]

    # Plot 2D background
    ax2 = fig.add_subplot(122)
    plot_2d(ax2, func, xrange, yrange, num_lines=num_lines, cmap=cmap, **kwargs)
    if min_point is not None:
        ax2.plot([min_point[0]], [min_point[1]], marker=(5, 1), markersize=20, alpha=0.4, color='C3')
    # Presumably closes the figure so the notebook does not also render it
    # as a static image; only the video returned below is shown.
    plt.close()

    # Obtain placeholders for lines and points
    dt2_lines, dt2_points = {}, {}
    for i, name in enumerate(x_values):
        dt2_lines[name] = ax2.plot([], [], linewidth=1.5, color='C%d' % i, label=name)[0]
        dt2_points[name] = ax2.plot([], [], marker='o', markersize=5, color='C%d' % i)[0]
    ax2.legend()

    def animate(index):
        # Frame `index` shows the path up to and including recorded step
        # `index`. Slicing with `index + 1` keeps frame 0 on the starting
        # point (instead of wrapping around to the last one via -1) and lets
        # the final frame reach the last recorded step.
        shown = index + 1
        for name in x_values:
            xs = x_values[name][:shown, 0]
            ys = x_values[name][:shown, 1]
            zs = loss_values[name][:shown]

            dt1_lines[name].set_data(xs, ys)
            dt1_lines[name].set_3d_properties(zs)
            # One-element slices keep the marker data sequence-typed.
            dt1_points[name].set_data(xs[-1:], ys[-1:])
            dt1_points[name].set_3d_properties(zs[-1:])

            dt2_lines[name].set_data(xs, ys)
            dt2_points[name].set_data(xs[-1:], ys[-1:])
        # With blit=True every artist that changes must be returned,
        # including the ones on the 2D axes.
        return (
            list(dt1_lines.values()) + list(dt1_points.values())
            + list(dt2_lines.values()) + list(dt2_points.values())
        )

    animation = FuncAnimation(fig, animate, frames=num_steps, interval=200, blit=True)
    # Raw string so that `\d` is a regex digit class rather than an invalid
    # string escape sequence.
    return HTML(re.sub(r'width="\d+" height="\d+"', 'width="1000" height="300"', animation.to_html5_video()))

### Gradient Descent

There's a popular story about gradient descent. A man is trying to climb down a mountain covered in fog. He can only see a few feet around him, so he comes up with an idea. Before making a step, he looks down and determines which direction is most steeply downhill, and then proceeds a small step in that direction. The process goes on and it may take a while, depending on how well he measures the steepness of the hill and how far he steps. However, if he consistently stays on the right track, he will eventually find his way down the mountain.

The direction of steepest descent is the gradient at the point. Mathematically, if $\theta$ denotes the weight vector we're trying to optimize and $\nabla_{\theta} \mathcal{L}(\theta)$ is the gradient of the cost function $\mathcal{L}(\theta)$, the update equation for gradient descent is
\begin{align} \theta = \theta - \eta \nabla_{\theta} \mathcal{L}(\theta), \label{eq:gd} \end{align}
where $\eta$ is the learning rate. Since the same learning rate applies to all components of $\theta$, gradient descent doesn't work well if these components have different scales. In such situations, gradient descent can make little improvement on some weights while unnecessarily bouncing around when updating others (see [Figure 1](#fig:quad)).

Usually, the loss function in deep learning is not convex, so gradient descent often gets trapped in local minima. Other optimizers face the same problem, but gradient descent is often more prone to bad local minima. What's even worse, gradient descent may get stuck at saddle points, simply because the gradient at those points is zero (see [Figure 3](#fig:hyper)).

In [16]:
# Elongated quadratic bowl: four times steeper along y than along x.
quad = lambda x, y: x ** 2 + 4 * y ** 2
# OrderedDict, as in the other cells, so that the colour and legend order of
# the optimizers does not depend on plain-dict iteration order.
optimizers = OrderedDict([
    ('GD', tf.train.GradientDescentOptimizer(learning_rate=0.2)),
    ('Momentum', tf.train.MomentumOptimizer(learning_rate=0.2, momentum=0.3)),
])
make_animation(quad, optimizers, init_value=(-4, 4), min_point=(0, 0), azim=5, num_lines=30, num_steps=20)

<a id='fig:quad'>Figure 1</a>: A loss surface and its contours together with the time evolution of gradient descent and momentum optimization. The loss function is given by $f(x,y)=x^2+4y^2$. Gradient descent bounces around more dramatically than momentum optimization. Note that a large value of the momentum parameter $\beta$
often results in overshooting and coming back.

### Momentum Optimization

Equation \eqref{eq:gd} implies that gradient descent doesn't care about the previous progress. It always follows the local gradient, even if the previous gradients can improve the direction of the current step. Momentum optimization fixes this problem by keeping track of the momentum vector $m$ and using it to update the weights:

\begin{align}
    m_t &= \beta m_{t - 1} + \eta \nabla_{\theta} \mathcal{L}(\theta), \label{eq:momentum} \\\
    \theta &= \theta - m_t
\end{align}

In Equation \eqref{eq:momentum}, the momentum vector $m$ acts as a smoother, accumulating the previous gradients with some friction specified by $\beta$. When $\beta = 0$, it's just vanilla gradient descent. When $\beta$ is close to 1 (typically 0.9 or 0.99), the previous gradients in $m$ may cancel out each other and thus smooth out the trajectory. In [Figure 1](#fig:quad), for example, the momentum optimizer benefits from having the vertical movements counterbalanced by the momentum vector, allowing it to make more progress in the horizontal direction.

Momentum optimization also helps speed up convergence. Suppose the gradient remains constant, i.e. $\nabla_{\theta} \mathcal{L}(\theta) = v$. The magnitude of an update with gradient descent is always $\eta v$, while it follows from Equation \eqref{eq:momentum} that an update with momentum after a large number of iterations is close to $(1 - \beta)^{-1} \eta v$. Given $\beta$ close to 1, this is a huge speedup. It explains why momentum optimization escapes from local minima and plateaus a lot faster than gradient descent (see [Figure 3](#fig:hyper)).

### Nesterov Accelerated Gradient

Nesterov Accelerated Gradient (NAG) is a variant of momentum optimization. Its update equations are almost identical to those in vanilla momentum optimization:

\begin{align}
m_t &= \beta m_{t - 1} + \eta \nabla_{\theta} \mathcal{L}(\theta - \beta m_{t - 1}), \label{eq:nag} \\\
\theta &= \theta - m_t
\end{align}

The only difference is that NAG calculates the local gradient at $\theta - \beta m_{t - 1}$ instead of $\theta$. Since the extra term $-\beta m_{t-1}$ will be included in $\theta$ after the gradient calculation, we understand that NAG takes a peek at the next update step and measures the gradient at a point slightly closer to the optimum. The tweak helps reduce unnecessary oscillations and thus speed up convergence in practice.

In [39]:
def booth(x, y):
    """Booth function; the animation below marks its minimum at (1, 3)."""
    return (x + 2 * y - 7) ** 2 + (2 * x + y - 5) ** 2


# Every optimizer uses the same learning rate so the paths are comparable.
optimizers = OrderedDict()
optimizers['GD'] = tf.train.GradientDescentOptimizer(learning_rate=1e-2)
optimizers['Momentum'] = tf.train.MomentumOptimizer(learning_rate=1e-2, momentum=0.9)
optimizers['NAG'] = tf.train.MomentumOptimizer(learning_rate=1e-2, momentum=0.9, use_nesterov=True)
optimizers['Adagrad'] = tf.train.AdagradOptimizer(learning_rate=1e-2)
optimizers['RMSProp'] = tf.train.RMSPropOptimizer(learning_rate=1e-2)
optimizers['Adam'] = tf.train.AdamOptimizer(learning_rate=1e-2)

make_animation(
    booth, optimizers,
    num_steps=70, init_value=(1, -2), min_point=(1, 3),
    num_lines=40, xrange=(-3, 6), yrange=(-3, 6), elev=70,
)

<a id='fig:booth'>Figure 2</a>: The trajectory of gradient descent, vanilla momentum and NAG using the Booth function $f(x, y) = (x + 2y - 7)^2 + (2x + y - 5)^2$. Compared to vanilla momentum optimization, NAG oscillates less wildly and gets to the minimum faster. Interestingly, Adagrad, RMSProp, and Adam seem to get stuck at the very beginning and converge very slowly.

### AdaGrad

All of the aforementioned approaches set the learning rate global and equal for all of the weights. Consequently, they require a bit of learning rate tuning to work well. Adagrad is an example of a class of optimizers that use adaptive learning rates. AdaGrad makes the learning rate adaptive by accumulating the squares of the gradients into a vector $g_t$ and scaling down the gradient vector according to $g_t$:

\begin{align}
g_t &= g_{t - 1} +  \nabla_{\theta} \mathcal{L}(\theta) \otimes \nabla_{\theta} \mathcal{L}(\theta), \\\
\theta &= \theta - \eta \, \nabla_{\theta} \mathcal{L}(\theta) \oslash  \sqrt{g_t + \epsilon}.
\end{align}
Here, $\otimes$ and $\oslash$ denote elementwise multiplication and division, respectively, and $\epsilon$ is a tiny constant to avoid division by zero. Intuitively, the weights corresponding to steep dimensions have larger gradients and therefore should have their learning rates decayed faster than those corresponding to gentler dimensions. The main drawback of Adagrad is that the vector $g_t$ keeps increasing, eventually making the learning rates become so small that the optimizer stops learning too early.

In [43]:
def hyper(x, y):
    """Hyperbolic paraboloid ``y^2 - x^2`` with a saddle point at the origin."""
    return y ** 2 - x ** 2


# (name, optimizer) pairs in display order; all share learning rate 0.1.
optimizer_specs = (
    ('GD', tf.train.GradientDescentOptimizer(learning_rate=0.1)),
    ('Momentum', tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)),
    ('NAG', tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9, use_nesterov=True)),
    ('Adagrad', tf.train.AdagradOptimizer(learning_rate=0.1)),
    ('RMSProp', tf.train.RMSPropOptimizer(learning_rate=0.1)),
    ('Adam', tf.train.AdamOptimizer(learning_rate=0.1)),
)
optimizers = OrderedDict(optimizer_specs)
del optimizer_specs  # keep the notebook namespace unchanged

# Start almost exactly on the ridge (x = 0.0001) above the saddle point.
make_animation(
    hyper, optimizers,
    num_steps=50, init_value=(0.0001, 3),
    num_lines=40, xrange=(-4, 4), yrange=(-4, 4), azim=-30,
)

<a id='fig:hyper'>Figure 3</a>: The paths taken by various optimization algorithms next to a loss surface and its contours. The loss function $f(x, y) = y^2 - x^2$ has a saddle point at $(0, 0)$ and is unbounded below: it decreases without limit as $|x| \to \infty$ along $y = 0$. Gradient descent slowly moves towards the saddle point and gets a bit immobilized at some point. Momentum optimization and NAG are also attracted to the saddle point, but they gradually accumulate gradients to finally escape it. Adaptive optimization methods, however, stay far away from the saddle point and find the right direction quickly by reducing the learning rate along the $y$-axis while increasing the learning rate along the $x$-axis. With a bit of momentum, Adam breaks symmetry very early and becomes the fastest method. Adagrad also discovers the right direction quickly, but it turns sluggish after accumulating a lot of gradients.

### RMSProp

RMSProp alleviates the early stopping problem of AdaGrad by using a moving average of the squared gradients instead:

\begin{align}
g_t &= \beta g_{t - 1} +  (1 - \beta) \nabla_{\theta} \mathcal{L}(\theta) \otimes \nabla_{\theta} \mathcal{L}(\theta), \label{eq:rmsprop1} \\\
\theta &= \theta - \eta \, \nabla_{\theta} \mathcal{L}(\theta) \oslash  \sqrt{g_t + \epsilon}. \label{eq:rmsprop2}
\end{align}

The idea is to prevent $g_t$ from accumulating the gradients too quickly with an exponential decay rate $\beta$, which is usually set to 0.9. RMSProp was very well-received though it was never published officially.

### Adam Optimization

Adaptive moment estimation, often known as Adam, is another adaptive learning rate method. It combines RMSProp with momentum optimization, keeping track of both the first moment and the second moment of the gradients:

\begin{align}
m_t &= \beta_1 m_{t - 1} + (1 - \beta_1) \nabla_{\theta} \mathcal{L}(\theta), \label{eq:adam1} \\\
g_t &= \beta_2 g_{t - 1} +  (1 - \beta_2) \nabla_{\theta} \mathcal{L}(\theta) \otimes \nabla_{\theta} \mathcal{L}(\theta), \label{eq:adam2} \\\
\hat{m}_t &= m_t \, / \,(1 - \beta_1^t), \label{eq:adam3} \\\
\hat{g}_t &= g_t \, / \,(1 - \beta_2^t), \label{eq:adam4} \\\
\theta &= \theta - \eta \, \hat{m}_t \oslash \sqrt{\hat{g}_t + \epsilon} \label{eq:adam5}.
\end{align}

Equation \eqref{eq:adam1} is similar to Equation \eqref{eq:momentum}, except that Adam uses an exponentially decaying average instead of an exponentially decaying sum. Also, Equations \eqref{eq:adam2} and \eqref{eq:adam5} are essentially Equations \eqref{eq:rmsprop1} and \eqref{eq:rmsprop2} from RMSProp, respectively. The other two equations are there just to make sure that the first moment and the second moment are not biased toward 0, especially at the beginning of training.

Since its creation in 2015, Adam has become the default choice for optimization. It converges fast and tolerates lousy learning rates by combining the strengths of multiple approaches. It should be noted, however, that Adam and other adaptive optimization methods can sometimes lead to worse local minima than vanilla momentum or NAG (see [Wilson et al.](https://arxiv.org/pdf/1705.08292.pdf) and [Figure 2](#fig:booth)).

In [46]:
def beale(x, y):
    """Beale function: the sum of three squared residuals in ``x`` and ``y``."""
    residuals = (
        1.5 - x + x * y,
        2.25 - x + x * y ** 2,
        2.625 - x + x * y ** 3,
    )
    return residuals[0] ** 2 + residuals[1] ** 2 + residuals[2] ** 2


# Unlike the earlier cells, each optimizer gets its own learning rate here.
optimizers = OrderedDict()
optimizers['GD'] = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
optimizers['Momentum'] = tf.train.MomentumOptimizer(learning_rate=2.5e-3, momentum=0.7)
optimizers['NAG'] = tf.train.MomentumOptimizer(learning_rate=5.5e-4, momentum=0.9, use_nesterov=True)
optimizers['Adagrad'] = tf.train.AdagradOptimizer(learning_rate=1)
optimizers['RMSP'] = tf.train.RMSPropOptimizer(learning_rate=1.5e-1)
optimizers['Adam'] = tf.train.AdamOptimizer(learning_rate=1)

# logz, vmin, vmax and alpha are not make_animation parameters; they travel
# through **kwargs to the 2D contour plot.
make_animation(
    beale, optimizers,
    init_value=(1.5, 2), min_point=(3, 0.5),
    num_lines=300, logz=True, azim=-20, vmin=-7, vmax=7, alpha=0.5,
)

<a id='fig:beale'>Figure 4</a>: Trajectories of various optimization routines along with the surface and contours of the Beale function, which is minimized at $(3, 0.5)$. The learning rates are separately optimized for fastest convergence. The momentum-based methods, including Adam, often overshoot, but they come back rather quickly and converge quite fast. Adam and RMSProp lead the race again using their adaptive learning rates while Adagrad stalls behind. It's interesting that vanilla momentum gets stuck in local minima for a while and still finds its way out. NAG doesn't have the same problem because it looks ahead one step before measuring the gradients.

In [1]:
from IPython.core.display import HTML

# Read the stylesheet inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('../css/custom.css', 'r') as css_file:
    custom_css = css_file.read()

# Last expression of the cell: the notebook renders it as HTML.
HTML(custom_css)