# Faster Optimizers

In [None]:
# Set up some basic functions
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Start from a fresh default graph with fixed random seeds.

    Seeds NumPy's global random generator, resets TensorFlow's default
    graph, and then sets TensorFlow's random seed, all with the same value.

    Args:
        seed: Integer seed passed to both NumPy and TensorFlow.
    """
    # NumPy seeding is independent of the TensorFlow calls below.
    np.random.seed(seed)
    # The graph is reset before the TensorFlow seed is set.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, set once for all figures.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures: save_fig() writes each one to
# <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png
PROJECT_ROOT_DIR = "./image/"
CHAPTER_ID = "deep"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``; the target
    directory is created first if it does not exist yet.

    Args:
        fig_id: Base name of the output file (without the ``.png`` suffix).
        tight_layout: If true, call ``plt.tight_layout()`` before saving.
    """
    fig_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # savefig does not create missing directories, so make sure the target
    # exists. isdir + makedirs (rather than exist_ok=True) keeps this usable
    # on Python 2, which this file still supports.
    if not os.path.isdir(fig_dir):
        os.makedirs(fig_dir)
    path = os.path.join(fig_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [None]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# Load MNIST through the TensorFlow tutorial helper, using ./tmp/data/ as the
# data directory. The training loop later in this file reads mnist.train and
# mnist.test from this object.
mnist = input_data.read_data_sets("./tmp/data/")  

## Momentum optimization

Momentum optimization cares a great deal about what previous gradients were: at each iteration, it adds the local gradient (multiplied by the learning rate $\eta$) to the momentum vector $m$, and it updates the weights by simply subtracting this momentum vector.

In other words, the gradient is used as an acceleration, not as a speed. To simulate some sort of friction mechanism and prevent the momentum from growing too large, the algorithm introduces a new hyperparameter $\beta$, simply
called the momentum, which must be set between 0 (high friction) and 1 (no friction). A typical momentum value is 0.9.

1. $ m \leftarrow \beta m + \eta \nabla_{\theta}J(\theta) $.
2. $ \theta \leftarrow \theta - m$.


In deep neural networks that don’t use Batch Normalization, the upper layers will often end up having inputs with very different scales, so using Momentum optimization helps a lot. It can also help roll past local optima.

In [None]:
# Plain Momentum optimization; momentum=0.9 is the typical beta value
# mentioned in the text above. learning_rate is reused by the optimizer
# examples that follow.
learning_rate = 0.01
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=0.9)

## Nesterov Accelerated Gradient

One small variant to Momentum optimization, proposed by Yurii Nesterov in 1983, is almost always faster than vanilla Momentum optimization. The idea of Nesterov
Momentum optimization, or Nesterov Accelerated Gradient (NAG), is to measure the
gradient of the cost function not at the local position but slightly ahead in the direction of the momentum.
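The only difference from vanilla Momentum optimization is that the gradient is measured at $\theta - \beta m$ rather than at $\theta$ (the weights are updated by subtracting $m$, so the look-ahead point in the direction of the momentum is $\theta - \beta m$):

1. $ m \leftarrow \beta m + \eta \nabla_{\theta}J(\theta - \beta m) $.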
2. $ \theta \leftarrow \theta - m$.

This small tweak works because in general the momentum vector will be pointing in the right direction (i.e., toward the optimum), so it will be slightly more accurate to use the gradient measured a bit farther in that direction rather than using the gradient at the original position.

In [None]:
# Same optimizer class and hyperparameters as the plain Momentum example;
# use_nesterov=True switches it to Nesterov Accelerated Gradient.
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=0.9, use_nesterov=True)

## AdaGrad  

AdaGrad often performs well for simple quadratic problems, but unfortunately it
often stops too early when training neural networks. The learning rate gets scaled down so much that the algorithm ends up stopping entirely before reaching the global optimum. So even though TensorFlow has an AdagradOptimizer, you should not use it to train deep neural networks (it may be efficient for simpler tasks such as Linear Regression, though).

In short, this algorithm decays the learning rate, but it does so faster for steep dimensions than for dimensions with gentler slopes. This is called an adaptive learning rate.

One additional benefit is that it requires much less tuning of the learning
rate hyperparameter $\eta$.

1. $ s \leftarrow s +  \nabla_{\theta}J(\theta) \bigotimes  \nabla_{\theta}J(\theta) $.
2. $ \theta \leftarrow \theta -  \eta \nabla_{\theta}J(\theta) \oslash \sqrt{s + \epsilon}$.

The first step accumulates the square of the gradients into the vector s (the $\bigotimes$ symbol represents the element-wise multiplication).

The second step is almost identical to Gradient Descent, but with one big difference: the gradient vector is scaled down by a factor of $\sqrt{s + \epsilon}$  (the $\oslash$ symbol represents the element-wise division, and ϵ is a smoothing term to avoid division by zero, typically set to $10^{-10}$).

In [None]:
# AdaGrad with the shared learning_rate. Shown for completeness: the text
# above advises against using it to train deep neural networks.
optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)

## RMSProp

Although AdaGrad slows down a bit too fast and ends up never converging to the
global optimum, the RMSProp algorithm fixes this by accumulating only the gradients from the most recent iterations (as opposed to all the gradients since the beginning of training). It does so by using exponential decay in the first step:

1. $ s \leftarrow \beta s +  (1 - \beta) \nabla_{\theta}J(\theta) \bigotimes  \nabla_{\theta}J(\theta) $.
2. $ \theta \leftarrow \theta -  \eta \nabla_{\theta}J(\theta) \oslash \sqrt{s + \epsilon}$.

The decay rate $\beta$ is typically set to 0.9. Yes, it is once again a new hyperparameter, but this default value often works well, so you may not need to tune it at all.

Except on very simple problems, this optimizer almost always performs much better than AdaGrad. It also generally performs better than Momentum optimization and Nesterov Accelerated Gradient. In fact, it was the preferred optimization algorithm of many researchers until Adam optimization came around.

In [None]:
# decay=0.9 corresponds to the decay rate (beta) in the equations above and
# epsilon to the smoothing term.
# NOTE(review): momentum=0.9 is an additional option of this optimizer that
# the RMSProp equations above do not show — confirm it is wanted here.
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                      momentum=0.9, decay=0.9, epsilon=1e-10)

## Adam Optimization (Best first choice)

**Adam**, which stands for adaptive moment estimation, combines the ideas of Momentum optimization and RMSProp: just like Momentum optimization it keeps track of an exponentially decaying average of past gradients, and just like RMSProp it keeps track of an exponentially decaying average of past squared gradients:

1. $ m \leftarrow \beta_1 m + (1-\beta_1) \nabla_{\theta}J(\theta)$
2. $ s \leftarrow  \beta_2 s +  (1 - \beta_2) \nabla_{\theta}J(\theta) \bigotimes  \nabla_{\theta}J(\theta) $.
3. $ m \leftarrow \frac{m}{1 - \beta_1^{k}}$
4. $s \leftarrow \frac{s}{1-\beta_2^{k}}$ 
5. $ \theta \leftarrow \theta -  \eta  m \oslash \sqrt{s + \epsilon}$.

- $k$ represents the iteration number (starting at 1).


If you just look at steps 1, 2, and 5, you will notice Adam’s close similarity to both Momentum optimization and RMSProp.

Steps 3 and 4 are somewhat of a technical detail: since m and s are initialized at 0, they will be biased toward 0 at the beginning of training, so these two steps will help boost m and s at the beginning of training.

The momentum decay hyperparameter $\beta_1$ is typically initialized to 0.9, while the scaling decay hyperparameter $\beta_2$ is often initialized to 0.999.

In fact, since Adam is an adaptive learning rate algorithm (like AdaGrad and
RMSProp), it requires less tuning of the learning rate hyperparameter $\eta$. You can often use the default value $\eta$ = 0.001, making Adam even easier to use than Gradient Descent.

In [None]:
# Adam with the optimizer's own defaults for everything except the learning
# rate. If the cells are run in order, learning_rate is still the 0.01 set in
# the Momentum example, not the 0.001 default mentioned in the text above.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

### Remark:

All the optimization techniques discussed so far only rely on the first-order partial derivatives (**Jacobians**). 

The optimization literature contains amazing algorithms based on the second-order partial derivatives (the **Hessians**).  

Unfortunately, these algorithms are very hard to apply to deep neural networks because there are $n^2$ Hessians per output (where n is the number of parameters), as opposed to just n Jacobians per output. 

Since DNNs typically have tens of thousands of parameters, the second-order optimization algorithms often don’t even fit in memory, and even when they do, computing the Hessians is just too slow.

## Learning Rate Scheduling
Finding a good learning rate can be tricky. If you set it way too high, training may actually diverge (as we discussed in Chapter 4). If you set it too low, training will eventually converge to the optimum, but it will take a very long time. If you set it slightly too high, it will make progress very quickly at first, but it will end up dancing around the optimum, never settling down (unless you use an adaptive learning rate optimization algorithm such as AdaGrad, RMSProp, or Adam, but even then it may take time to settle).

If you have a limited computing budget, you may have to interrupt training before it has converged properly, yielding a suboptimal solution.

In [None]:
# Build a small fully connected classifier (two ReLU hidden layers) together
# with its loss and accuracy ops, starting from a freshly reset graph.
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

# X is a batch of flattened images; y holds one integer class label per image.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# Note the trailing comma: `(None)` is just `None`, which would leave the
# shape completely unspecified instead of declaring a 1-D batch of labels.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    # No activation on the output layer: the loss below works on raw logits.
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

with tf.name_scope("loss"):
    # Per-example cross entropy from integer labels, averaged over the batch.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("eval"):
    # A prediction counts as correct when the label is the top-1 logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

In [None]:
# Exponential Decay scheduling: a Momentum optimizer whose learning rate is
# driven by the global training step.
with tf.name_scope("train"):
    # Schedule hyperparameters: initial rate, decay period (in steps), and
    # decay factor.
    initial_learning_rate, decay_steps, decay_rate = 0.1, 10000, 1/10
    # Step counter; excluded from training and passed to minimize() below.
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=decay_steps,
        decay_rate=decay_rate,
    )
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=0.9)
    training_op = optimizer.minimize(loss, global_step=global_step)

In [None]:
# Train for a few epochs, report test accuracy after each one, then save the
# final model to a checkpoint.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 5
batch_size = 50

with tf.Session() as sess:
    sess.run(init)
    # Both values are the same for every epoch, so compute them once.
    n_batches = mnist.train.num_examples // batch_size
    test_feed = {X: mnist.test.images, y: mnist.test.labels}
    for epoch in range(n_epochs):
        for step in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict=test_feed)
        print(epoch, "Test accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")