### Example of how to use `torch.optim.SGD` 
 + The Rosenbrock function $f(x, y) = (1 - x)^2 + 100\,(y - x^2)^2$ is minimized, starting from $(-1.9, 2)$
 + `torch.optim.Adam` could be used in a similar way

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

import scipy.optimize as so

import torch
from torch import optim

In [None]:
def rosenbrock(x):
    """Evaluate the Rosenbrock function at ``x``.

    Only ``x[0]`` and ``x[1]`` are read, so ``x`` may be any indexable
    container whose first two entries support arithmetic (numbers, numpy
    arrays, torch tensors, ...).  The result has whatever type that
    arithmetic produces.
    """
    first, second = x[0], x[1]
    bowl_term = (1 - first)**2
    valley_term = 100 * (second - first**2)**2
    return bowl_term + valley_term

def rosenbrock_contour(iterates=None):
    """Draw contour lines of the Rosenbrock function, optionally with paths.

    A new 14x8 figure is opened and the function is contoured on a
    250x250 grid over [-2, 2] x [-1, 3] at 20 logarithmically spaced levels.

    Parameters
    -----------
    iterates : :obj:`dict` or sequence of points, optional
        If a dict, every value is drawn as its own dashed path labelled
        ``'lr: <key>'`` and a legend is added.  Any other non-``None`` value
        is drawn as a single blue dashed path.  Each path is a sequence of
        points that is transposed into per-coordinate sequences for plotting.
    """
    resolution = 250
    grid_x = np.linspace(-2, 2, resolution)
    grid_y = np.linspace(-1, 3, resolution)
    X, Y = np.meshgrid(grid_x, grid_y)
    levels = np.logspace(-0.5, 3.5, 20, base=10)

    plt.figure(figsize=(14, 8))
    plt.contour(X, Y, rosenbrock([X, Y]), levels, cmap='gray')

    several_runs = isinstance(iterates, dict)
    if several_runs:
        for key, path in iterates.items():
            # zip(*path) transposes [(x, y), ...] into (xs, ys)
            coords = zip(*path)
            plt.plot(*coords, ls='--', marker='o', label='lr: {}'.format(key))
    elif iterates is not None:
        coords = zip(*iterates)
        plt.plot(*coords, 'bo--')

    plt.xlabel('x')
    plt.ylabel('y')
    if several_runs:
        plt.legend()
    
# Common starting point (x, y) for the optimizers in this notebook.
x0 = np.array((-1.9, 2.0))

In [None]:
def gd(lr=1e-3, mu=0.0, nesterov=False, use_torch_opt=False, N=500, model=rosenbrock,
       x_init=None):
    """Gradient descent (optionally with classical or Nesterov momentum).

    Parameters
    -----------
    lr : :obj:`float`
        Learning rate.

    mu : :obj:`float` [0, 1)
        Momentum, see [1] (1-2).

    nesterov : :obj:`bool`
        If `True` use Nesterov accelerated gradient,
        see [1] (3-4).

    use_torch_opt : :obj:`bool`
        If `True` use torch.optim.SGD instead of the manual update. Note
        that there are slight differences in the interpretation of the
        `lr` and `mu` parameters (see the documentation of optim.SGD),
        hence the results may differ from the manual version.

    N : :obj:`int`
        Number of iterations.

    model : :obj:`callable`
        The model. It is called with the current point (a torch tensor)
        and must return a scalar tensor on which ``backward()`` can be called.

    x_init : array-like of :obj:`float`, optional
        Starting point. If `None` (default) the module-level `x0` is used.
        Must hold floating point values, since the point is turned into a
        tensor that requires gradients.

    References
    -----------
    [1] http://proceedings.mlr.press/v28/sutskever13.pdf

    Returns
    --------
    :obj:`list(numpy.array)`
        The iterates: the starting point followed by the point after each
        of the `N` steps (``N + 1`` entries).
    """
    start = x0 if x_init is None else x_init
    x = torch.tensor(start, requires_grad=True)

    def snapshot():
        # detach + clone: an independent copy, so the in-place updates of x
        # below cannot change iterates that were already recorded
        return x.detach().clone().numpy()

    iterates = [snapshot()]

    if use_torch_opt:
        opt = optim.SGD([x], lr=lr, momentum=mu, nesterov=nesterov)
        for _ in range(N):
            model(x).backward()
            opt.step()
            opt.zero_grad()
            iterates.append(snapshot())
        return iterates

    # manual update, following [1]: v <- mu * v - lr * grad, x <- x + v
    v = torch.zeros_like(x)  # velocity; same shape and dtype as x
    for _ in range(N):
        # Nesterov evaluates the gradient at the look-ahead point x + mu * v
        point = (x + mu * v) if nesterov else x
        model(point).backward()

        with torch.no_grad():  # the update itself must not be tracked by autograd
            v = mu * v - lr * x.grad
            x += v
            x.grad.zero_()  # gradients accumulate across backward() calls

        iterates.append(snapshot())

    return iterates

### Powell method
 + a derivative-free baseline (`scipy.optimize.fmin_powell`), just for fun

In [None]:
# With retall=True the second element of fmin_powell's result holds the
# iterates visited by the method; those are what gets plotted.
powell_result = so.fmin_powell(rosenbrock, x0=x0, retall=True)
rosenbrock_contour(powell_result[1])

### Gradient descent

In [None]:
# learning rates to test with
lrs = [1e-3/2, 3*1e-3/2, 2*1e-3]

# One figure per variant: manual update first, then torch.optim.SGD.
for with_torch_opt in (False, True):
    paths_by_lr = {round(lr, 4): gd(lr, use_torch_opt=with_torch_opt) for lr in lrs}
    rosenbrock_contour(paths_by_lr)

### Gradient descent with momentum

In [None]:
# Same learning rate and momentum in both figures:
# manual update first, then torch.optim.SGD.
for with_torch_opt in (False, True):
    rosenbrock_contour(gd(lr=1e-3/2, mu=0.9, use_torch_opt=with_torch_opt))

### Nesterov accelerated gradient

In [None]:
# Nesterov variant with the same learning rate and momentum in both figures:
# manual update first, then torch.optim.SGD.
for with_torch_opt in (False, True):
    rosenbrock_contour(gd(lr=1e-3/2, mu=0.9, nesterov=True, use_torch_opt=with_torch_opt))