In [1]:
import numpy as np

import holoviews as hv; hv.extension('plotly','bokeh', logo=False)
import panel as pn;     pn.extension()
from   panel.interact import interact

In [2]:
def gradient_descent( f, df, x, mu, n_max, tol = 1.e-6):
    """Fixed-step gradient descent.

    Parameters
    ----------
    f     : callable returning the objective value at a point
    df    : callable returning the gradient of the objective at a point
    x     : starting point
    mu    : step size multiplying the gradient
    n_max : maximum number of steps to attempt
    tol   : a step whose norm is below this value counts as convergence

    Returns
    -------
    (solution, history) where `history` is the pair (visited points, objective
    values) including the starting point, and `solution` is the pair
    (final point, final value) on convergence, or None if `n_max` steps
    were taken without the step norm dropping below `tol`.
    """
    path_x  = [x]
    path_y  = [f(x)]                  # height at the starting point
    current = x

    for _ in range(n_max):
        step      = -mu*df(current)   # downhill: opposite to the gradient
        candidate = current + step
        height    = f(candidate)
        path_x.append(candidate)
        path_y.append(height)

        if np.linalg.norm(step) < tol:
            # the last move was negligible: report the point just reached
            return (candidate, height), (path_x, path_y)
        current = candidate

    # ran out of iterations without the step becoming small enough
    return None, (path_x, path_y)

In [3]:
%output backend="plotly"
def show_objective_surface(f,lims=(-3,3), step=0.25):
    """Build a 3-D surface element of the objective `f` over a square grid.

    Parameters
    ----------
    f    : callable taking a length-2 point and returning the objective value
    lims : (low, high) range used for both x_1 and x_2
    step : grid spacing

    Returns
    -------
    hv.TriSurface with dimensions "x_1", "x_2", "y".
    """
    lo, hi = lims
    # np.arange excludes its stop value; push the stop out by half a step so the
    # upper limit is sampled too and the surface spans the same range as the
    # contour view (which uses linspace and therefore includes both ends).
    x_vals = np.arange(lo, hi + 0.5*step, step)
    xx,yy  = np.meshgrid(x_vals,x_vals)
    points = np.column_stack([xx.ravel(), yy.ravel()])   # one (x_1, x_2) row per grid node
    z      = [f(p) for p in points]
    return hv.TriSurface((xx.ravel(),yy.ravel(), z), ["x_1", "x_2", "y"])

In [4]:
%output backend="bokeh"
def show_objective_surface_contours(f,lims=(-3,3),n=101,l=[0.1,0.4,1,2,5,10,20]):
    """Sample `f` on an n-by-n grid and derive filled contour bands from it.

    Parameters
    ----------
    f    : callable taking a length-2 point and returning the objective value
    lims : (low, high) range used for both x_1 and x_2
    n    : number of samples along each axis
    l    : contour levels handed to the holoviews `contours` operation

    Returns
    -------
    (image, contours): the hv.Image of the sampled objective and the result of
    the contours operation (not overlaid on the image), with padding set to 0.
    """
    from holoviews.operation import contours as hv_contours

    axis           = np.linspace(*lims,n)
    grid_x, grid_y = np.meshgrid(axis, axis)
    samples        = np.column_stack([grid_x.ravel(), grid_y.ravel()])
    heights        = np.array([f(p) for p in samples]).reshape(grid_x.shape)

    image = hv.Image((axis,axis,heights), ["x_1", "x_2"], "y")
    image = image.opts(title="Contour Lines in x_1,x_2 plane")
    bands = hv_contours(image, levels=l, filled=True, overlaid=False)

    return image, bands.opts(padding=0)

In [5]:
def show_gradient_descent(hf, f, df, x, mu, n_max, ylim, tol=1e-4):
    """Run `gradient_descent` and lay its path over an existing plot `hf`.

    The visited (x, y) pairs are drawn as a curve with red markers on top of
    `hf`; when the run converged, the solution is added as a black marker.
    A second panel plots the objective value against the iteration index.

    NOTE(review): the history is passed directly to hv.Curve/hv.Scatter as
    (x values, y values), which presumably means this helper targets a scalar
    `x` - confirm against callers.

    Returns the layout `(hf * path) + (y versus iteration)`.
    """
    sol, vals = gradient_descent(f,df,x,mu,n_max,tol)

    path    = hv.Curve(vals).opts(xlim=(-5,5), ylim=ylim)
    markers = hv.Scatter(vals).opts(size=4, color="red", tools=['hover'])
    h       = path*markers
    if sol is not None:
        # converged: highlight the final point
        h = h*hv.Scatter(sol).opts(size=5, color="black")

    hy = hv.Curve(vals[1], "iteration index", "y")
    hy = hy.opts(ylim=ylim,height=350, title="y versus Iteration Index")

    return (hf*h).opts(width=450,height=350)+hy
def show_gradient_descent_on_contour(f, df, x, mu, n_max, ylim, tol=1e-4):
    """Run fixed-step gradient descent and show it in three side-by-side panels.

    Panels (left to right):
      1. the contour bands of `f` on [-3,3]x[-3,3] with the descent path in red;
         the title shows the rounded solution, or None if the run did not converge
      2. the objective value versus iteration index
      3. the ratio norm(x_i)/norm(x_(i-1)) of successive iterates

    Parameters
    ----------
    f, df : objective and its gradient, each taking a length-2 point
    x     : starting point
    mu    : step size
    n_max : maximum number of descent steps
    ylim  : accepted for signature compatibility; not used by this function
    tol   : convergence tolerance forwarded to `gradient_descent`

    Returns
    -------
    pn.Row holding the three plots.
    """
    sol, steps = gradient_descent(f,df,x,mu,n_max,tol)
    x_i = [p[0] for p in steps[0]]
    y_i = [p[1] for p in steps[0]]

    # Norm of every iterate, then the ratio of each to its predecessor.
    # An iterate can land exactly on the origin (e.g. b = 1, mu = 1), making a
    # later ratio 0/0. The resulting nan/inf is simply not drawn, so silence the
    # floating point warning instead of letting it spill into the notebook.
    d_i = np.array([np.linalg.norm(p) for p in steps[0]])
    with np.errstate(divide="ignore", invalid="ignore"):
        progress = d_i[1:]/d_i[:-1]

    if sol is not None: sol = np.round(sol[0], 5)   # keep only the point, rounded for the title

    _, hf = show_objective_surface_contours(f,lims=(-3,3), l=[0.1,0.4,1,2,5,10,20])
    return pn.Row(
        hf.opts(width=350,height=350,xlim=(-3,3),ylim=(-3,3),title=f"Contour Lines, Solution {sol}" )\
        *hv.Curve(  (x_i, y_i)).opts(color="red")\
        *hv.Scatter((x_i, y_i)).opts(tools=['hover'],color="red", size=4),
        hv.Curve( steps[1], "index", "y" ).opts(height=350, title="y versus Iteration Index", tools=["hover"], show_grid=True),
        hv.Curve( (range(1,len(d_i)), progress), "index", "norm( x_i ) / norm(x_(i-1)"  ).opts(height=350, title="Relative Progress", tools=["hover"], show_grid=True)
    )

# <div style="float:center;width:100%;text-align: center;"><strong style="height:60px;color:darkred;font-size:40px;">Gradient Descent Example</strong></div>

# 1. Objective Function

This notebook follows the introduction to [Gradient Descent](GradientDescent.ipynb), and carries out an analysis of a very simple system<br> presented by [Boyd and Vandenberghe](https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf) :

$\qquad y = \frac{1}{2} \left( x_1^2 + b x_2^2\right)\;\;$ for $0 \le b \le 1$,<br> i.e.,<br>
$\qquad y = x^t A x,\;\;\text{ where } A=\frac{1}{2} \begin{pmatrix} 1 & 0 \\ 0 & b \end{pmatrix}$ 

In [12]:
def obj_func( x, b=1 ):
    """Quadratic objective 0.5*(x_1^2 + b*x_2^2) evaluated at the point `x`."""
    x1, x2 = x[0], x[1]
    return 0.5*(x1*x1 + b*x2*x2)
def gradient_obj_func( x, b=1 ):
    """Gradient (x_1, b*x_2) of the objective 0.5*(x_1^2 + b*x_2^2) at `x`."""
    d_dx1 = x[0]
    d_dx2 = b*x[1]
    return np.array([d_dx1, d_dx2])

def show_change_with_b(b):
    """Show the objective for a given `b` as a 3-D surface next to its contour bands.

    The surface is rendered with the plotly backend and the contours with the
    bokeh backend; a spacer pushes the contour plot down beside the surface.
    Returns a pn.Column with a heading above the row of two plots.
    """
    objective = lambda x: obj_func(x,b)
    surface   = show_objective_surface(objective)
    _, bands  = show_objective_surface_contours(objective)   # the raw image is not displayed

    surface_pane = hv.render( surface, backend='plotly')
    contour_pane = hv.render( bands, backend='bokeh')

    return pn.Column(
        pn.pane.Markdown("## Objective Surface and Contour Lines"),
        pn.Row( surface_pane,
                pn.Column( pn.Spacer(height=130), contour_pane )))
# Redraw the surface and its contour bands whenever the slider for b moves.
interact(
    show_change_with_b,
    b = pn.widgets.FloatSlider(name="b", start=0, end=1, step=0.01, value=0.1),
)

**Observe** that as $b$ decreases, the valley containing the minimum becomes narrower and flatter!

# 2. Gradient Descent

Given the objective function
$\;\; F(x) = \frac{1}{2} \left( x_1^2 + b x_2^2 \right), \;\;$ the gradient is given by $\;\; \nabla F = \begin{pmatrix} x_1 \\ b x_2 \end{pmatrix}$,<br>
$\qquad$ and the origin is a global minimum.

For this simple example, we can readily compute the successive points reached by the gradient descent algorithm.

$\qquad \begin{pmatrix}x^{(n+1)}_1 \\  x^{(n+1)}_2 \end{pmatrix} = \begin{pmatrix}x^{(n)}_1 \\  x^{(n)}_2 \end{pmatrix} - \mu  \begin{pmatrix}x^{(n)}_1 \\  b x^{(n)}_2 \end{pmatrix}  = \begin{pmatrix}(1-\mu)\ x^{(n)}_1 \\  (1 - \mu b)\ x^{(n)}_2 \end{pmatrix} $

Which has solution

$\qquad \begin{pmatrix}x^{(n)}_1 \\  x^{(n)}_2 \end{pmatrix}
= \begin{pmatrix} 1 - \mu & 0 \\ 0 & 1 - \mu b \end{pmatrix}^n \begin{pmatrix}x^{(0)}_1 \\ x^{(0)}_2  \end{pmatrix} $

The value of the objective function at any one estimate $x^{(n)}$ is given by
$F(x^{(n)}) = \frac{1}{2} \left[ (1-\mu)^{2n} \left( x^{(0)}_1 \right)^2 + b\,(1- b \mu)^{2n} \left( x^{(0)}_2 \right)^2 \right]$<br>
$\qquad$ which converges to 0 provided $0 < \mu < 2$ and $0 < b \le 1$. 

In [13]:
%output backend='bokeh'
# Common starting point for the fixed-step descent demo.
x_0 = np.array([3., 3])

# Sliders choose the step size mu and the curvature b; every change re-runs
# at most 150 descent steps from x_0 with tolerance 1e-6 and redraws the plots.
interact(
    lambda mu, b: show_gradient_descent_on_contour(
        lambda x: obj_func(x, b),
        lambda x: gradient_obj_func(x, b),
        x_0, mu, 150, (0,50), tol=1e-6,
    ),
    mu = pn.widgets.FloatSlider(name="mu", start=0, end=2, step=0.01, value=1.6),
    b  = pn.widgets.FloatSlider(name="b",  start=0, end=1, step=0.05, value=0.1),
)

**Observe:**
* Each step moves from the current location in the direction of the negative gradient, i.e., orthogonal to the level curves
* When $b < 1$ the negative gradient does not point at the origin, resulting in a "zig-zag" pattern when $\mu$ is large enough<br>
to cross the valley floor.
* As we approach the critical point at the origin, the gradient tends to zero: the steps taken get smaller.
* when $b = 1$, the contour lines are circles: each step is in the right direction (but may overshoot the critical point)
* when $b = 1, \mu = 1$, we reach the minimum in a single step.
* as $b$ gets smaller, more iterations are required to approach the critical point.
* as we approach the critical point, progress in covering the remaining distance becomes linear! (**First order convergence**)

# 3. Improved Algorithm

## 3.1. Exact Line Search

Given an objective function $y = F(\mathbf{x})$, the steepest descent algorithm moves from some starting point $\mathbf x$<br>
$\qquad$ in the direction $-\nabla F$, i.e., along the line $\left\{\ \mathbf x - \mu \nabla F(\mathbf x), \; \forall \mu \in \mathbb{R} \right\}\ $.

One possible alternative to fixing the step size $\mu$ is to select it such that it minimizes the objective function<br>
along the direction of the negative gradient (i.e., descend as far as possible):<br>
$\qquad\quad \mu =\; argmin_{\mu}\ F(\ { \mathbf x - \mu \nabla F(\mathbf x) }\ )$

This is known as **Exact Line Search.**

**Remark:** The gradient at the destination point will be orthogonal to this direction.

____

This is easily computed analytically for the problem $F(x) = \frac{1}{2} \left( x_1^2 + b x_2^2 \right).$

The value $\mu$ that minimizes
$\; F( \mathbf x - \mu \nabla F ) =  \frac{1}{2} \left( (1-\mu)^2 x_1^2 + b (1 - b \mu)^2 x_2^2 \right)$
$\;$ is given by
$\; \mu =  \frac{x_1^2 + b^2 x_2^2}{x_1^2 + b^3 x_2^2}$

In [8]:
def mu_func(x,b=1):
    """Exact line-search step size for F(x) = 0.5*(x_1^2 + b*x_2^2).

    Returns the mu minimizing F(x - mu*grad F(x)), namely
        (x_1^2 + b^2 x_2^2) / (x_1^2 + b^3 x_2^2).

    For b >= 0 the denominator is zero only when x_1 = 0 and b*x_2 = 0, i.e.
    when the gradient (x_1, b*x_2) itself vanishes. Any step size then yields a
    zero step, so 0.0 is returned instead of evaluating 0/0 (which would give
    nan and contaminate every later iterate).
    """
    numerator   = x[0]*x[0] + b*b*x[1]*x[1]
    denominator = x[0]*x[0] + b**3*x[1]*x[1]
    if denominator == 0:
        return 0.0      # stationary point: the step is zero whatever mu is
    return numerator/denominator

def gradient_descent_with_line_search_example( f, df, x, mu, n_max, tol = 1.e-6):
    """Gradient descent whose step size is recomputed at every iterate.

    Parameters
    ----------
    f     : callable returning the objective value at a point
    df    : callable returning the gradient of the objective at a point
    x     : starting point
    mu    : callable returning the step size to use at a given point
    n_max : maximum number of steps to attempt
    tol   : a step whose norm is below this value counts as convergence

    Returns
    -------
    (solution, history) where `history` is the pair (visited points, objective
    values) including the starting point, and `solution` is the pair
    (final point, final value) on convergence, or None otherwise.
    """
    y      = f(x)               # starting height
    vals_x = [x]
    vals_y = [y]

    for i in range(n_max):      # don't try more than n_max times
        grad = df(x)
        if np.any(grad):
            delta_x = -mu(x)*grad   # move downhill by the locally chosen step size
        else:
            # Exactly zero gradient: the step is zero whatever the step size is.
            # Do not call mu here - step-size formulas are often 0/0 at a
            # stationary point, and the resulting nan would make every later
            # iterate nan and the run would never be reported as converged.
            delta_x = np.zeros_like(grad, dtype=float)
        x_new   = x + delta_x
        y_new   = f(x_new)
        vals_x.append(x_new); vals_y.append(y_new)

        if np.linalg.norm(delta_x) < tol: return (x_new,y_new), (vals_x, vals_y)   # converged
        x,y = x_new,y_new

    return None, (vals_x,vals_y)  # failed to converge

def show_gradient_descent_on_contour_with_line_search_example( f, df, x, mu, n_max, ylim, tol=1e-4):
    """Run the line-search descent and show its path and objective history.

    Left panel: contour bands of `f` on [-3,3]x[-3,3] with the path in red and
    the rounded solution (or None when not converged) in the title.
    Right panel: objective value versus iteration index.

    `mu` is the step-size callable forwarded to
    `gradient_descent_with_line_search_example`; `ylim` is accepted but not
    used by this function. Returns a pn.Row with the two plots.
    """
    sol, steps = gradient_descent_with_line_search_example( f, df, x, mu, n_max,tol)
    iterates   = steps[0]
    x_i = [point[0] for point in iterates]
    y_i = [point[1] for point in iterates]

    # only the location (not the height) goes into the title, rounded to 5 decimals
    sol = None if sol is None else np.round(sol[0], 5)

    _, hf = show_objective_surface_contours(f,lims=(-3,3), l=[0.1,0.4,1,2,5,10,20])

    background    = hf.opts(width=350,height=350,xlim=(-3,3),ylim=(-3,3),title=f"Contour Lines, Solution {sol}" )
    path_line     = hv.Curve(  (x_i, y_i)).opts(color="red")
    path_markers  = hv.Scatter((x_i, y_i)).opts(tools=['hover'],color="red", size=4)
    height_history = hv.Curve( steps[1], "index", "y" )
    height_history = height_history.opts(height=350, title="y versus Iteration Index", tools=["hover"], show_grid=True)

    return pn.Row( background*path_line*path_markers, height_history )

In [9]:
%output backend='bokeh'
# NOTE: x_0 is assigned here but the call below does not read it; the descent
# starts from the b-dependent point (3*b, 3) instead.
x_0 = np.array([3., 3])

# One slider for b; each change re-runs at most 150 exact line-search steps
# with tolerance 1e-6.
interact(
    lambda b: show_gradient_descent_on_contour_with_line_search_example(
        lambda x: obj_func(x, b),
        lambda x: gradient_obj_func(x, b),
        np.array([3*b, 3.]),
        lambda x: mu_func(x, b),
        150, (0,50), tol=1e-6,
    ),
    b = pn.widgets.FloatSlider(name="b", start=0, end=1, step=0.01, value=0.3),
)

If we choose the starting point $\;\mathbf x^{(0)} = \begin{pmatrix}b \\ 1\end{pmatrix}\;$ the formulae for the iterates simplify considerably:

$\qquad \begin{pmatrix}x^{(n+1)}_1 \\  x^{(n+1)}_2 \end{pmatrix} = \begin{pmatrix}(1-\mu)\ x^{(n)}_1 \\  (1 - \mu b)\ x^{(n)}_2 \end{pmatrix}, \; \text{ where } \; \mu = \frac{\left(x_1^{(n)}\right)^2 + b^2 \left(x_2^{(n)}\right)^2}{\left(x_1^{(n)}\right)^2 + b^3 \left(x_2^{(n)}\right)^2}$

$ \qquad \therefore \quad \begin{pmatrix}x^{(n)}_1 \\  x^{(n)}_2 \end{pmatrix}
= \left(\frac{1-b}{1+b}\right)^n \begin{pmatrix} (-1)^n b \\ 1 \end{pmatrix}\;\;$ and
$\;\; F\left( x^{(n)} \right) = \left(\frac{1-b}{1+b}\right)^{2 n} F(b,1)$

**Remarks:**
* With $b = 1$, we obtain the exact solution in a single step: $\;\;F(x^{(1)}) = 0.$
* When $b \ll 1$, we find $\;\;F(x^{(n)}) \approx (1-2b)^{2n} F(b,1)$,<br> i.e.,
progress toward the solution virtually stops when $b$ is small.<br>
The objective function is a narrow valley, and the iterates move sideways in a zig-zag pattern,<br>
ever so slowly converging toward the solution.

## 3.2. Accelerated Descent: Momentum

**Idea:** incorporate the previous step size:<br>
$\qquad  x^{(n+1)} = x^{(n)} - \Delta^{(n)}\;\;$
where $\;\; \Delta^{(n)} = \mu \nabla F(x^{(n)}) +\beta \Delta^{(n-1)}$

In [15]:
def gradient_descent_with_momentum( f, df, x, mu, beta, n_max, tol = 1.e-6):
    """Gradient descent with a momentum term.

    Each step is  delta = mu*df(x) + beta*(previous delta)  and the iterate is
    updated as  x <- x - delta.

    Parameters
    ----------
    f     : callable returning the objective value at a point
    df    : callable returning the gradient of the objective at a point
    x     : starting point (any dimension)
    mu    : step size multiplying the gradient
    beta  : weight of the previous step
    n_max : maximum number of steps to attempt
    tol   : a step whose norm is below this value counts as convergence

    Returns
    -------
    (solution, history) where `history` is the pair (visited points, objective
    values) including the starting point, and `solution` is the pair
    (final point, final value) on convergence, or None otherwise.
    """
    y       = f(x)               # starting height
    vals_x  = [x]
    vals_y  = [y]
    # There is no previous step yet. Shape the zero step after x rather than
    # hard-coding two components, so the routine works in any dimension.
    delta_x = np.zeros_like(x, dtype=float)

    for i in range(n_max):      # don't try more than n_max times
        delta_x = mu*df(x) + beta*delta_x
        x_new   = x - delta_x   # move downhill (hence the minus sign)
        y_new   = f(x_new)
        vals_x.append(x_new); vals_y.append(y_new)

        if np.linalg.norm(delta_x) < tol: return (x_new,y_new), (vals_x, vals_y)   # converged
        x,y = x_new,y_new

    return None, (vals_x,vals_y)  # failed to converge

def show_gradient_descent_with_momentum_on_contour(f, df, x, mu, beta, n_max, ylim, tol=1e-4):
    """Compare plain gradient descent with the momentum variant in three panels.

    Both algorithms are run from the same start `x` with the same `mu`, `n_max`
    and `tol`. Panels (left to right):
      1. contour bands of `f` on [-3,3]x[-3,3] with both paths; the title shows
         the rounded solution of the momentum run (None if it did not converge)
      2. objective value versus iteration index for both runs
      3. ratio norm(x_i)/norm(x_(i-1)) of successive iterates for both runs

    `beta` is the momentum weight; `ylim` is accepted but not used by this
    function. Returns a pn.Row with the three overlays.
    """
    # --- reference run: plain gradient descent ---------------------------------
    sol, steps = gradient_descent(f,df,x,mu,n_max,tol)
    x_i        = [p[0] for p in steps[0]]
    y_i        = [p[1] for p in steps[0]]
    # An iterate may land exactly on the origin, making a later ratio 0/0.
    # The nan/inf is simply not drawn, so suppress the floating point warning.
    d_i        = np.array([np.linalg.norm(p) for p in steps[0]])
    with np.errstate(divide="ignore", invalid="ignore"):
        progress = d_i[1:]/d_i[:-1]

    h_orig   = hv.Curve((x_i, y_i), label="original")\
                 .opts(color="olive", muted_alpha=0)
    h_y_orig = hv.Curve( steps[1], "index", "y", label="original" )\
                .opts(height=350, color="olive", title="y versus Iteration Index", tools=["hover"], show_grid=True)
    h_p_orig = hv.Curve( (range(1,len(d_i)), progress), "index", "norm( x_i ) / norm(x_(i-1)", label="original"  )\
                 .opts(height=350, color="olive", title="Relative Progress", tools=["hover"], show_grid=True)

    # --- run with momentum (sol, steps, x_i, y_i, d_i are reused from here on) --
    sol, steps = gradient_descent_with_momentum(f,df,x,mu,beta,n_max,tol)
    x_i    = [p[0] for p in steps[0]]
    y_i    = [p[1] for p in steps[0]]

    d_i = np.array([np.linalg.norm(p) for p in steps[0]])
    with np.errstate(divide="ignore", invalid="ignore"):
        progress_M = d_i[1:]/d_i[:-1]

    if sol is not None: sol = np.round(sol[0], 5)   # keep only the point, rounded for the title

    _, hf = show_objective_surface_contours(f,lims=(-3,3), l=[0.1,0.4,1,2,5,10,20])
    return pn.Row( 
        (hf.opts(width=350,height=350,xlim=(-3,3),ylim=(-3,3),title=f"Contour Lines, Solution {sol}" )\
        *h_orig*hv.Curve(  (x_i, y_i),label="with momentum")\
                  .opts(tools=['hover'], muted_alpha=0)).opts( legend_position="top"),
        (h_y_orig*hv.Curve( steps[1],  label="with momentum" )\
                   .opts( tools=["hover"], muted_alpha=0, line_width=3))\
                 .opts(legend_position="top"),
        (h_p_orig*hv.Curve( (range(1,len(d_i)), progress_M),  label="with momentum" )\
                    .opts( title="Relative Progress", tools=["hover"], line_width=3, muted_alpha=0, height=350 ))\
                 .opts(legend_position="top")
    )

In [16]:
# Starting point shared by the plain and the momentum runs.
x_0 = np.array([3., 3])

# Sliders for the step size mu, the momentum weight beta and the curvature b;
# every change re-runs both algorithms for at most 150 steps with tolerance 1e-6.
interact(
    lambda mu, beta, b: show_gradient_descent_with_momentum_on_contour(
        lambda x: obj_func(x, b),
        lambda x: gradient_obj_func(x, b),
        x_0, mu, beta, 150, (0,50), tol=1e-6,
    ),
    mu   = pn.widgets.FloatSlider(name="mu",   start=0, end=2, step=0.01, value=1.6),
    beta = pn.widgets.FloatSlider(name="beta", start=0, end=1, step=0.01, value=0.1),
    b    = pn.widgets.FloatSlider(name="b",    start=0, end=1, step=0.01, value=0.1),
)