In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## <font color='blue' size=6pt> Paths of SGD, Momentum, RMSProp and ADAM </font>

### <font color='blue' size=5pt> Gradient Descent Methods for Optimization </font>

All the current optimization algorithms are based on a variant of gradient descent.

Let's denote the vector of weights at step $t$ by $w_t$ and the gradient of the objective function with respect to the weights by $g_t$. The idea is that the gradient descent algorithm updates the weights under the following principle:

$$\large w_t = w_{t-1} - \eta\cdot g_{t,t-1}$$

When the objective function (whose gradient with respect to the weights is denoted by $g_t$) has multiple local minima, or has a very shallow region containing the minimum, the plain gradient descent algorithm may not converge to the desired position. To remedy this deficiency, researchers have proposed alternatives that vary how the learning rate is evaluated at each step or how the "velocity" used to update the weights is computed:

<font color='green'>$$\Large w_t = w_{t-1} - \eta_t\cdot v_t$$ </font>

In the equation above, $\eta_t$ is an adaptive learning rate and $v_t$ a modified gradient.

### <font color='blue' size=5pt> Dynamic Learning Rates </font>

We can consider an exponential decay, such as

$$\large \eta_t = \eta_0 e^{-\lambda\cdot t}$$

or a polynomial decay

$$\large \eta_t = \eta_0 (\beta t+1)^{-\alpha}$$

### <font color='blue' size=5pt> Momentum Gradient Descent </font>

$$\large g_{t,t-1} = \partial_w \frac{1}{|\text{B}_t|}\sum_{i\in \text{B}_t}f(x_i,w_{t-1})=\frac{1}{|\text{B}_t|}\sum_{i\in \text{B}_t}h_{i,t-1}$$

The velocity accumulates these mini-batch gradients:

$$\large v_t = \beta v_{t-1} + g_{t,t-1}$$

and $\beta\in (0,1).$

For an explicit formula, we have

$$\large v_t = \sum_{\tau=0}^{t-1} \beta^{\tau}g_{t-\tau,t-\tau-1}$$


and

$$\large w_t = w_{t-1} - \alpha v_t$$

where

$$\large \alpha = \frac{\eta}{1-\beta}$$


### <font color='blue' size=5pt> AdaGrad (Adaptive Gradient Descent) </font>

$$\large s_t = s_{t-1} + g_{t}^2$$

and

$$\large w_t= w_{t-1} - \frac{\eta}{\sqrt{s_t+\epsilon}}\cdot g_t$$

### <font color='blue' size=5pt> RMSProp (Root Mean Square Propagation) </font>

$$\large s_t = \gamma\cdot s_{t-1} + (1-\gamma)\cdot g_{t}^2$$

and

$$\large w_t= w_{t-1} - \frac{\eta}{\sqrt{s_t+\epsilon}}\cdot g_t$$

Thus, we have

$$\large s_t = (1-\gamma)\cdot (g_t^2+\gamma g_{t-1}^2+\gamma^2 g_{t-2}^2 +\gamma^3 g_{t-3}^2 + ... + \gamma^{t} g_0^2)$$

### <font color='blue' size=5pt> ADAM (Adaptive Moment Estimation) </font>

$$\large v_t = \beta_1  v_{t-1} +(1-\beta_1) g_t$$

and

$$\large s_t = \beta_2 s_{t-1} + (1-\beta_2) g_t^2 $$

We further consider

$$\large \hat{v}_t = \frac{v_t}{1-\beta_1^t}, \text{ and  } \hat{s}_t = \frac{s_t}{1-\beta_2^t} $$


and

$$\large \hat{g}_{t} = \frac{\eta\cdot \hat{v}_t}{\sqrt{\hat{s}_t}+\epsilon}$$

The updates to the weights are implemented as follows

$$\large w_t = w_{t-1} - \hat{g}_t$$


In [None]:

# Parameters
# Grid resolution: one sample per 5 pixels of a nominal 1920x1200 canvas.
width = 1920
height = 1200
nx = width // 5
ny = height // 5
h = 1e-7  # step size of the forward difference in grad_f
drawing_time = 30  # not referenced by any of the visible cells

# Sample the square [-2, 2] x [-2, 2]; X and Y have shape (ny, nx).
domain_x, domain_y = np.linspace(-2, 2, nx), np.linspace(-2, 2, ny)
X, Y = np.meshgrid(domain_x, domain_y)

# Function definitions
def f(x, y):
    """Objective surface: a quadratic bowl with two Gaussian wells.

    The bowl is ``x**2 + y**2``; the wells are centred at (1, 0) with
    depth 2 and at (-1, 0) with depth 3. Works element-wise on scalars
    or NumPy arrays of matching shape.
    """
    well_at_plus_one = -2 * np.exp(-((x - 1) ** 2 + y ** 2) / 0.2)
    well_at_minus_one = -3 * np.exp(-((x + 1) ** 2 + y ** 2) / 0.2)
    # Keep the left-to-right summation order so results stay bit-identical.
    return well_at_plus_one + well_at_minus_one + x ** 2 + y ** 2

def grad_f(x, y):
    """Approximate the gradient of ``f`` at (x, y) with forward differences.

    Uses the module-level step ``h`` and returns ``np.array([df/dx, df/dy])``.
    """
    value_here = f(x, y)
    d_dx = (f(x + h, y) - value_here) / h
    d_dy = (f(x, y + h) - value_here) / h
    return np.array([d_dx, d_dy])

# Objective evaluated on the whole mesh; used as the contour/heatmap background.
Z = f(X, Y)

# Optimizers
def get_sgd_path(x0, y0, learning_rate, num_steps):
    """Trace plain gradient descent starting at (x0, y0).

    Each iterate is the previous one minus ``learning_rate`` times the
    gradient reported by ``grad_f`` at the previous iterate.

    Returns an array of shape (num_steps, 2); row 0 is the start point.
    """
    trajectory = np.zeros((num_steps, 2))
    trajectory[0] = (x0, y0)
    for k in range(1, num_steps):
        here = trajectory[k - 1]
        trajectory[k] = here - learning_rate * grad_f(here[0], here[1])
    return trajectory

def get_momentum_path(x0, y0, learning_rate, num_steps, momentum):
    """Trace gradient descent with momentum starting at (x0, y0).

    The step vector starts at zero and is updated as
    ``momentum * step - learning_rate * grad``; the new iterate is the
    previous iterate plus that step.

    Returns an array of shape (num_steps, 2); row 0 is the start point.
    """
    trajectory = np.zeros((num_steps, 2))
    trajectory[0] = (x0, y0)
    step_vector = np.zeros(2)
    for k in range(1, num_steps):
        here = trajectory[k - 1]
        gradient = grad_f(here[0], here[1])
        step_vector = momentum * step_vector - learning_rate * gradient
        trajectory[k] = here + step_vector
    return trajectory

def get_rmsprop_path(x0, y0, learning_rate, num_steps, decay_rate, eps):
    """Trace RMSProp starting at (x0, y0).

    Keeps an exponential moving average (weight ``decay_rate``) of the
    squared gradient and divides each gradient component by the square
    root of that average plus ``eps`` before taking the step.

    Returns an array of shape (num_steps, 2); row 0 is the start point.
    """
    trajectory = np.zeros((num_steps, 2))
    trajectory[0] = (x0, y0)
    mean_square = np.zeros(2)
    for k in range(1, num_steps):
        here = trajectory[k - 1]
        gradient = grad_f(here[0], here[1])
        mean_square = decay_rate * mean_square + (1 - decay_rate) * gradient ** 2
        scale = np.sqrt(mean_square) + eps
        trajectory[k] = here - learning_rate * gradient / scale
    return trajectory

def get_adam_path(x0, y0, learning_rate, num_steps, beta_1, beta_2, eps):
    """Trace the Adam optimizer starting at (x0, y0).

    Parameters
    ----------
    x0, y0 : float
        Starting coordinates; stored as row 0 of the result.
    learning_rate : float
        Step size multiplying the bias-corrected update.
    num_steps : int
        Number of rows in the returned path (start point included).
    beta_1, beta_2 : float
        Decay rates of the first- and second-moment moving averages.
    eps : float
        Constant added to the square root of the second moment.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_steps, 2) holding the successive iterates.
    """
    path = np.zeros((num_steps, 2))
    path[0] = np.array([x0, y0])
    m, v = np.zeros(2), np.zeros(2)
    for i in range(1, num_steps):
        grad = grad_f(*path[i-1])
        m = beta_1 * m + (1 - beta_1) * grad
        v = beta_2 * v + (1 - beta_2) * grad**2
        # The loop index starts at 1, so ``i`` is already the 1-based update
        # count t used in the bias correction 1 - beta**t. Using i + 1 would
        # under-correct the moments, most visibly on the first steps.
        m_hat = m / (1 - beta_1**i)
        v_hat = v / (1 - beta_2**i)
        path[i] = path[i-1] - learning_rate * m_hat / (np.sqrt(v_hat) + eps)
    return path

# Generate paths

def gen_paths(x0, y0):
    """Run SGD, Momentum, RMSProp and Adam from (x0, y0) and plot the paths.

    The trajectories are drawn over a contour/heatmap of the module-level
    grid ``Z`` (axes ``domain_x`` / ``domain_y``) and the figure is shown.
    The hyperparameters of each optimizer are fixed inside the function.
    Returns None.
    """
    # (legend label, line colour, trajectory), listed in drawing order.
    runs = [
        ('SGD', 'black', get_sgd_path(x0, y0, 0.02, 500)),
        ('Momentum', 'blue', get_momentum_path(x0, y0, 0.01, 200, 0.8)),
        ('RMSProp', 'red', get_rmsprop_path(x0, y0, 0.01, 300, 0.99, 1e-6)),
        ('Adam', 'green', get_adam_path(x0, y0, 0.01, 500, 0.7, 0.999, 1e-6)),
    ]

    # Plotting
    fig = make_subplots(rows=1, cols=1, subplot_titles=("Contour Plot and Optimization Paths",))

    background = go.Contour(
        z=Z,
        x=domain_x,
        y=domain_y,
        contours=dict(
            coloring='heatmap',
            showlabels=True,
            labelfont=dict(size=12, color='white', family='Arial'),
        ),
        colorbar=dict(title="Function value"),
    )
    fig.add_trace(background, row=1, col=1)

    for label, colour, trajectory in runs:
        fig.add_trace(
            go.Scatter(
                x=trajectory[:, 0],
                y=trajectory[:, 1],
                mode='lines+markers',
                name=label,
                line=dict(color=colour),
            ),
            row=1,
            col=1,
        )

    fig.update_layout(
        title="Optimization Path Visualizations",
        xaxis_title="X-axis",
        yaxis_title="Y-axis",
        legend=dict(x=0.1, y=0.9),
    )

    fig.show()


In [None]:
# Compare the optimizers from the start point (1.3, 0.5).
gen_paths(x0=1.3, y0=0.5)

### More Gradient-based Optimizers Included
---

In [None]:
# Parameters
# Grid resolution: one sample per 5 pixels of a nominal 1920x1080 canvas.
width = 1920
height = 1080
nx = width // 5
ny = height // 5
h = 1e-7  # step size of the forward difference in grad_f
drawing_time = 30  # not referenced by any of the visible cells

# Sample the square [-2, 2] x [-2, 2]; X and Y have shape (ny, nx).
domain_x, domain_y = np.linspace(-2, 2, nx), np.linspace(-2, 2, ny)
X, Y = np.meshgrid(domain_x, domain_y)

# Function definitions
def f(x, y):
    """Objective surface: a quadratic bowl with two Gaussian wells.

    The bowl is ``x**2 + y**2``; the wells are centred at (1, 0) with
    depth 2 and at (-1, 0) with depth 3. Works element-wise on scalars
    or NumPy arrays of matching shape.
    """
    well_at_plus_one = -2 * np.exp(-((x - 1) ** 2 + y ** 2) / 0.2)
    well_at_minus_one = -3 * np.exp(-((x + 1) ** 2 + y ** 2) / 0.2)
    # Keep the left-to-right summation order so results stay bit-identical.
    return well_at_plus_one + well_at_minus_one + x ** 2 + y ** 2

def grad_f(x, y):
    """Approximate the gradient of ``f`` at (x, y) with forward differences.

    Uses the module-level step ``h`` and returns ``np.array([df/dx, df/dy])``.
    """
    value_here = f(x, y)
    d_dx = (f(x + h, y) - value_here) / h
    d_dy = (f(x, y + h) - value_here) / h
    return np.array([d_dx, d_dy])

# Objective evaluated on the whole mesh; used as the contour/heatmap background.
Z = f(X, Y)

# Optimizers
def get_sgd_path(x0, y0, learning_rate, num_steps):
    """Trace plain gradient descent starting at (x0, y0).

    Each iterate is the previous one minus ``learning_rate`` times the
    gradient reported by ``grad_f`` at the previous iterate.

    Returns an array of shape (num_steps, 2); row 0 is the start point.
    """
    trajectory = np.zeros((num_steps, 2))
    trajectory[0] = (x0, y0)
    for k in range(1, num_steps):
        here = trajectory[k - 1]
        trajectory[k] = here - learning_rate * grad_f(here[0], here[1])
    return trajectory

def get_momentum_path(x0, y0, learning_rate, num_steps, momentum):
    """Trace gradient descent with momentum starting at (x0, y0).

    The step vector starts at zero and is updated as
    ``momentum * step - learning_rate * grad``; the new iterate is the
    previous iterate plus that step.

    Returns an array of shape (num_steps, 2); row 0 is the start point.
    """
    trajectory = np.zeros((num_steps, 2))
    trajectory[0] = (x0, y0)
    step_vector = np.zeros(2)
    for k in range(1, num_steps):
        here = trajectory[k - 1]
        gradient = grad_f(here[0], here[1])
        step_vector = momentum * step_vector - learning_rate * gradient
        trajectory[k] = here + step_vector
    return trajectory

def get_rmsprop_path(x0, y0, learning_rate, num_steps, decay_rate, eps):
    """Trace RMSProp starting at (x0, y0).

    Keeps an exponential moving average (weight ``decay_rate``) of the
    squared gradient and divides each gradient component by the square
    root of that average plus ``eps`` before taking the step.

    Returns an array of shape (num_steps, 2); row 0 is the start point.
    """
    trajectory = np.zeros((num_steps, 2))
    trajectory[0] = (x0, y0)
    mean_square = np.zeros(2)
    for k in range(1, num_steps):
        here = trajectory[k - 1]
        gradient = grad_f(here[0], here[1])
        mean_square = decay_rate * mean_square + (1 - decay_rate) * gradient ** 2
        scale = np.sqrt(mean_square) + eps
        trajectory[k] = here - learning_rate * gradient / scale
    return trajectory

def get_adam_path(x0, y0, learning_rate, num_steps, beta_1, beta_2, eps):
    """Trace the Adam optimizer starting at (x0, y0).

    Parameters
    ----------
    x0, y0 : float
        Starting coordinates; stored as row 0 of the result.
    learning_rate : float
        Step size multiplying the bias-corrected update.
    num_steps : int
        Number of rows in the returned path (start point included).
    beta_1, beta_2 : float
        Decay rates of the first- and second-moment moving averages.
    eps : float
        Constant added to the square root of the second moment.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_steps, 2) holding the successive iterates.
    """
    path = np.zeros((num_steps, 2))
    path[0] = np.array([x0, y0])
    m, v = np.zeros(2), np.zeros(2)
    for i in range(1, num_steps):
        grad = grad_f(*path[i-1])
        m = beta_1 * m + (1 - beta_1) * grad
        v = beta_2 * v + (1 - beta_2) * grad**2
        # The loop index starts at 1, so ``i`` is already the 1-based update
        # count t used in the bias correction 1 - beta**t. Using i + 1 would
        # under-correct the moments, most visibly on the first steps.
        m_hat = m / (1 - beta_1**i)
        v_hat = v / (1 - beta_2**i)
        path[i] = path[i-1] - learning_rate * m_hat / (np.sqrt(v_hat) + eps)
    return path

def get_ftrl_path(x0, y0, learning_rate, num_steps, alpha, beta, l1, l2):
    """Trace the FTRL-Proximal optimizer starting at (x0, y0).

    Parameters
    ----------
    x0, y0 : float
        Starting coordinates; stored as row 0 of the result.
    learning_rate : float
        Not used by the update (the per-coordinate rate is derived from
        ``alpha`` and ``beta``); kept so the signature matches the other
        optimizers and existing callers.
    num_steps : int
        Number of rows in the returned path (start point included).
    alpha, beta : float
        Per-coordinate learning-rate schedule parameters.
    l1, l2 : float
        L1 and L2 regularisation strengths.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_steps, 2) holding the successive iterates.
    """
    path = np.zeros((num_steps, 2))
    path[0] = np.array([x0, y0])
    z, n = np.zeros(2), np.zeros(2)
    for i in range(1, num_steps):
        grad = grad_f(*path[i-1])
        sigma = (np.sqrt(n + grad**2) - np.sqrt(n)) / alpha
        z += grad - sigma * path[i-1]
        n += grad**2
        shrunk = -(z - np.sign(z) * l1) / ((beta + np.sqrt(n)) / alpha + l2)
        # Soft-threshold: a coordinate whose |z| does not exceed l1 is set to
        # exactly 0. Without this the shrunk value has the wrong sign there.
        path[i] = np.where(np.abs(z) <= l1, 0.0, shrunk)
    return path

def get_adagrad_path(x0, y0, learning_rate, num_steps, eps):
    """Trace AdaGrad starting at (x0, y0).

    Accumulates the running sum of squared gradients and divides each
    gradient component by the square root of that sum plus ``eps``
    before taking the step.

    Returns an array of shape (num_steps, 2); row 0 is the start point.
    """
    trajectory = np.zeros((num_steps, 2))
    trajectory[0] = (x0, y0)
    squared_sum = np.zeros(2)
    for k in range(1, num_steps):
        here = trajectory[k - 1]
        gradient = grad_f(here[0], here[1])
        squared_sum = squared_sum + gradient ** 2
        scale = np.sqrt(squared_sum) + eps
        trajectory[k] = here - learning_rate * gradient / scale
    return trajectory

# Paths
# Common start point for every optimizer.
initial_x, initial_y = 0.7, -1.4

# One trajectory per optimizer, with hyperparameters spelled out by name.
sgd_path = get_sgd_path(initial_x, initial_y, learning_rate=0.02, num_steps=500)
momentum_path = get_momentum_path(
    initial_x, initial_y, learning_rate=0.01, num_steps=200, momentum=0.8
)
rmsprop_path = get_rmsprop_path(
    initial_x, initial_y, learning_rate=0.01, num_steps=300, decay_rate=0.99, eps=1e-6
)
adam_path = get_adam_path(
    initial_x, initial_y, learning_rate=0.01, num_steps=500, beta_1=0.7, beta_2=0.999, eps=1e-6
)
ftrl_path = get_ftrl_path(
    initial_x, initial_y, learning_rate=0.01, num_steps=500, alpha=0.1, beta=1, l1=1, l2=1
)
adagrad_path = get_adagrad_path(
    initial_x, initial_y, learning_rate=0.01, num_steps=500, eps=1e-6
)

# Plotting
fig = make_subplots(rows=1, cols=1, subplot_titles=("Contour Plot and Optimization Paths",))

# Background: heatmap-coloured contour of the objective over the sampling grid.
fig.add_trace(
    go.Contour(
        z=Z,
        x=domain_x,
        y=domain_y,
        contours=dict(
            coloring='heatmap',
            showlabels=True,
            labelfont=dict(size=12, color='white', family='Arial'),
        ),
        colorbar=dict(title="Function value"),
    ),
    row=1,
    col=1,
)


def _path_trace(path, label, colour):
    """Build a lines+markers Scatter trace from a path with x in column 0 and y in column 1."""
    return go.Scatter(
        x=path[:, 0],
        y=path[:, 1],
        mode='lines+markers',
        name=label,
        line=dict(color=colour),
    )


sgd_trace = _path_trace(sgd_path, 'SGD', 'black')
momentum_trace = _path_trace(momentum_path, 'Momentum', 'blue')
rmsprop_trace = _path_trace(rmsprop_path, 'RMSProp', 'red')
adam_trace = _path_trace(adam_path, 'Adam', 'green')
ftrl_trace = _path_trace(ftrl_path, 'FTRL', 'orange')
adagrad_trace = _path_trace(adagrad_path, 'AdaGrad', 'purple')

# Add the optimizer paths on top of the contour, in legend order.
for optimizer_trace in (
    sgd_trace,
    momentum_trace,
    rmsprop_trace,
    adam_trace,
    ftrl_trace,
    adagrad_trace,
):
    fig.add_trace(optimizer_trace, row=1, col=1)

fig.update_layout(
    title="Optimization Path Visualizations",
    xaxis_title="X-axis",
    yaxis_title="Y-axis",
    legend=dict(x=0.1, y=0.9),
)

fig.show()
