# Installing Libraries (Python version >= 3.8)

In [None]:
import sys

# Show the running interpreter version and stop early if it is unsupported.
version = sys.version_info
print(version)
# The notebook targets Python 3 with a minor version of at least 8; the
# message makes a failure self-explanatory instead of a bare AssertionError.
assert version.major == 3 and version.minor >= 8, (
    f"Python 3 (>= 3.8) is required, found {version.major}.{version.minor}"
)

In [None]:
# Version specifiers must not contain spaces: the shell would otherwise pass
# "numpy", "==" and "1.23.5" to pip as three separate (invalid) requirements.
!python -m pip install numpy==1.23.5 matplotlib==3.7.4

$$
W_{t+1} = W_t - \eta\frac{\partial L}{\partial W_t} \\
$$

$$
W_t: Parameters\,(Weights) \\
\eta: Learning\,Rate \\
L: Loss\,Function \\
$$

In [None]:
class SGD:
    """Plain stochastic gradient descent optimizer."""

    def __init__(self, lr: float = 0.01) -> None:
        """Store the step size.

        :param lr: Learning rate applied to every gradient.
        """
        self.lr = lr

    def update(self, params: dict, grads: dict) -> None:
        """Take one descent step, modifying ``params`` in place.

        :param params: Parameters, keyed by name.
        :param grads: Gradients, looked up with the same keys as ``params``.
        """
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step

$$
v_{t+1} = \alpha v_t - \eta\frac{\partial L}{\partial W_t} \\
W_{t+1} = W_t + v_{t+1} \\
$$

$$
\alpha: Momentum\,Coefficient \\
v_t: Velocity \\
\eta: Learning\,Rate \\
L: Loss\,Function \\
W_t: Parameters\,(Weights) \\
$$

In [None]:
class Momentum:
    """Gradient descent with a momentum (velocity) term."""

    def __init__(self, lr: float = 0.01, momentum: float = 0.9) -> None:
        """Store the hyperparameters; the velocity is created lazily.

        :param lr: Learning rate.
        :param momentum: Momentum coefficient applied to the previous velocity.
        """
        self.v = None  # per-parameter velocity, allocated on the first update
        self.momentum = momentum
        self.lr = lr

    def update(self, params: dict, grads: dict) -> None:
        """Advance the velocity and move ``params`` by it, in place.

        :param params: Parameters, keyed by name.
        :param grads: Gradients, looked up with the same keys as ``params``.
        """
        if self.v is None:
            # Start every parameter with a zero velocity of matching shape.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

$$
h_{t+1} = h_t + \frac{\partial L}{\partial W_t} \odot \frac{\partial L}{\partial W_t} \\
W_{t+1} = W_t - \eta\frac{1}{\sqrt{h_{t+1}} + \epsilon} \odot \frac{\partial L}{\partial W_t} \\
$$

$$
h_t: Accumulated\,Sum\,of\,Squared\,Gradients \\
\eta: Learning\,Rate \\
\epsilon: Smoothing\,Term \\
L: Loss\,Function \\
W_t: Parameters\,(Weights) \\
$$

In [None]:
class AdaGrad:
    """AdaGrad."""

    def __init__(self, lr: float = 0.01, eps: float = 1e-7) -> None:
        """Initialize instance.

        :param lr: Learning rate.
        :param eps: Smoothing term added to the square root of the accumulated
            squared gradients to avoid division by zero.
        """
        self.lr = lr
        self.eps = eps
        self.h = None  # accumulated squared gradients, created on first update

    def update(self, params: dict, grads: dict) -> None:
        """Update parameters in place.

        Each call adds the element-wise squared gradient to the running sum
        ``h`` and scales the step by ``1 / (sqrt(h) + eps)``.

        :param params: Parameters.
        :param grads: Gradients, looked up with the same keys as ``params``.
        """
        if self.h is None:
            self.h = {key: np.zeros_like(val) for key, val in params.items()}

        for key in params:
            grad = grads[key]
            self.h[key] += grad * grad
            params[key] -= self.lr * grad / (np.sqrt(self.h[key]) + self.eps)

$$
m_{t+1} = \beta_1 m_t + (1 - \beta_1)\frac{\partial L}{\partial W_t} \\
v_{t+1} = \beta_2 v_t + (1 - \beta_2)\frac{\partial L}{\partial W_t} \odot \frac{\partial L}{\partial W_t} \\
\hat{m}_{t+1} = \frac{m_{t+1}}{1 - \beta_1^{t+1}} \\
\hat{v}_{t+1} = \frac{v_{t+1}}{1 - \beta_2^{t+1}} \\
W_{t+1} = W_t - \eta\frac{\hat{m}_{t+1}}{\sqrt{\hat{v}_{t+1}} + \epsilon} \\
$$

$$
\beta_1: Exponential\,Decay\,Rate\,for\,the\,First\,Moment\,Estimate \\
\beta_2: Exponential\,Decay\,Rate\,for\,the\,Second\,Moment\,Estimate \\
m_t: First\,Moment\,Estimate \\
v_t: Second\,Moment\,Estimate \\
\eta: Learning\,Rate \\
\epsilon: Smoothing\,Term \\
L: Loss\,Function \\
W_t: Parameters\,(Weights) \\
$$

In [None]:
class Adam:
    """Adam."""

    def __init__(
        self,
        lr: float = 0.001,
        beta1: float = 0.9,
        beta2: float = 0.999,
        eps: float = 1e-7,
    ) -> None:
        """Initialize instance.

        :param lr: Learning rate.
        :param beta1: Exponential decay rate for the first moment estimates.
        :param beta2: Exponential decay rate for the second moment estimates.
        :param eps: Smoothing term added to the square root of the second
            moment estimate to avoid division by zero.
        """
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.iter = 0  # number of updates performed so far
        self.m = None  # first moment estimates, created on first update
        self.v = None  # second moment estimates, created on first update

    def update(self, params: dict, grads: dict) -> None:
        """Update parameters in place.

        :param params: Parameters.
        :param grads: Gradients, looked up with the same keys as ``params``.
        """
        if self.m is None:
            self.m = {key: np.zeros_like(val) for key, val in params.items()}
            self.v = {key: np.zeros_like(val) for key, val in params.items()}

        self.iter += 1
        # Both bias corrections are folded into a single effective step size
        # instead of rescaling m and v separately.
        lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for key in params:
            grad = grads[key]
            # Exponential moving averages, written in incremental form.
            self.m[key] += (1 - self.beta1) * (grad - self.m[key])
            self.v[key] += (1 - self.beta2) * (grad ** 2 - self.v[key])

            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + self.eps)

In [None]:
import matplotlib.pyplot as plt
import numpy as np


def f(x, y):
    """Evaluate the objective ``x**2 / 20 + y**2`` at ``(x, y)``."""
    x_term = x ** 2 / 20.0
    y_term = y ** 2
    return x_term + y_term


def df(x, y):
    """Return the pair ``(x / 10, 2 * y)``, the partial derivatives of ``f``."""
    grad_x = x / 10.0
    grad_y = 2.0 * y
    return grad_x, grad_y


# Compare the optimizers on f(x, y): run each one for 30 steps from the same
# starting point and draw its trajectory over the contour lines of f.
init_pos = (-7.0, 2.0)
params = {
    "x": init_pos[0],
    "y": init_pos[1],
}
grads = {
    "x": 0,
    "y": 0,
}
optimizers = {
    "SGD": SGD(lr=0.95),
    "Momentum": Momentum(lr=0.1),
    "AdaGrad": AdaGrad(lr=1.5),
    "Adam": Adam(lr=0.3),
}

# The contour background does not depend on the optimizer, so build it once
# instead of recomputing the grid for every subplot.
x = np.arange(-10, 10, 0.01)
y = np.arange(-5, 5, 0.01)

X, Y = np.meshgrid(x, y)
Z = f(X, Y)

# for simple contour line: zero out the region where f exceeds 7
mask = Z > 7
Z[mask] = 0

for idx, (key, optimizer) in enumerate(optimizers.items(), start=1):
    x_history = []
    y_history = []
    # Every optimizer starts from the same initial position.
    params["x"], params["y"] = init_pos

    for _ in range(30):
        # Record the position before the step so the start point is plotted.
        x_history.append(params["x"])
        y_history.append(params["y"])

        grads["x"], grads["y"] = df(params["x"], params["y"])
        optimizer.update(params, grads)

    # plot: one panel of a 2x2 grid per optimizer
    plt.subplot(2, 2, idx)
    plt.plot(x_history, y_history, "o-", color="red")
    plt.contour(X, Y, Z)
    plt.ylim(-10, 10)
    plt.xlim(-10, 10)
    plt.plot(0, 0, "+")  # marks the origin, where f is smallest
    plt.title(key)
    plt.xlabel("x")
    plt.ylabel("y")

# Spacing applies to the whole figure, so set it once after all panels exist.
plt.subplots_adjust(wspace=0.4, hspace=0.6)

plt.show()