# Exercise 6 Solution - Adam
### Task
Implement the Adam optimizer. For help, refer to Algorithm 2. A class structure is provided. After the implementation, compare the Adam optimizer with standard gradient descent.

### Learning goals
- Understand the Adam optimizer
- Experience the difference between Adam and standard gradient descent

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import copy

**define starting position and function to optimize**

In [None]:
# Seed NumPy's global random number generator (no random draws are made in
# this cell itself).
np.random.seed(100)

# Starting position shared by the optimization runs: one single-element
# array per coordinate.
x1, x2 = 2.0, 2.0
params0 = [np.array([x1]), np.array([x2])]


def f(x1, x2):
    """Evaluate the Rosenbrock function 100 * (x2 - x1^2)^2 + (1 - x1)^2.

    Works element-wise on scalars or NumPy arrays.
    """
    return 100 * (x2 - x1**2) ** 2 + (1 - x1) ** 2


def dfdx(x1, x2):
    """Return the gradient of the Rosenbrock function as ``[df/dx1, df/dx2]``.

    Each entry is wrapped in ``np.array`` so the result is always a list of
    two NumPy arrays, also for scalar inputs.
    """
    dfdx1 = 400 * (-x2 + x1**2) * x1 + 2 * (x1 - 1)
    dfdx2 = 200 * (x2 - x1**2)
    return [np.array(dfdx1), np.array(dfdx2)]

**Adam optimizer** 

In [None]:
class AdamOptimizer:
    """Adam optimizer working on a list of parameters and matching gradients.

    For every parameter the optimizer keeps an exponentially decaying average
    of the gradient (first moment ``m``) and of the squared gradient (second
    moment ``n``). Both averages start at zero, so they are bias-corrected
    with the step counter ``t`` before being used to scale the update.

    Args:
        lr: Step size applied to the bias-corrected update direction.
        beta1: Decay rate of the first-moment average.
        beta2: Decay rate of the second-moment average.
        epsilon: Small constant added to the denominator of the update.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # per-parameter first-moment buffers, created on first update
        self.n = None  # per-parameter second-moment buffers, created on first update
        self.t = 0  # number of update steps performed so far

    @staticmethod
    def _zerosFor(param):
        """Return a zero-filled moment buffer shaped like ``param``.

        Floating (or complex) parameters keep their dtype; any other dtype
        (e.g. integers) gets a float64 buffer, because the moments are
        fractional and would otherwise be truncated to zero on assignment.
        """
        arr = np.asarray(param)
        dtype = arr.dtype if np.issubdtype(arr.dtype, np.inexact) else np.float64
        return np.zeros_like(arr, dtype=dtype)

    def updateParams(self, params, grads):
        """Perform one Adam step and return the updated parameters.

        The moment buffers stored on the optimizer are updated in place; the
        objects in ``params`` are not modified.

        Args:
            params: Sequence of parameters (NumPy arrays or scalars).
            grads: Sequence of gradients, one per entry of ``params``.

        Returns:
            A new list holding the updated value of every parameter.

        Raises:
            ValueError: If ``params`` and ``grads`` differ in length, or if
                their length does not match the stored moment buffers.
        """
        params = list(params)
        grads = list(grads)
        if len(params) != len(grads):
            raise ValueError(
                f"got {len(params)} parameters but {len(grads)} gradients"
            )

        if self.m is None:
            self.m = [self._zerosFor(param) for param in params]
        if self.n is None:
            self.n = [self._zerosFor(param) for param in params]
        if len(self.m) != len(params) or len(self.n) != len(params):
            raise ValueError(
                "number of parameters does not match the stored moment buffers"
            )

        self.t += 1  # exponent of the bias correction grows with every step

        # The bias-correction denominators depend only on the step counter,
        # so compute them once instead of once per parameter.
        mCorrection = 1 - self.beta1**self.t
        nCorrection = 1 - self.beta2**self.t

        updatedParams = []
        for p, g, m, n in zip(params, grads, self.m, self.n):
            g = np.asarray(g)
            # ``[...]`` writes in place like ``[:]`` but also works for 0-d
            # buffers, which is what scalar parameters produce.
            m[...] = self.beta1 * m + (1 - self.beta1) * g
            n[...] = self.beta2 * n + (1 - self.beta2) * (g**2)

            mhat = m / mCorrection
            nhat = n / nCorrection

            updatedParams.append(p - self.lr * mhat / (np.sqrt(nhat) + self.epsilon))

        return updatedParams

**optimization with gradient descent**

In [None]:
lr = 1e-3  # gradient descent step size
epochs = 1000  # number of update steps

# Work on a copy: the in-place updates below would otherwise modify params0.
params = copy.deepcopy(params0)

# Row 0 stores the x1 coordinate and row 1 the x2 coordinate at every epoch.
optimizationPathGD = np.zeros((2, epochs))
for epoch in range(epochs):
    cost = f(params[0], params[1]).item()
    # Store plain scalars: writing a one-element array into a single cell
    # relies on the implicit array-to-scalar conversion that NumPy deprecated
    # for arrays with ndim > 0.
    optimizationPathGD[0, epoch] = params[0].item()
    optimizationPathGD[1, epoch] = params[1].item()
    grad = dfdx(params[0], params[1])

    # Plain gradient descent step.
    params[0] -= lr * grad[0]
    params[1] -= lr * grad[1]

    if epoch % 100 == 0:
        print(f"Epoch: {epoch}/{epochs}\t\tCost = {cost:.2e}")

**optimization with Adam**

In [None]:
lr = 5e-1  # Adam step size
epochs = 1000  # number of update steps
optimizer = AdamOptimizer(lr=lr)

# Start from a fresh copy of the shared starting position.
params = copy.deepcopy(params0)

# Row 0 stores the x1 coordinate and row 1 the x2 coordinate at every epoch.
optimizationPathAdam = np.zeros((2, epochs))
for epoch in range(epochs):
    cost = f(params[0], params[1]).item()
    # Store plain scalars: writing a one-element array into a single cell
    # relies on the implicit array-to-scalar conversion that NumPy deprecated
    # for arrays with ndim > 0.
    optimizationPathAdam[0, epoch] = params[0].item()
    optimizationPathAdam[1, epoch] = params[1].item()
    grad = dfdx(params[0], params[1])

    # The optimizer returns new parameter values instead of updating in place.
    params = optimizer.updateParams(params, grad)

    if epoch % 100 == 0:
        print(f"Epoch: {epoch}/{epochs}\t\tCost = {cost:.2e}")

**visualize the optimization path**

In [None]:
# Evaluation grid for the background: 200 x 200 points on [-1, 3] x [-1, 3].
x1_, x2_ = np.meshgrid(
    np.linspace(-1, 3, 200),
    np.linspace(-1, 3, 200),
    indexing="ij",
)

fig, ax = plt.subplots(figsize=(12,8))

# Function values on a logarithmic color scale.
cp = ax.pcolormesh(
    x1_,
    x2_,
    f(x1_, x2_),
    cmap=plt.cm.jet,
    norm=colors.LogNorm(),
    shading='auto',
)

# Each path is drawn twice: once as a labelled line and once as point markers.
for path, color, marker, label in (
    (optimizationPathGD, 'k', 'o', "gradient descent"),
    (optimizationPathAdam, "r", "s", "Adam"),
):
    ax.plot(path[0], path[1], color, label=label)
    ax.plot(path[0], path[1], color + marker)

# Mark the point (1, 1), labelled as the minimum.
ax.plot([1], [1], "bs", markersize=12, label="minimum")

fig.colorbar(cp)
ax.legend()
plt.show()