Paper: Kingma & Ba, "Adam: A Method for Stochastic Optimization" (2014) — https://arxiv.org/abs/1412.6980

In [1]:
import numpy as np

In [None]:
def gradient_descent(f, df, initial_point, learning_rate, num_iterations, threshold, verbose=True):
  """Minimise a function with fixed-learning-rate gradient descent.

  Args:
    f: Objective function. Kept in the signature so it matches
      adam_optimizer; it is never evaluated here.
    df: Callable returning the gradient at a point. It may return an
      ndarray or any array-like (list, tuple).
    initial_point: Starting coordinates (array-like). It is copied into a
      float array, so the caller's object is never mutated.
    learning_rate: Factor multiplied with the gradient to form each step.
    num_iterations: Maximum number of gradient evaluations.
    threshold: Stop as soon as np.linalg.norm(step) drops below this value.
    verbose: When True (the default), print the point after every update.

  Returns:
    numpy.ndarray of floats holding the last point reached.
  """
  point = np.array(initial_point, dtype=float)
  for i in range(num_iterations):
    # Coerce the gradient so callbacks returning plain lists/tuples work;
    # `float * list` would otherwise raise TypeError.
    gradient = np.asarray(df(point))
    step = learning_rate * gradient

    # The step that falls below the threshold is discarded, not applied.
    if np.linalg.norm(step) < threshold:
      break

    point -= step
    if verbose:
      print("Iteration ", i, ": ", point)

  return point

In [None]:
def adam_optimizer(f, df, initial_point, learning_rate, num_iterations, threshold, beta1=0.9, beta2=0.999, epsilon=1e-8, verbose=True):
    """Minimise a function with the Adam update rule.

    Adam keeps exponential moving averages of the gradient (first moment)
    and of the squared gradient (second moment), corrects both for their
    zero initialisation, and scales each coordinate of the step by the
    square root of the corrected second moment.

    Args:
        f: Objective function. Kept in the signature so it matches
            gradient_descent; it is never evaluated here.
        df: Callable returning the gradient at a point. It may return an
            ndarray or any array-like (list, tuple).
        initial_point: Starting coordinates (array-like). It is copied into
            a float array, so the caller's object is never mutated.
        learning_rate: Step size multiplier.
        num_iterations: Maximum number of gradient evaluations.
        threshold: Stop as soon as np.linalg.norm(step) drops below this.
        beta1: Decay rate of the first-moment average.
        beta2: Decay rate of the second-moment average.
        epsilon: Small constant added to the denominator to avoid division
            by zero.
        verbose: When True (the default), print the point after every update.

    Returns:
        numpy.ndarray of floats holding the last point reached.
    """
    point = np.array(initial_point, dtype=float)
    m = np.zeros_like(point)  # First moment vector
    v = np.zeros_like(point)  # Second moment vector

    # Iterations are counted from 1 because the bias correction divides by
    # (1 - beta**i), which would be zero for i == 0.
    for i in range(1, num_iterations + 1):
        # Coerce the gradient so callbacks returning plain lists/tuples work;
        # `float * list` and `list ** 2` would otherwise raise TypeError.
        gradient = np.asarray(df(point))

        # Update biased first and second moment estimates
        m = beta1 * m + (1 - beta1) * gradient
        v = beta2 * v + (1 - beta2) * (gradient**2)

        # Correct bias in first and second moment estimates
        m_hat = m / (1 - beta1**i)
        v_hat = v / (1 - beta2**i)

        step = learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

        # The step that falls below the threshold is discarded, not applied.
        if np.linalg.norm(step) < threshold:
            break

        point -= step
        if verbose:
            print("Iteration ", i, ": ", point)

    return point

In [None]:
# apparently Himmelblau is quite famous
def f_himmelblau(point):
    """Evaluate Himmelblau's function at a 2-element point (x, y).

    f(x, y) = (x^2 + y - 11)^2 + (x + y^2 - 7)^2
    """
    x, y = point
    first_residual = x**2 + y - 11
    second_residual = x + y**2 - 7
    return first_residual**2 + second_residual**2

def df_himmelblau(point):
    """Return the gradient [df/dx, df/dy] of Himmelblau's function at (x, y)."""
    x, y = point
    # The two inner terms of f; each partial derivative is built from both.
    first_residual = x**2 + y - 11
    second_residual = x + y**2 - 7
    partial_x = 4*x*first_residual + 2*second_residual
    partial_y = 2*first_residual + 4*y*second_residual
    return np.array([partial_x, partial_y])

In [5]:
# Shared settings for the optimizer runs in the cells below.
initial_point_2d = [-3, -3]  # starting (x, y) handed to both optimizers
learning_rate = 0.01  # gradient-descent step multiplier; the Adam run passes 10x this value
num_iterations = 100  # upper bound on iterations per run
threshold = 1e-5  # a run stops once the norm of its step falls below this

In [6]:
# Plain gradient descent on Himmelblau's function using the shared settings.
min_point_2d = gradient_descent(
    f_himmelblau,
    df_himmelblau,
    initial_point_2d,
    learning_rate,
    num_iterations,
    threshold,
)
print(f"The minimum of the 2D function is at (x, y) = ({min_point_2d[0]:.4f}, {min_point_2d[1]:.4f})")

Iteration  0 :  [-3.58 -3.02]
Iteration  1 :  [-3.72316352 -3.17224768]
Iteration  2 :  [-3.75617542 -3.24979003]
Iteration  3 :  [-3.77344987 -3.27232489]
Iteration  4 :  [-3.77718455 -3.28020938]
Iteration  5 :  [-3.7788135  -3.28223212]
Iteration  6 :  [-3.77912151 -3.2829331 ]
Iteration  7 :  [-3.7792695  -3.28310289]
Iteration  8 :  [-3.77929341 -3.2831647 ]
Iteration  9 :  [-3.77930698 -3.28317873]
The minimum of the 2D function is at (x, y) = (-3.7793, -3.2832)


In [None]:
# Adam is run with 10x the learning rate that was used for plain gradient descent.
min_point_adam = adam_optimizer(
    f_himmelblau,
    df_himmelblau,
    initial_point_2d,
    learning_rate * 10,
    num_iterations,
    threshold,
)
print(f"The minimum of the 2D function using Adam is at (x, y) = ({min_point_adam[0]:.4f}, {min_point_adam[1]:.4f})")

Iteration  1 :  [-3.1 -3.1]
Iteration  2 :  [-3.19980428 -3.07669764]
Iteration  3 :  [-3.29880587 -3.08279707]
Iteration  4 :  [-3.39640502 -3.12543436]
Iteration  5 :  [-3.49195639 -3.18272612]
Iteration  6 :  [-3.58461494 -3.23753581]
Iteration  7 :  [-3.67319003 -3.27168564]
Iteration  8 :  [-3.75594405 -3.28474024]
Iteration  9 :  [-3.83057656 -3.2900635 ]
Iteration  10 :  [-3.89455701 -3.30121889]
Iteration  11 :  [-3.94573905 -3.32324829]
Iteration  12 :  [-3.98292711 -3.35128582]
Iteration  13 :  [-4.00605133 -3.37435618]
Iteration  14 :  [-4.01593115 -3.38201297]
Iteration  15 :  [-4.01389501 -3.3737519 ]
Iteration  16 :  [-4.00153271 -3.35654421]
Iteration  17 :  [-3.98057886 -3.33931267]
Iteration  18 :  [-3.95286185 -3.32865139]
Iteration  19 :  [-3.92027412 -3.32506951]
Iteration  20 :  [-3.88472156 -3.3237125 ]
Iteration  21 :  [-3.84804236 -3.31823214]
Iteration  22 :  [-3.81191084 -3.30526945]
Iteration  23 :  [-3.77776286 -3.2864665 ]
Iteration  24 :  [-3.74677143 -3.2