In [1]:
# 导入必需的库
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

from opt_utils import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation
from opt_utils import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset
from testCases import *

# Configure plotting defaults for the notebook
%matplotlib inline
# Default figure size passed to matplotlib, as (width, height)
plt.rcParams['figure.figsize'] = (7.0, 4.0)
# Default interpolation and colormap used when displaying images
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'


  assert(parameters['W' + str(l)].shape == layer_dims[l], layer_dims[l-1])
  assert(parameters['W' + str(l)].shape == layer_dims[l], 1)


# Adam: `adaptive moment estimation`

Adam 是 RMSprop 和动量梯度下降两种算法的结合。

## 主要步骤

1. 根据前面的梯度值计算指数加权平均值，存在变量 v 中，对其进行修正，存储在 $v^{corrected}$ 中
2. 根据前面的梯度值的平方计算指数加权平均值，存在变量 s 中，对其进行修正，存储在 $s^{corrected}$ 中
3. 根据 $v^{corrected}$ 和 $s^{corrected}$ 来更新参数

## 涉及公式如下

$$\begin{cases}
v_{dW^{[l]}} = \beta_1 v_{dW^{[l]}} + (1 - \beta_1) \frac{\partial \mathcal{J} }{ \partial W^{[l]} } \\
v^{corrected}_{dW^{[l]}} = \frac{v_{dW^{[l]}}}{1 - (\beta_1)^t} \\
s_{dW^{[l]}} = \beta_2 s_{dW^{[l]}} + (1 - \beta_2) (\frac{\partial \mathcal{J} }{\partial W^{[l]} })^2 \\
s^{corrected}_{dW^{[l]}} = \frac{s_{dW^{[l]}}}{1 - (\beta_2)^t} \\
W^{[l]} = W^{[l]} - \alpha \frac{v^{corrected}_{dW^{[l]}}}{\sqrt{s^{corrected}_{dW^{[l]}}} + \varepsilon}
\end{cases}$$

## 变量解释

1. t 表示 Adam 已执行的参数更新步数（从 1 开始计数），用于偏差修正
2. L 表示神经网络的层数
3. $\beta_1$ 和 $\beta_2$ 是控制指数加权平均值的超参数
4. $\alpha$ 是学习率
5. $\varepsilon$ 用来防止除 0

In [4]:
# Initialize the exponentially weighted averages used by Adam

def initialize_adam(parameters):
    """Create the zero-filled moment estimates that Adam tracks.

    Args:
        parameters: dict indexed by the keys 'W1', 'b1', ..., 'WL', 'bL',
            where L is ``len(parameters) // 2``.

    Returns:
        A tuple ``(v, s)`` of dicts keyed by 'dW1', 'db1', ..., 'dWL', 'dbL'.
        Each entry is a zero array created with ``np.zeros_like`` from the
        matching parameter. ``v`` and ``s`` hold separate arrays.
    """
    num_layers = len(parameters) // 2
    v, s = {}, {}

    for layer in range(1, num_layers + 1):
        # Weights first, then biases, for every layer.
        for kind in ('W', 'b'):
            param = parameters[f'{kind}{layer}']
            moment_key = f'd{kind}{layer}'
            v[moment_key] = np.zeros_like(param)
            s[moment_key] = np.zeros_like(param)

    return v, s

In [6]:
# Update the parameters with Adam

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999,
                                epsilon=1e-8):
    """Apply one Adam update step to every layer's weights and biases.

    For each parameter the function updates the first-moment estimate ``v``
    and second-moment estimate ``s``, applies bias correction, and then moves
    the parameter by
    ``learning_rate * v_corrected / (sqrt(s_corrected) + epsilon)``.

    Args:
        parameters: dict indexed by 'W1', 'b1', ..., 'WL', 'bL' where L is
            ``len(parameters) // 2``.
        grads: dict indexed by 'dW1', 'db1', ..., holding the gradient of
            each parameter.
        v: dict (same keys as ``grads``) with the exponentially weighted
            average of past gradients.
        s: dict (same keys as ``grads``) with the exponentially weighted
            average of past squared gradients.
        t: Adam step counter used for bias correction; must be >= 1.
        learning_rate: step size.
        beta1: decay rate for ``v``.
        beta2: decay rate for ``s``.
        epsilon: small constant added to the denominator to avoid division
            by zero.

    Returns:
        The tuple ``(parameters, v, s)``. These are the same dict objects
        that were passed in, with their entries replaced by updated values.

    Raises:
        ValueError: if ``t`` is smaller than 1, because the bias-correction
            denominators ``1 - beta ** t`` would be zero.
    """
    if t < 1:
        raise ValueError("t must be >= 1 for Adam bias correction, got {}".format(t))

    num_layers = len(parameters) // 2

    # Bias-correction denominators depend only on t, so compute them once.
    v_correction = 1 - np.power(beta1, t)
    s_correction = 1 - np.power(beta2, t)

    for layer in range(1, num_layers + 1):
        for kind in ('W', 'b'):
            param_key = kind + str(layer)
            grad_key = 'd' + param_key
            grad = grads[grad_key]

            # Exponentially weighted averages of the gradient and its square.
            v[grad_key] = beta1 * v[grad_key] + (1 - beta1) * grad
            s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * np.power(grad, 2)

            # Bias-corrected estimates.
            v_corrected = v[grad_key] / v_correction
            s_corrected = s[grad_key] / s_correction

            # epsilon is added AFTER the square root, as in the Adam formula;
            # placing it inside the root changes the effective step size.
            parameters[param_key] = parameters[param_key] - learning_rate * v_corrected / (
                np.sqrt(s_corrected) + epsilon)

    return parameters, v, s
