# Mini-Batch 梯度下降

In [1]:
# 加载必需的库

import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

from opt_utils import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation
from opt_utils import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset
from testCases import *

%matplotlib inline
# Notebook-wide plotting defaults: figure size, no interpolation between
# image pixels, and a grayscale colormap for images.
plt.rcParams.update({
    'figure.figsize': (7.0, 4.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})


  assert(parameters['W' + str(l)].shape == layer_dims[l], layer_dims[l-1])
  assert(parameters['W' + str(l)].shape == layer_dims[l], 1)


In [2]:
# 梯度下降中，用于更新参数的函数

def update_parameters_with_gd(parameters, grads, learning_rate):
    """Apply one plain gradient-descent step to every layer.

    Args:
        parameters: dict holding "W1", "b1", ..., "WL", "bL". The number of
            layers L is taken to be ``len(parameters) // 2``.
        grads: dict holding the matching gradients "dW1", "db1", ..., "dWL",
            "dbL".
        learning_rate: step size that scales each gradient.

    Returns:
        The same ``parameters`` dict, whose entries have been rebound to the
        updated values (``value - learning_rate * gradient``).
    """
    # Each layer contributes exactly two entries (W and b).
    num_layers = len(parameters) // 2

    # Layers are numbered from 1 in the dictionary keys.
    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]

    return parameters

# batch 梯度下降和随机梯度下降的差别

- **batch梯度下降**:

```python
X = data_input
Y = labels
parameters = initialize_parameters(layers_dims)
for i in range(0, num_iterations):
    a, caches = forward_propagation(X, parameters)
    cost = compute_cost(a, Y)
    grads = backward_propagation(a, caches, parameters)
    parameters = update_parameters(parameters, grads)
        
```

- **随机梯度下降**:

```python
X = data_input
Y = labels
parameters = initialize_parameters(layers_dims)
for i in range(0, num_iterations):
    for j in range(0, m): # 遍历循环每一个样本
        a, caches = forward_propagation(X[:,j], parameters)
        cost = compute_cost(a, Y[:,j])
        grads = backward_propagation(a, caches, parameters)
        parameters = update_parameters(parameters, grads)
```

左图是随机梯度下降，右图是batch梯度下降

![](./images/kiank_sgd.png)

左图是随机梯度下降，右图是mini-batch梯度下降

![](./images/kiank_minibatch.png)

注意

- 这3种梯度下降的区别仅仅在于每次更新参数时所使用的样本数量不同：batch 梯度下降使用全部样本，随机梯度下降只使用 1 个样本，mini-batch 梯度下降使用一小批样本。
- 无论是哪种梯度下降，学习率都是必须要精心调的。
- 通常来说，如果数据集很大，那么mini-batch梯度下降会比另外2种要高效。

# Mini-Batch 梯度下降步骤

## 1. 洗牌

![](./images/kiank_shuffle.png)

## 2. 分割

![](./images/kiank_partition.png)

In [4]:
# 函数实现

def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    """Shuffle the examples of (X, Y) together and split them into mini-batches.

    Examples are stored column-wise: column ``j`` of ``X`` and column ``j`` of
    ``Y`` belong to the same example. The same random column permutation is
    applied to both arrays, so pairs stay aligned.

    Args:
        X: 2-D array of inputs with one column per example; the number of
            examples ``m`` is ``X.shape[1]``.
        Y: 2-D array of labels with one column per example. It may have any
            number of rows (a single row of labels, or several rows such as
            one-hot labels).
        mini_batch_size: positive integer number of examples per mini-batch.
        seed: value passed to ``np.random.seed`` so the shuffle is
            reproducible.

    Returns:
        A list of ``(mini_batch_X, mini_batch_Y)`` tuples in shuffled order.
        Every batch has ``mini_batch_size`` columns except possibly the last,
        which holds the remaining ``m % mini_batch_size`` examples.

    Raises:
        ValueError: if ``mini_batch_size`` is smaller than 1.
    """
    if mini_batch_size < 1:
        raise ValueError(
            "mini_batch_size must be a positive integer, got %r" % (mini_batch_size,)
        )

    # NOTE: this reseeds NumPy's *global* random generator, which is what
    # makes two calls with the same seed return the same batches.
    np.random.seed(seed)

    m = X.shape[1]

    # Step 1: shuffle. `permutation` is a random ordering of 0..m-1
    # (e.g. m=3 may give [2, 0, 1]); it reorders the columns of X and Y
    # identically. Y keeps its own row count, so multi-row labels work.
    permutation = np.random.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]

    # Step 2: partition. Stepping by mini_batch_size yields every full batch
    # and, because slicing clamps at the array end, the shorter final batch
    # when m is not an exact multiple of mini_batch_size.
    return [
        (
            shuffled_X[:, start:start + mini_batch_size],
            shuffled_Y[:, start:start + mini_batch_size],
        )
        for start in range(0, m, mini_batch_size)
    ]

# 重点

- 洗牌和分割是 mini-batch 的两个重要步骤。
- mini-batch 的大小一般选择 2 的幂：16，32，64，128……