首先引入 build cnn (step 1) 中编写的代码

In [1]:
# 导入必需的库

import numpy as np
import h5py
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) 
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(1)

In [5]:
# 零填充
def zero_pad(X, pad):
    return np.pad(X, ((0, 0), (pad, pad), (pad, pad), (0, 0)), 'constant', constant_values=0)

In [6]:
# 单步卷积
def conv_single_step(a_slice_prev, W, b):
    s = np.multiply(a_slice_prev, W) + b
    return np.sum(s)

In [7]:
# 卷积网络的前向传播
def conv_forward(A_prev, W, b, hparameters):
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape

    stride = hparameters['stride']  
    pad = hparameters['pad']       

    n_H = int((n_H_prev - f + 2 * pad) / stride) + 1
    n_W = int((n_W_prev - f + 2 * pad) / stride) + 1

    Z = np.zeros((m, n_H, n_W, n_C))

    A_prev_pad = zero_pad(A_prev, pad)

    for i in range(m):
        a_prev_pad = A_prev_pad[i]   
        for h in range(n_H):            
            for w in range(n_W):   
                vert_start = h * stride
                vert_end = vert_start + f
                horiz_start = w * stride
                horiz_end = horiz_start + f
                a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end]
                for c in range(n_C):   
                    Z[i, h, w, c] = conv_single_step(a_slice_prev, W[..., c], b [..., c])
    
    assert(Z.shape == (m, n_H, n_W, n_C))

    cache = (A_prev, W, b, hparameters)

    return Z, cache

In [12]:
# unit test

np.random.seed(1)
A_prev = np.random.randn(10, 4, 4, 3)
W = np.random.randn(2, 2, 3, 8)
b = np.random.randn(1, 1, 1, 8)
hparameters = {"pad" : 2,
               "stride": 1}

Z, cache_conv = conv_forward(A_prev, W, b, hparameters)
print("Z's mean =", np.mean(Z))
print("cache_conv[0][1][2][3] =", cache_conv[0][1][2][3])

Z's mean = 0.15585932488906465
cache_conv[0][1][2][3] = [-0.20075807  0.18656139  0.41005165]


# 池化层

池化层可以将矩阵的尺寸变小，降低神经网络的计算量，让网络的预测鲁棒性更好。

两种池化层：

1. 最大池化层 (Max-pooling layer)：取输入矩阵的子矩阵中最大值元素作为输出矩阵的一个元素
2. 平均池化层 (Average-pooling layer)：取输入矩阵的子矩阵中所有元素的平均值作为输出矩阵的一个元素。

`子矩阵` 在这里也叫做 `窗口`。

<table>
<td>
<img src="images/max_pool1.png" style="width:500px;height:230px;">
<td>

<td>
<img src="images/a_pool.png" style="width:500px;height:230px;">
<td>
</table>

池化层没有参数，因为它的过滤器是虚拟的不存在，但有超参数，如窗口大小 f，步长 s。

## 池化的前向传播

计算公式如下：

$$ n_H = \lfloor \frac{n_{H_{prev}} - f}{stride} \rfloor +1 $$
$$ n_W = \lfloor \frac{n_{W_{prev}} - f}{stride} \rfloor +1 $$
$$ n_C = n_{C_{prev}}$$

In [8]:
def pool_forward(A_prev, hparameters, mode = "max"):
    '''
    参数：
    A_prev -- 输入矩阵，也就是上一层的输出矩阵，维度是 (m, n_H_prev, n_W_prev, n_C_prev)
    hparameters -- 超参数，窗口大小 f 和步长 s
    mode -- 池化模式，最大池传入 'max'，平均池传入 'average'

    返回值：
    A -- 池化层的输出参数，维度是 (m, n_H, n_W, n_C)
    cache -- 缓存一些数据
    '''

    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

    f = hparameters['f']                # 窗口大小
    stride = hparameters['stride']      # 步长

    # 计算输出矩阵的大小
    n_H = int((n_H_prev - f) / stride) + 1
    n_W = int((n_W_prev - f) / stride) + 1
    n_C = n_C_prev

    # 初始化输出矩阵
    A = np.zeros((m, n_H, n_W, n_C))

    for i in range(m):                  # 遍历所有样本
        for h in range(n_H):            # 遍历输出矩阵的高
            for w in range(n_W):        # 遍历输出矩阵的宽
                # 计算本次池化区域的索引
                vert_start = h * stride
                vert_end = vert_start + f
                horiz_start = w * stride
                horiz_end = horiz_start + f
                for c in range(n_C):    # 遍历输出矩阵的深度
                    # 取出将池化的子矩阵窗口
                    a_prev_slice = A_prev[i, vert_start:vert_end, horiz_start:horiz_end, c]

                    # 执行池化
                    if mode == 'max':
                        A[i, h, w, c] = np.max(a_prev_slice)
                    elif mode == 'average':
                        A[i, h, w, c] = np.average(a_prev_slice)

    assert(A.shape == (m, n_H, n_W, n_C))

    cache = (A_prev, hparameters)

    return A, cache

In [9]:
# 单元测试
np.random.seed(1)
A_prev = np.random.randn(2, 4, 4, 3)
hparameters = {"stride" : 1, "f": 4}

A, cache = pool_forward(A_prev, hparameters)
print("mode = max")
print("A =", A)
print()
A, cache = pool_forward(A_prev, hparameters, mode = "average")
print("mode = average")
print("A =", A)

mode = max
A = [[[[1.74481176 1.6924546  2.10025514]]]


 [[[1.19891788 1.51981682 2.18557541]]]]

mode = average
A = [[[[-0.09498456  0.11180064 -0.14263511]]]


 [[[-0.09525108  0.28325018  0.33035185]]]]


# CNN 的反向传播

CNN 的反向传播比较复杂，了解即可

## 卷积层的反向传播

首先我们来学习一下如何实现卷积层的反向传播。

### 1. 计算dA:
下面的公式被用来计算某个样本的某个过滤器的dA:

$$ dA += \sum _{h=0} ^{n_H} \sum_{w=0} ^{n_W} W_c \times dZ_{hw} \tag{1}$$

$W_c$是表示这个过滤器。

这个公式对应的python代码如下：
```python
da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
```

### 2. 计算dW:
下面的公式被用来计算某个过滤器的dW:

$$ dW_c  += \sum _{h=0} ^{n_H} \sum_{w=0} ^ {n_W} a_{slice} \times dZ_{hw}  \tag{2}$$

$a_{slice}$表示输入矩阵中的被卷积的子矩阵。

上面的公式对应于下面的python代码:
```python
dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
```

### 3. 计算db:

这个公式用来计算某个过滤器的db:

$$ db = \sum_h \sum_w dZ_{hw} \tag{3}$$

上面的公式对应于下面的python代码:
```python
db[:,:,:,c] += dZ[i, h, w, c]

In [10]:
def conv_backward(dZ, cache):
    '''
    参数：
    dZ -- 后一层相关的dZ，维度是(m, n_H, n_W, n_C)
    cache -- 前面的conv_forward()函数保存下来的缓存数据
    
    Returns:
    dA_prev -- 本卷积层输入矩阵的dA，维度是(m, n_H_prev, n_W_prev, n_C_prev)
    dW -- 本卷积层相关的dW,维度是(f, f, n_C_prev, n_C)
    db -- 本卷积层相关的db,维度是(1, 1, 1, n_C)
    '''

    (A_prev, W, b, hparameters) = cache
    
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    
    (f, f, n_C_prev, n_C) = W.shape
    
    stride = hparameters["stride"] # 步长
    pad = hparameters["pad"] # padding数量
    
    (m, n_H, n_W, n_C) = dZ.shape
    
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))                           
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = zero_pad(dA_prev, pad)
    
    for i in range(m):                       # 遍历每一个样本
        
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]
        
        for h in range(n_H):                   # 遍历输出矩阵的高
            for w in range(n_W):               # 遍历输出矩阵的宽
                for c in range(n_C):           # 遍历输出矩阵的深度
                    
                    # 计算输入矩阵中的子矩阵的索引
                    vert_start = h
                    vert_end = vert_start + f
                    horiz_start = w
                    horiz_end = horiz_start + f
                    
                    # 取出当前进行卷积的子矩阵
                    a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                    # 用上面的公式来计算偏导数
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
                    dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
                    db[:,:,:,c] += dZ[i, h, w, c]
                    
        dA_prev[i, :, :, :] = da_prev_pad[pad:-pad, pad:-pad, :]

    assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))
    
    return dA_prev, dW, db

In [13]:
np.random.seed(1)
dA, dW, db = conv_backward(Z, cache_conv)
print("dA_mean =", np.mean(dA))
print("dW_mean =", np.mean(dW))
print("db_mean =", np.mean(db))
# print(dA.shape)

dA_mean = 9.608990675868995
dW_mean = 10.581741275547563
db_mean = 76.37106919563735


## 池化层的反向传播

### 1. 最大池化的反向传播

在实现池化层之前，我们需要先实现两个工具函数，第一个是 `create_mask_from_window()`，它可以根据输入矩阵得到一个特殊的输出矩阵，这个输出矩阵中只有最大值处是 1，其余都是零。如下所示，X 是输入矩阵，M 是函数的输出矩阵: 

$$ X = \begin{bmatrix}
1 && 3 \\
4 && 2
\end{bmatrix} \quad \rightarrow  \quad M =\begin{bmatrix}
0 && 0 \\
1 && 0
\end{bmatrix}\tag{4}$$

提示:
- [np.max()]() 会返回矩阵中的最大元素。
- python语法 `A = (X == x)` 会生成一个矩阵 A，这个 A 与 X 的维度是一样的，A 里面其它元素都为 0，只有与小 x 的值相同的位置处为 1，也就是为 True。python 中 0 等于 False，1 等于 True:
```
A[i,j] = True if X[i,j] = x
A[i,j] = False if X[i,j] != x

In [14]:
def create_mask_from_window(x):

    # x是一个矩阵。np.max(x)会得到最大元素。
    # mask是一个与x维度相同的矩阵，里面其余元素都为0，只有x最大值元素的位置处为1
    mask = x == np.max(x)
    
    return mask

In [15]:
np.random.seed(1)
x = np.random.randn(2,3)
mask = create_mask_from_window(x)
print('x = ', x)
print("mask = ", mask)

x =  [[ 1.62434536 -0.61175641 -0.52817175]
 [-1.07296862  0.86540763 -2.3015387 ]]
mask =  [[ True False False]
 [False False False]]


### 2. 平均池化的反向传播

为了实现最大池化的反向传播，我们需要实现如下的工具函数 `distribute_value`。就是把一个数值平分成一个矩阵，例如把 1 平分成四分之一到一个矩阵中: 
$$ dZ = 1 \quad \rightarrow  \quad dZ =\begin{bmatrix}
1/4 && 1/4 \\
1/4 && 1/4
\end{bmatrix}\tag{5}$$

In [16]:
def distribute_value(dz, shape):
    """    
    参数:
    dz -- 一个数值
    shape -- 输出矩阵的维度
    
    返回值:
    a -- a的维度就是shape，里面的值是又dz平分而来的
    """
    (n_H, n_W) = shape
    
    # 计算平均值
    average = dz / (n_H * n_W)
    
    # 构建输出矩阵
    a = np.ones(shape) * average
    
    return a

In [17]:
a = distribute_value(2, (2,2))
print('distributed value =', a)

distributed value = [[0.5 0.5]
 [0.5 0.5]]


In [18]:
def pool_backward(dA, cache, mode = "max"):
    """
    池化层的反向传播

    参数:
    dA -- 本池化层的输出矩阵对应的偏导数
    cache -- 前向传播时缓存起来的数值
    mode -- 是最大池化还是平均池化，("max" 或 "average")
    
    Returns:
    dA_prev -- 本池化层的输入矩阵对应的偏导数
    """

    # A_prev是本池化层的输入矩阵
    (A_prev, hparameters) = cache
    
    stride = hparameters["stride"]
    f = hparameters["f"]
    
    m, n_H_prev, n_W_prev, n_C_prev = A_prev.shape
    m, n_H, n_W, n_C = dA.shape
    
    dA_prev = np.zeros(A_prev.shape)
    
    for i in range(m):                     
        a_prev = A_prev[i]
        for h in range(n_H):                  
            for w in range(n_W):             
                for c in range(n_C):          
                    vert_start = h
                    vert_end = vert_start + f
                    horiz_start = w
                    horiz_end = horiz_start + f
                    
                    if mode == "max":
                        a_prev_slice = a_prev[vert_start:vert_end, horiz_start:horiz_end, c]
                        mask = create_mask_from_window(a_prev_slice)
                        dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c] += np.multiply(mask, dA[i, h, w, c])
                        
                    elif mode == "average":
                        da = dA[i, h, w, c]
                        shape = (f, f)
                        dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c] += distribute_value(da, shape)
                        
    assert(dA_prev.shape == A_prev.shape)
    
    return dA_prev

In [19]:
# 单元测试

np.random.seed(1)
A_prev = np.random.randn(5, 5, 3, 2)
hparameters = {"stride" : 1, "f": 2}
A, cache = pool_forward(A_prev, hparameters)
dA = np.random.randn(5, 4, 2, 2)

dA_prev = pool_backward(dA, cache, mode = "max")
print("mode = max")
print('mean of dA = ', np.mean(dA))
print('dA_prev[1,1] = ', dA_prev[1,1])  
print()
dA_prev = pool_backward(dA, cache, mode = "average")
print("mode = average")
print('mean of dA = ', np.mean(dA))
print('dA_prev[1,1] = ', dA_prev[1,1]) 

mode = max
mean of dA =  0.14571390272918056
dA_prev[1,1] =  [[ 0.          0.        ]
 [ 5.05844394 -1.68282702]
 [ 0.          0.        ]]

mode = average
mean of dA =  0.14571390272918056
dA_prev[1,1] =  [[ 0.08485462  0.2787552 ]
 [ 1.26461098 -0.25749373]
 [ 1.17975636 -0.53624893]]
