# 梯度的定义

由全部变量的偏导数汇总而成的向量称为梯度（gradient）。

# 提前准备的函数

## softmax

In [14]:
#def softmax(a):
#    c = np.max(a)
#    exp_a = np.exp(a - c) # 溢出对策
#    sum_exp_a = np.sum(exp_a)
#    y = exp_a / sum_exp_a
#    return y

def softmax(x):
    """Compute the softmax of ``x`` in a numerically stable way.

    For a 2-D array every row is treated as one sample and normalised on its
    own, so the result has the same shape as the input and each row sums
    to 1. For any other dimensionality the softmax is taken over all
    elements at once.

    Args:
        x: NumPy array of scores.

    Returns:
        Array with the same shape as ``x`` holding the softmax values.
    """
    if x.ndim == 2:
        # Subtract the per-row maximum first so np.exp cannot overflow.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    shifted = x - np.max(x)  # overflow guard
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x)

## mini-batch版交叉熵误差的实现

In [10]:
def cross_entropy_error(y, t):
    """Return the mean cross-entropy error of a mini-batch as a scalar.

    Args:
        y: Predicted probabilities, shape (batch, classes), or (classes,)
            for a single sample.
        t: Targets. Either an array with as many elements as ``y``
            (one-hot or probability targets) or integer class labels,
            one per sample.

    Returns:
        The summed per-sample cross-entropy divided by the batch size.
    """
    if y.ndim == 1:
        # A single sample is handled as a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]

    eps = 1e-7  # keeps np.log away from log(0) = -inf
    if t.size == y.size:
        return -np.sum(t * np.log(y + eps)) / batch_size

    # Integer labels: pick the predicted probability of the correct class.
    labels = t.reshape(batch_size)
    picked = y[np.arange(batch_size), labels]
    return -np.sum(np.log(picked + eps)) / batch_size

# 数值微分法_基于数值微分计算参数的梯度

In [None]:
def numerical_diff(f, x):
    """Approximate the derivative of ``f`` at ``x`` with a central difference.

    Uses a fixed half-width of 1e-4: (f(x + h) - f(x - h)) / (2 * h).
    """
    step = 1e-4
    ahead = f(x + step)
    behind = f(x - step)
    return (ahead - behind) / (2 * step)

In [49]:
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is nudged by +h and -h in turn while all other
    elements keep their value; ``x`` is modified in place during the
    evaluation and restored before the function returns.

    Args:
        f: Callable taking the array ``x`` and returning a scalar.
        x: NumPy float array of any number of dimensions. An integer array
            would silently truncate the +/-h perturbation.

    Returns:
        Array shaped like ``x`` holding the partial derivative for every
        element.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # same shape as x

    # np.ndindex yields a full index tuple for every element, so this also
    # works for 2-D weight matrices (a flat range(x.size) index does not).
    for idx in np.ndindex(x.shape):
        tmp_val = x[idx]

        # f(x + h): only the current element changes
        x[idx] = tmp_val + h
        fxh1 = f(x)

        # f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value

    return grad


In [50]:
#测试
def function_2(x):
    """Return the sum of the squares of the first two entries of ``x``."""
    first, second = x[0], x[1]
    return first**2 + second**2

# Sanity check: the gradient of x0**2 + x1**2 at (3, 4) is (6, 8).
numerical_gradient(function_2, np.array([3.0, 4.0]))


array([6., 8.])

In [51]:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Minimise ``f`` with plain gradient descent.

    Args:
        f: Function to optimise; takes an array and returns a scalar.
        init_x: Starting point. It is copied, so the caller's array is left
            untouched.
        lr: Learning rate, the factor applied to the gradient in each update.
        step_num: Number of update steps.

    Returns:
        The position reached after ``step_num`` updates.
    """
    # Work on a float copy: the in-place update below would otherwise
    # overwrite the array the caller passed in.
    x = np.array(init_x, dtype=float)

    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad

    return x

参数f是要进行最优化的函数，init_x是初始值，lr是学习率learning rate，step_num是梯度法的重复次数。

numerical_gradient(f,x)会求函数的梯度，用该梯度乘以学习率得到的值进行更新操作，由step_num指定重复的次数。

使用这个函数可以求函数的极小值，顺利的话，还可以求函数的最小值。

问题：请用梯度法求${f(x_0,x_1)=x_0^2+x_1^2}$ 的最小值。



In [52]:
def function_2(x):
    """Objective f(x0, x1) = x0**2 + x1**2."""
    x0 = x[0]
    x1 = x[1]
    return x0**2 + x1**2

# Start from (-3, 4) and take 100 steps with learning rate 0.1.
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2,init_x,lr=0.1, step_num=100)

array([-6.11110793e-10,  8.14814391e-10])

## 神经网络的梯度

### 一个简单的神经网络

In [57]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.functions import softmax, cross_entropy_error
from common.gradient import numerical_gradient

class simpleNet:
    """Single-layer network whose only parameter is a 2x3 weight matrix ``W``."""

    def __init__(self):
        # Initial weights are drawn from a standard normal distribution.
        self.W = np.random.randn(2, 3)

    def predict(self, x):
        """Return the raw scores, the dot product of ``x`` and ``W``."""
        scores = np.dot(x, self.W)
        return scores

    def loss(self, x, t):
        """Return the cross-entropy error of softmax(predict(x)) against ``t``."""
        probabilities = softmax(self.predict(x))
        return cross_entropy_error(probabilities, t)
        
net = simpleNet()  # W is drawn at random, so the values differ between runs
net.W  # display the weight matrix

array([[ 0.23210343,  0.80529943,  0.33053336],
       [ 1.15018482,  0.39713218, -1.69789563]])

In [58]:
x = np.array([0.6, 0.9])  # one input sample with two features
p = net.predict(x)  # raw scores: dot product of x and net.W
print(p)

[ 1.1744284   0.84059862 -1.32978605]


In [59]:
np.argmax(p)  # index of the largest score

0

In [60]:
t = np.array([0, 0, 1]) # correct-answer label (one-hot)
net.loss(x,t)  # cross-entropy error of the prediction for x against t

3.090840170292014

In [62]:
def f(W):
    # W is deliberately unused: the loss reads net.W directly.
    return net.loss(x, t)

dW = numerical_gradient(f, net.W)  # The array passed here must be net.W itself, so that changing it changes the value net.loss computes inside f.
# This is the key point.
# f(W) looks as if it ignores W, but the dependence is hidden inside loss():
# numerical_gradient changes each entry of net.W in turn (+h, then -h), so every call to f evaluates the loss with the modified weights and returns a new value,
# which yields the numerical partial derivative for each position of net.W.
print(dW)

[[ 0.33372796  0.23899351 -0.57272147]
 [ 0.50059194  0.35849026 -0.8590822 ]]


### 2层神经网络的类

In [11]:
import sys, os
sys.path.append(os.pardir)
from common.functions import *
from common.gradient import numerical_gradient

class TwoLayerNet:
    """Two-layer fully connected network: sigmoid hidden layer, softmax output.

    The parameters live in ``self.params`` under the keys 'W1', 'b1', 'W2'
    and 'b2'. Gradients are obtained by numerical differentiation.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights: Gaussian noise scaled by weight_init_std. Biases: zeros.
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x, t=None):
        """Return the softmax output for the input batch ``x``.

        ``t`` is not used; it is still accepted (now optional) so that
        existing ``predict(x, t)`` calls keep working.
        """
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        return softmax(a2)

    def loss(self, x, t):
        """Return the cross-entropy error of the prediction for ``x`` against ``t``."""
        y = self.predict(x)
        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose predicted class matches ``t``.

        ``t`` may be one-hot (2-D) or an array of class indices (1-D).
        """
        predicted = np.argmax(self.predict(x), axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        return np.sum(predicted == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return a dict with the numerical gradient of the loss per parameter."""
        # The function to differentiate is the loss. Its argument is ignored:
        # the module-level numerical_gradient perturbs the parameter arrays
        # in place and self.loss reads them from self.params.
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for key in ('W1', 'b1', 'W2', 'b2'):
            grads[key] = numerical_gradient(loss_W, self.params[key])
        return grads

### mini-batch的实现

以TwoLayerNet类为对象，使用MNIST数据集进行学习

In [None]:
#很费时间-不建议运行
from dataset.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label = True)

train_loss_list = []

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    # Draw a mini-batch
    batch_mask = np.random.choice(train_size, batch_size) # pick batch_size random indices from 0 to train_size-1
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # Compute the gradient
    grad = network.numerical_gradient(x_batch, t_batch)
    # grad = network.gradient(x_batch, t_batch) # fast version!
    
    # Update the parameters in place
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    # Record the training progress (loss on the same batch, after the update)
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

In [None]:
#每经过一个epoch会输出当前的训练样本准确率和测试样本准确率
from dataset.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label = True)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []
# Number of iterations that make up one epoch. Integer division keeps the
# `i % iter_per_epoch == 0` check below firing once per epoch even when
# train_size is not a multiple of batch_size (a float quotient would then
# only match at i == 0).
iter_per_epoch = max(train_size // batch_size, 1)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    print(f'序号:{i}')
    # Draw a mini-batch: batch_size random indices from 0 to train_size-1
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradient
    grad = network.numerical_gradient(x_batch, t_batch)
    # grad = network.gradient(x_batch, t_batch) # fast version!

    # Update the parameters in place
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the training progress (loss on the same batch, after the update)
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Measure the accuracy once per epoch
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))


# 误差反向传播法

反向传播的核心是利用链式法则，对每一个参数(${\omega}$)的求导，转化为对组合成复合函数的多个简单函数（乘法、加法）的求导的乘积

## 乘法层的正向和反向传播实现

In [12]:
class MulLayer():
    """Multiplication node with forward and backward propagation."""

    def __init__(self):
        # Inputs of the most recent forward call; backward needs them.
        self.x = self.y = None

    def forward(self, x, y):
        """Remember both inputs and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return (dx, dy): the upstream derivative times the other input."""
        return dout * self.y, dout * self.x

### 以购买苹果为例

苹果的单价为100，

购买个数为2，

消费税为10%(转换成如图的乘法后，则为1.1)

那么

价格=苹果的单价*个数*消费税=220

![图片](images/apple_buying.png)

该图由2个乘法层实现，这里的苹果单价、个数、消费税就是我们需要求导数的变量

用数学表达式来说明

即${y=x_1*x_2*x_3}$

其中:

${x_1}$表示苹果的价格

${x_2}$表示苹果的数量

${x_3}$表示消费税

现在需要做的就是求偏导数：${\frac{\partial y}{\partial x_1},\frac{\partial y}{\partial x_2},\frac{\partial y}{\partial x_3}}$

### 正向传播时记录输入数据，以便在反向传播时应用

In [6]:
apple = 100
apple_num = 2
tax = 1.1

# Layers. The class is spelled MulLayer; "Mullayer" raises NameError.
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# Forward pass: unit price * count, then * tax
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price)

# Backward pass: call backward in the reverse order of the forward calls
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dapple, dapple_num, dtax)

220.00000000000003
2.2 110.00000000000001 200


## 加法层的实现

In [11]:
class AddLayer():
    """Addition node with forward and backward propagation."""

    def __init__(self):
        # Addition needs no cached inputs for its backward pass.
        pass

    def forward(self, x, y):
        """Return the sum of the two inputs."""
        return x + y

    def backward(self, dout):
        """Pass the upstream derivative unchanged to both inputs."""
        return dout * 1, dout * 1

### 以购买苹果和橘子为例子

![图片](images/apple_orange_buying.png)

In [17]:
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# Layers: one multiplication per fruit, one addition, one multiplication for tax
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward pass
apple_price = mul_apple_layer.forward(apple,apple_num)
orange_price = mul_orange_layer.forward(orange,orange_num)
apple_orange_price = add_apple_orange_layer.forward(apple_price,orange_price)
price = mul_tax_layer.forward(apple_orange_price,tax)
print(price)

# Backward pass: layers are visited in the reverse order of the forward pass
dprice = 1
dapple_orange_price,dtax = mul_tax_layer.backward(dprice)
dapple_price,dorange_price = add_apple_orange_layer.backward(dapple_orange_price)
dapple,dapple_num = mul_apple_layer.backward(dapple_price)
dorange,dorange_num = mul_orange_layer.backward(dorange_price)
print(dapple,dapple_num,dorange,dorange_num,dtax)

715.0000000000001
2.2 110.00000000000001 3.3000000000000003 165.0 650


## 执行步骤总结

- 首先，生成必要的层，以合适的顺序调用正向传播的forward()方法。
- 然后，用与正向传播相反的顺序调用反向传播的backward()方法，就可以求出想要的导数

## 激活函数层的实现

### ReLU层

In [4]:
class Relu():
    """ReLU activation layer operating on NumPy arrays."""

    def __init__(self):
        # Boolean array marking the inputs that were <= 0 in the last forward.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element <= 0 replaced by 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return ``dout`` with the positions that were <= 0 in forward zeroed.

        Works on a copy so the caller's ``dout`` array is not modified.
        """
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

### Sigmoid层实现

In [24]:
class Sigmoid():
    """Sigmoid activation layer."""

    def __init__(self):
        # Output of the last forward call, reused by backward.
        self.out = None

    def forward(self, x):
        """Apply ``sigmoid`` to ``x``, cache the result and return it."""
        self.out = sigmoid(x)
        return self.out

    def backward(self, dout):
        """Return dout * (1 - out) * out using the cached forward output."""
        cached = self.out
        return dout * (1.0 - cached) * cached

这个实现中，正向传播时将输出保存在了实例变量out中。然后，反向传播时，使用该变量out进行计算。

## Affine层的实现

神经网络的正向传播中，为了计算加权信号的总和，使用了矩阵的乘积运算（NumPy中是np.dot())

神经元的加权和可以用${Y = np.dot(X, W) + B}$计算出来。然后，Y 经过激活函数转换后，传递给下一层。

注：神经网络的正向传播中进行的矩阵的乘积运算在几何学领域被称为“仿射变换”。因此，这里将进行仿射变换的处理实现为“Affine层”

### 批版本的Affine层

考虑N个数据一起进行正向传播的情况，也就是批版本的Affine层。

In [15]:
class Affine_old():
    """Affine (fully connected) layer for 2-D batches: out = x . W + b."""

    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None  # input of the last forward call
        # Gradients of the weights and the bias, filled in by backward
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out

    def backward(self, dout):
        """Return the gradient with respect to the input.

        The gradients of ``W`` and ``b`` are stored in ``self.dW`` and
        ``self.db`` so that callers can read them after the call.
        """
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)  # the bias is shared by all rows
        return dx

In [19]:
#相比old版本，新增了对张量的处理代码
class Affine:
    """Affine layer that also accepts tensor input.

    The input is flattened to (first dimension, everything else) for the
    matrix product; backward restores the original input shape.
    """

    def __init__(self, W, b):
        self.W, self.b = W, b

        self.x = None
        self.original_x_shape = None
        # Gradients of the weights and the bias, filled in by backward
        self.dW = None
        self.db = None

    def forward(self, x):
        """Return ``np.dot(x_flat, W) + b`` for the flattened input."""
        self.original_x_shape = x.shape
        self.x = x.reshape(x.shape[0], -1)
        return np.dot(self.x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db`` and return ``dx`` in the input's shape."""
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        flat_dx = np.dot(dout, self.W.T)
        # Undo the flattening done in forward.
        return flat_dx.reshape(*self.original_x_shape)

## Softmax-with-Loss 层 

In [17]:
class SoftmaxWithLoss:
    """Softmax followed by cross-entropy error, as one layer."""

    def __init__(self):
        self.loss = None  # loss value of the last forward call
        self.y = None  # softmax output
        self.t = None  # supervision data (targets)

    def forward(self, x, t):
        """Return the cross-entropy error of softmax(x) against ``t``."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the softmax input.

        Args:
            dout: Upstream derivative; the result is scaled by it.

        Returns:
            Array shaped like the softmax output: (y - t) / batch_size * dout.
            ``t`` may be one-hot or an array of class indices.
        """
        # A 1-D output is a single sample, i.e. a batch of one.
        y = self.y.reshape(1, -1) if self.y.ndim == 1 else self.y
        batch_size = y.shape[0]

        if self.t.size == self.y.size:  # supervision data is one-hot
            dx = (y - self.t.reshape(y.shape)) / batch_size
        else:  # supervision data holds class indices
            dx = y.copy()
            dx[np.arange(batch_size), self.t.reshape(batch_size)] -= 1
            dx = dx / batch_size
        return (dx * dout).reshape(self.y.shape)

## 实现两层神经网络

In [1]:
from collections import OrderedDict

class TwoLayerNet:
    """Two-layer network built from layers: Affine -> Relu -> Affine -> SoftmaxWithLoss.

    The parameter arrays in ``self.params`` are the same objects the Affine
    layers hold, so in-place updates of ``self.params[key]`` take effect in
    the layers as well.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights: Gaussian noise with standard deviation weight_init_std
        # (randn, not the uniform rand). Biases: zeros.
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # Build the layers in forward order
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

        self.lastlayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer except the final softmax/loss layer."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        """Return the loss for input data ``x`` and supervision data ``t``."""
        y = self.predict(x)
        return self.lastlayer.forward(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` classified correctly.

        ``t`` may be one-hot (2-D) or an array of class indices (1-D).
        """
        y = self.predict(x)
        y = np.argmax(y, axis=1)  # index of the largest score in each row
        if t.ndim != 1:
            t = np.argmax(t, axis=1)  # index of the correct class in each row
        return np.sum(y == t) / float(x.shape[0])  # denominator: number of samples

    def numerical_gradient(self, x, t):
        """Return the gradients obtained by numerical differentiation.

        Slow; intended for checking the result of ``gradient``.
        """
        # The argument of loss_W is ignored: the module-level
        # numerical_gradient perturbs the parameter arrays in place and the
        # Affine layers read those same arrays.
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for key in ('W1', 'b1', 'W2', 'b2'):
            grads[key] = numerical_gradient(loss_W, self.params[key])
        return grads

    def gradient(self, x, t):
        """Return the gradients obtained by backpropagation."""
        self.loss(x, t)  # forward pass; fills the caches the layers need

        dout = 1
        dout = self.lastlayer.backward(dout)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients the Affine layers stored during backward
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db

        return grads
        

## mini-batch的实现

In [13]:
#import sys, os
#sys.path.append(os.pardir)
import numpy as np
from common.layers import Affine
#from common.gradient import numerical_gradient
from collections import OrderedDict


from dataset.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label = True)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []
# Number of iterations that make up one epoch. Integer division keeps the
# `i % iter_per_epoch == 0` check below firing once per epoch even when
# train_size is not a multiple of batch_size (a float quotient would then
# only match at i == 0).
iter_per_epoch = max(train_size // batch_size, 1)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    # Draw a mini-batch: batch_size random indices from 0 to train_size-1
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradient by backpropagation (the fast version)
    grad = network.gradient(x_batch, t_batch)

    # Update the parameters in place
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the training progress (loss on the same batch, after the update)
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Measure the accuracy once per epoch
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

train acc, test acc | 0.10218333333333333, 0.101
train acc, test acc | 0.8995, 0.9024
train acc, test acc | 0.9169166666666667, 0.9193
train acc, test acc | 0.9314833333333333, 0.9317
train acc, test acc | 0.9431, 0.9423
train acc, test acc | 0.94935, 0.9483
train acc, test acc | 0.9548, 0.9526
train acc, test acc | 0.9568833333333333, 0.9552
train acc, test acc | 0.9597333333333333, 0.9562
train acc, test acc | 0.9644, 0.96
train acc, test acc | 0.9675666666666667, 0.9616
train acc, test acc | 0.9701166666666666, 0.9633
train acc, test acc | 0.9707, 0.9648
train acc, test acc | 0.9724, 0.9654
train acc, test acc | 0.9749166666666667, 0.967
train acc, test acc | 0.9757333333333333, 0.966
train acc, test acc | 0.97635, 0.9677


# 卷积神经网络

卷积神经网络（Convolutional Neural Network，CNN）

## 卷积层的实现

In [None]:
class Convolution:
    """Convolution layer implemented as a matrix product on im2col-expanded data."""

    def __init__(self, W, b, stride=1, pad=0):
        self.W = W
        self.b = b
        self.stride = stride
        self.pad = pad
        
        # Intermediate data cached by forward for use in backward
        self.x = None   
        self.col = None
        self.col_W = None
        
        # Gradients of the weights and the bias
        self.dW = None
        self.db = None

    def forward(self, x):
        """Return the convolution of the 4-D input ``x``, shaped (N, FN, out_h, out_w)."""
        FN, C, FH, FW = self.W.shape # FN: number of filters, C: channels, FH: filter height, FW: filter width
        N, C, H, W = x.shape
        out_h = 1 + int((H + 2*self.pad - FH) / self.stride)
        out_w = 1 + int((W + 2*self.pad - FW) / self.stride)

        col = im2col(x, FH, FW, self.stride, self.pad)  # expands the 4-D (N, C, H, W) input into a 2-D matrix; presumably (N*out_h*out_w, C*FH*FW) - im2col is defined elsewhere, TODO confirm
        col_W = self.W.reshape(FN, -1).T  # flatten each of the FN filters into one row, then transpose to (filter size, FN)

        out = np.dot(col, col_W) + self.b  # one column per filter; the bias is broadcast over the rows
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)  # reshape to (N, out_h, out_w, FN),
                                                                    # then transpose to (N, FN, out_h, out_w)
            
        self.x = x
        self.col = col
        self.col_W = col_W

        return out

    def backward(self, dout):
        """Store ``dW`` and ``db`` and return the value col2im builds from dcol."""
        FN, C, FH, FW = self.W.shape
        dout = dout.transpose(0,2,3,1).reshape(-1, FN)  # bring the upstream gradient into 2-D form with FN columns

        self.db = np.sum(dout, axis=0)   # db: column-wise sum of dout
        self.dW = np.dot(self.col.T, dout)  # dW: matrix product of the cached columns and dout
        self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW)  # back to the original shape of W

        dcol = np.dot(dout, self.col_W.T)   # dcol: matrix product of dout and col_W.T
        dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad)  # presumably converts the 2-D dcol back to the (N, C, H, W) input shape, giving dx - col2im is defined elsewhere, TODO confirm

        return dx

## pooling层的实现

In [None]:
class Pooling:
    """Max-pooling layer implemented on im2col-expanded data."""

    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad

        # Cached by forward for use in backward
        self.x = None
        self.arg_max = None

    def forward(self, x):
        """Return the max-pooled version of the 4-D input, shaped (N, C, out_h, out_w)."""
        N, C, H, W = x.shape
        # The padding is part of the output size, as in Convolution.forward:
        # im2col receives self.pad, so leaving it out here would make the
        # reshape below fail for pad != 0. Unchanged for the default pad=0.
        out_h = int(1 + (H + 2*self.pad - self.pool_h) / self.stride)
        out_w = int(1 + (W + 2*self.pad - self.pool_w) / self.stride)

        col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        # One row per pooling window: (N*out_h*out_w*C, pool_h*pool_w)
        col = col.reshape(-1, self.pool_h*self.pool_w)

        arg_max = np.argmax(col, axis=1)  # position of the maximum in each window, needed by backward
        out = np.max(col, axis=1)  # one value per window
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)  # to (N, C, out_h, out_w)

        self.x = x
        self.arg_max = arg_max

        return out

    def backward(self, dout):
        """Route the 4-D upstream gradient ``dout`` to the maximum of each window."""
        dout = dout.transpose(0, 2, 3, 1)  # to (N, out_h, out_w, C)

        pool_size = self.pool_h * self.pool_w
        dmax = np.zeros((dout.size, pool_size))  # one zero-filled row per pooling window
        # Only the position that held the maximum receives the gradient
        # (its partial derivative is 1); every other position stays 0.
        dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten()
        dmax = dmax.reshape(dout.shape + (pool_size,))  # (N, out_h, out_w, C, pool_size)

        # Collapse to 2-D: (N*out_h*out_w, C*pool_size)
        dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1)
        # col2im is defined elsewhere; presumably it maps dcol back to the
        # (N, C, H, W) input shape - TODO confirm.
        dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad)

        return dx

In [None]:
class BatchNormalization:
    """
    http://arxiv.org/abs/1502.03167
    """

    def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None):
        self.gamma, self.beta = gamma, beta
        self.momentum = momentum
        self.input_shape = None  # shape of the last input (4-D or 2-D)

        # Mean and variance used when train_flg is False
        self.running_mean = running_mean
        self.running_var = running_var

        # Intermediate data used by backward
        self.batch_size = None
        self.xc = None
        self.std = None
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, train_flg=True):
        """Normalise ``x`` and return a result with the same shape as ``x``."""
        self.input_shape = x.shape
        flat = x
        if x.ndim != 2:
            # 4-D input is flattened to 2-D, one row per sample.
            n_samples, _channels, _height, _width = x.shape
            flat = x.reshape(n_samples, -1)

        return self.__forward(flat, train_flg).reshape(*self.input_shape)

    def __forward(self, x, train_flg):
        """Normalise the 2-D array ``x`` column by column."""
        if self.running_mean is None:
            _rows, n_features = x.shape
            self.running_mean = np.zeros(n_features)
            self.running_var = np.zeros(n_features)

        if not train_flg:
            # Use the running statistics instead of the batch statistics.
            centered = x - self.running_mean
            normalized = centered / ((np.sqrt(self.running_var + 10e-7)))
            return self.gamma * normalized + self.beta

        mu = x.mean(axis=0)  # mean of every column
        centered = x - mu
        var = np.mean(centered**2, axis=0)
        std = np.sqrt(var + 10e-7)  # 10e-7 (= 1e-6) guards against division by zero
        normalized = centered / std

        # Cache what backward needs and update the running statistics.
        self.batch_size = x.shape[0]
        self.xc, self.xn, self.std = centered, normalized, std
        keep = self.momentum
        self.running_mean = keep * self.running_mean + (1 - keep) * mu
        self.running_var = keep * self.running_var + (1 - keep) * var

        return self.gamma * normalized + self.beta

    def backward(self, dout):
        """Return the input gradient, shaped like the last forward input."""
        grad = dout
        if dout.ndim != 2:
            n_samples, _channels, _height, _width = dout.shape
            grad = dout.reshape(n_samples, -1)

        return self.__backward(grad).reshape(*self.input_shape)

    def __backward(self, dout):
        """Backpropagate the 2-D gradient; stores ``dgamma`` and ``dbeta``."""
        beta_grad = dout.sum(axis=0)
        gamma_grad = np.sum(self.xn * dout, axis=0)

        d_norm = self.gamma * dout
        d_centered = d_norm / self.std
        d_std = -np.sum((d_norm * self.xc) / (self.std * self.std), axis=0)
        d_var = 0.5 * d_std / self.std
        d_centered = d_centered + (2.0 / self.batch_size) * self.xc * d_var
        d_mu = np.sum(d_centered, axis=0)

        self.dgamma = gamma_grad
        self.dbeta = beta_grad

        return d_centered - d_mu / self.batch_size

In [1]:
(1,2,3,4)+(4,)  # tuple concatenation, the same operation as dout.shape + (pool_size,) in Pooling.backward

(1, 2, 3, 4, 4)