In [6]:
import torch
import torchvision.transforms as transforms
import torchvision
from torch import nn,optim
import torch.nn.functional as F
import time

In [4]:
import numpy as np

In [1]:
import sys

In [2]:
# Extend the module search path before `import d2l` runs in a later cell.
# NOTE(review): this is a machine-specific absolute Windows path; the notebook
# will not run elsewhere unless it is adjusted — presumably this is where the
# local `d2l` helper package lives, confirm.
sys.path.append(r'F:\study\ml\python_packages')

In [3]:
import d2l

- 有多少个输出通道,就要对每个通道分别做归一化,因此输出通道数保持不变
- 训练与预测不同:训练时用当前小批量的均值和方差做归一化,预测时则使用训练过程中通过移动平均估计的均值和方差
- 输入的维数为2时对应全连接层,为4时对应卷积层

In [7]:
def batch_norm(is_training,X,gamma,beta,moving_mean,moving_var,eps,momentum):
    """Batch-normalize ``X`` and return it with the updated running statistics.

    Args:
        is_training: When true, normalize with the statistics of the current
            batch and update the moving averages. Otherwise normalize with
            ``moving_mean`` / ``moving_var`` and return them unchanged.
        X: Input tensor. In training mode it must be 2-D (statistics are taken
            over dim 0) or 4-D (statistics are taken over dims 0, 2 and 3,
            i.e. one value per index of dim 1).
        gamma: Scale multiplied onto the normalized values.
        beta: Shift added after scaling.
        moving_mean: Running estimate of the mean.
        moving_var: Running estimate of the variance.
        eps: Constant added to the variance before taking the square root.
        momentum: Weight kept from the previous running estimate; the current
            batch statistic is weighted by ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where ``Y = gamma * X_hat + beta``.
        The returned running statistics never carry autograd history.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_training:
        # Inference: use the accumulated statistics as they are.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert X.dim() in (2, 4), "X must be 2-D or 4-D in training mode"
        if X.dim() == 2:
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # One reduction over every dim except dim 1; keepdim so the
            # result broadcasts against X.
            reduce_dims = (0, 2, 3)
            mean = X.mean(dim=reduce_dims, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=reduce_dims, keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running statistics are bookkeeping, not part of the loss: update
        # them outside autograd so they do not hold on to the graph of X (which
        # would otherwise chain one training step's graph onto the next).
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var

- BatchNorm 实例
所需指定的 num_features 参数对于全连接层来说应为输出个数，对于卷积层来说则为输出通道数

In [8]:
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    Args:
        num_features: Size of dim 1 of the inputs this layer will see.
        num_dims: ``2`` gives parameters of shape ``(1, num_features)``; any
            other value gives ``(1, num_features, 1, 1)``.

    Attributes are ``gamma`` / ``beta`` (learnable scale and shift) and
    ``moving_mean`` / ``moving_var`` (running statistics, stored as buffers).
    """

    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Learnable scale and shift (nn.Parameter makes them visible to the
        # optimizer): gamma starts at 1 and beta at 0.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics take no part in gradient descent. Registering
        # them as buffers makes them follow the module across devices and
        # appear in state_dict(). The variance starts at 1 (not 0) so that
        # evaluating before any training step does not divide by sqrt(eps).
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.ones(shape))

    def forward(self, X):
        """Normalize ``X``; in training mode also refresh the running statistics."""
        Y, new_mean, new_var = batch_norm(self.training,
                                          X,
                                          self.gamma,
                                          self.beta,
                                          self.moving_mean,
                                          self.moving_var,
                                          eps=1e-5,
                                          momentum=0.9)
        # Store the statistics without autograd history so successive training
        # steps do not keep each other's graphs alive.
        self.moving_mean = new_mean.detach()
        self.moving_var = new_var.detach()
        return Y

In [9]:
# Convolution -> batch norm -> sigmoid -> max-pool, twice; then flatten and
# three fully connected layers with batch norm after the first two.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(num_features=6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(num_features=16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Fully connected head; the first linear layer expects 16*4*4 inputs
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(num_features=120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(num_features=84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)

In [10]:
# Mini-batch iterators over Fashion-MNIST, provided by the local d2l helper.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Optimisation settings: Adam over every parameter of `net`.
lr = 0.001
num_epochs = 5
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)

# Run the training loop; per-epoch metrics are printed by the helper.
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, num_epochs)


epoch 1,loss 1.0063,train acc 0.782,test acc 0.778,time 35.3 sec
epoch 2,loss 0.2278,train acc 0.865,test acc 0.818,time 33.9 sec
epoch 3,loss 0.1235,train acc 0.876,test acc 0.849,time 34.0 sec
epoch 4,loss 0.0836,train acc 0.885,test acc 0.864,time 41.9 sec
epoch 5,loss 0.0626,train acc 0.890,test acc 0.871,time 50.2 sec


In [30]:
# Scale (gamma) and shift (beta) of the layer at index 1 of `net`, each
# flattened to one dimension for display.
(net[1].gamma.view(-1), net[1].beta.view(-1))

(tensor([1.0270, 0.9866, 1.2438, 1.0900, 0.9440, 0.8653],
        grad_fn=<ViewBackward>),
 tensor([-0.6213, -0.6319,  0.1360, -0.7276, -0.1986, -0.2034],
        grad_fn=<ViewBackward>))