In [5]:
import torch
from torch import nn

def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and maintain exponential moving statistics.

    The mode is selected from the autograd state rather than from an explicit
    flag: when gradients are disabled the stored moving statistics are used
    (inference); otherwise the statistics of the current batch are used and
    the moving statistics are updated (training).

    Args:
        X: Input tensor. In the training branch it must be 2-D (statistics
            are taken over dim 0) or 4-D (statistics are taken over dims
            0, 2 and 3, i.e. one value per index of dim 1).
        gamma: Scale applied to the normalized input; must broadcast with X.
        beta: Shift added after scaling; must broadcast with X.
        moving_mean: Running mean. Updated IN PLACE in the training branch.
        moving_var: Running variance. Updated IN PLACE in the training branch.
        eps: Small constant added to the variance before the square root.
        momentum: Weight kept from the old moving statistics; the batch
            statistics contribute ``1 - momentum``.

    Returns:
        A tuple ``(Y, moving_mean, moving_var)`` where ``Y`` is
        ``gamma * X_hat + beta`` and the two statistics are detached views
        sharing storage with the (possibly updated) input buffers.

    Raises:
        AssertionError: In the training branch, if X is neither 2-D nor 4-D.
    """
    if not torch.is_grad_enabled():
        # Inference: normalize with the accumulated moving statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        # Only 2-D and 4-D inputs are supported.
        assert len(X.shape) in (2, 4)

        if len(X.shape) == 2:
            # Per-column statistics over dim 0.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Statistics over dims 0, 2 and 3; keepdim=True keeps the result
            # 4-D so it broadcasts against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)

        # Normalize with the statistics of the current batch.
        X_hat = (X - mean) / torch.sqrt(var + eps)

        # Update the moving statistics outside of autograd. `mean` and `var`
        # are part of the graph when X requires grad; without no_grad() the
        # in-place write would attach the caller's buffers to that graph
        # (they would end up with requires_grad=True and a grad_fn).
        with torch.no_grad():
            moving_mean[:] = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var[:] = momentum * moving_var + (1.0 - momentum) * var

    # Scale and shift; hand back the statistics detached from any graph.
    return gamma * X_hat + beta, moving_mean.detach(), moving_var.detach()


In [6]:
import torch
from torch import nn

class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` helper.

    Args:
        num_features: Size of the normalized dimension (dim 1 of the input).
        num_dims: 2 selects parameters of shape ``(1, num_features)``; any
            other value selects ``(1, num_features, 1, 1)``.

    Attributes:
        gamma: Learnable scale, initialized to ones.
        beta: Learnable shift, initialized to zeros.
        moving_mean: Running mean buffer, initialized to zeros.
        moving_var: Running variance buffer, initialized to ones.
    """

    def __init__(self, num_features, num_dims):
        super().__init__()
        # Parameter/statistic shape is chosen so it broadcasts against the input.
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)

        # Learnable scale (gamma) and shift (beta).
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))

        # Running statistics are registered as buffers (not plain attributes)
        # so that .to()/.cuda()/.double() move or cast them together with the
        # parameters and state_dict() saves and restores them.
        self.register_buffer("moving_mean", torch.zeros(shape))
        self.register_buffer("moving_var", torch.ones(shape))

    def forward(self, X):
        """Normalize ``X`` and refresh the running statistics.

        Returns the normalized, scaled and shifted tensor produced by
        ``batch_norm`` (called with eps=1e-5 and momentum=0.9).
        """
        # Fallback only: buffers already follow the module across devices, so
        # this is normally a no-op.
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)

        # Assigning tensors to registered buffer names keeps them registered.
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta,
            self.moving_mean, self.moving_var,
            eps=1e-5, momentum=0.9
        )
        return Y


In [7]:
# LeNet with the hand-written BatchNorm layer inserted after every
# convolutional / fully connected layer except the output layer.
net = nn.Sequential(
    # --- Convolutional stage 1: 1 -> 6 channels, 5x5 kernel ---
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(num_features=6, num_dims=4),   # 4-D (convolutional) input
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),                      # 2x2 window, stride 2

    # --- Convolutional stage 2: 6 -> 16 channels, 5x5 kernel ---
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(num_features=16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),

    # Collapse everything but the batch dimension for the dense layers.
    nn.Flatten(),

    # --- Dense stage 1: 16 * 4 * 4 = 256 -> 120 ---
    nn.Linear(in_features=256, out_features=120),
    BatchNorm(num_features=120, num_dims=2),  # 2-D (fully connected) input
    nn.Sigmoid(),

    # --- Dense stage 2: 120 -> 84 ---
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(num_features=84, num_dims=2),
    nn.Sigmoid(),

    # --- Output layer: 84 -> 10 classes ---
    nn.Linear(in_features=84, out_features=10),
)



training on cuda:0
loss 0.243, train acc 0.911, test acc 0.885
24795.7 examples/sec on cuda:0


In [None]:
from ml_utils import *
# Hyperparameters for this run.
batch_size = 256
lr = 0.9
num_epochs = 10

# Data iterators come from the ml_utils helper (Fashion-MNIST, per its name).
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size)

# Train `net` on the device selected by try_gpu().
train_ch6(
    net,
    train_iter,
    test_iter,
    num_epochs,
    lr,
    try_gpu(),
)

In [8]:
# 简洁实现
import torch.nn as nn

# Same LeNet layout, using PyTorch's built-in batch-normalization layers.
# NOTE(review): this variant pools with AvgPool2d while the hand-written
# version earlier in the notebook uses MaxPool2d, so the two reported runs
# are not a like-for-like comparison. Confirm whether that is intended.
net = nn.Sequential(
    # --- Convolutional stage 1: 1 input channel -> 6 channels, 5x5 kernel ---
    nn.Conv2d(1, 6, kernel_size=5),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(2, stride=2),   # 2x2 window, stride 2

    # --- Convolutional stage 2: 6 -> 16 channels, 5x5 kernel ---
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(2, stride=2),

    # Flatten to (batch, 256); 256 = 16 * 4 * 4 is what the next layer expects.
    nn.Flatten(),

    # --- Dense stage 1: 256 -> 120 ---
    nn.Linear(16 * 4 * 4, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),

    # --- Dense stage 2: 120 -> 84 ---
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),

    # --- Output layer: 84 -> 10 classes ---
    nn.Linear(84, 10),
)


In [9]:
from ml_utils import *
# Hyperparameters for this run.
batch_size = 256
lr = 0.9
num_epochs = 10

# Data iterators come from the ml_utils helper (Fashion-MNIST, per its name).
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size)

# Train `net` on the device selected by try_gpu().
train_ch6(
    net,
    train_iter,
    test_iter,
    num_epochs,
    lr,
    try_gpu(),
)

training on cuda:0
loss 0.264, train acc 0.902, test acc 0.826
36605.8 examples/sec on cuda:0
