In [1]:
import torch
import matplotlib.pyplot as plt

def corr2d(X, K):
    """Compute the 2-D cross-correlation of input ``X`` with kernel ``K``.

    The kernel is placed at every position where it fits entirely inside
    ``X`` (no padding, step of one), so for an ``h x w`` kernel the result
    has shape ``(X.shape[0] - h + 1, X.shape[1] - w + 1)``.
    """
    k_rows, k_cols = K.shape
    out_rows = X.shape[0] - k_rows + 1
    out_cols = X.shape[1] - k_cols + 1
    Y = torch.zeros((out_rows, out_cols))
    for r in range(out_rows):
        for c in range(out_cols):
            # Multiply the current window element-wise with the kernel, then sum.
            patch = X[r:r + k_rows, c:c + k_cols]
            Y[r, c] = (patch * K).sum()
    return Y

# Example input X: a 3x3 "image" holding the values 0..8
X = torch.arange(9, dtype=torch.float32).reshape(3, 3)

# Example kernel K: 2x2, holding the values 0..3
K = torch.arange(4, dtype=torch.float32).reshape(2, 2)

# Cross-correlate the input with the kernel
Y = corr2d(X, K)

# Show the input, the kernel and the result (label and tensor on separate lines)
print("输入 X:", X, sep="\n")
print("\n卷积核 K:", K, sep="\n")
print("\n输出 Y = corr2d(X, K):", Y, sep="\n")


输入 X:
tensor([[0., 1., 2.],
        [3., 4., 5.],
        [6., 7., 8.]])

卷积核 K:
tensor([[0., 1.],
        [2., 3.]])

输出 Y = corr2d(X, K):
tensor([[19., 25.],
        [37., 43.]])


In [None]:
from torch import nn
# Custom convolutional layer
class Conv2D(nn.Module):
    """2-D cross-correlation layer with a learnable kernel and a scalar bias."""

    def __init__(self, kernel_size):
        super().__init__()
        # Kernel entries are drawn uniformly from [0, 1); the bias starts at zero.
        initial_kernel = torch.rand(kernel_size)
        initial_bias = torch.zeros(1)
        self.weight = nn.Parameter(initial_kernel)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x):
        """Cross-correlate ``x`` with the kernel and add the bias."""
        correlated = corr2d(x, self.weight)
        return correlated + self.bias



In [10]:
# Edge detection: a 6x8 image that is 1 everywhere except columns 2..5, which are 0
X = torch.ones(6, 8)
X[:, 2:6] = 0

# 1x2 kernel: each output is the right neighbour minus the left neighbour
K = torch.tensor([[-1, 1]])
Y = corr2d(X, K)
Y

tensor([[ 0., -1.,  0.,  0.,  0.,  1.,  0.],
        [ 0., -1.,  0.,  0.,  0.,  1.,  0.],
        [ 0., -1.,  0.,  0.,  0.,  1.,  0.],
        [ 0., -1.,  0.,  0.,  0.,  1.,  0.],
        [ 0., -1.,  0.,  0.,  0.,  1.,  0.],
        [ 0., -1.,  0.,  0.,  0.,  1.,  0.]])

In [11]:
# 学习卷积核
import torch
from torch import nn

# Build the training input and the target output
X = torch.ones((6, 8))
X[:, 2:6] = 0
Y = torch.tensor([[1., 1., 0., 0., 0., 0., 1.]] * 6)

# Convolution layer: 1 input channel, 1 output channel, 1x2 kernel, no bias
conv2d = nn.Conv2d(1, 1, kernel_size=(1, 2), bias=False)

# nn.Conv2d works on (N, C, H, W) tensors:
# the first 1 is the batch size, the second 1 is the number of channels.
X = X.reshape((1, 1, 6, 8))  # batch=1, channel=1, height=6, width=8
Y = Y.reshape((1, 1, 6, 7))  # output width = input width - kernel width + 1 = 7

# Training loop
for i in range(100):
    Y_hat = conv2d(X)               # forward pass
    l = (Y_hat - Y) ** 2            # element-wise squared error (summed below, not averaged)
    conv2d.zero_grad()              # clear gradients left over from the previous step
    l.sum().backward()              # backward pass on the summed squared error
    # Manual SGD step. no_grad() keeps the in-place update out of the autograd
    # graph without going through the `.data` attribute.
    with torch.no_grad():
        conv2d.weight -= 3e-2 * conv2d.weight.grad

    if (i + 1) % 2 == 0:
        print(f'batch {i+1}, loss {l.sum():.3f}')

print(conv2d.weight.data.reshape(1,2))


batch 2, loss 8.382
batch 4, loss 3.425
batch 6, loss 1.402
batch 8, loss 0.574
batch 10, loss 0.235
batch 12, loss 0.096
batch 14, loss 0.039
batch 16, loss 0.016
batch 18, loss 0.007
batch 20, loss 0.003
batch 22, loss 0.001
batch 24, loss 0.000
batch 26, loss 0.000
batch 28, loss 0.000
batch 30, loss 0.000
batch 32, loss 0.000
batch 34, loss 0.000
batch 36, loss 0.000
batch 38, loss 0.000
batch 40, loss 0.000
batch 42, loss 0.000
batch 44, loss 0.000
batch 46, loss 0.000
batch 48, loss 0.000
batch 50, loss 0.000
batch 52, loss 0.000
batch 54, loss 0.000
batch 56, loss 0.000
batch 58, loss 0.000
batch 60, loss 0.000
batch 62, loss 0.000
batch 64, loss 0.000
batch 66, loss 0.000
batch 68, loss 0.000
batch 70, loss 0.000
batch 72, loss 0.000
batch 74, loss 0.000
batch 76, loss 0.000
batch 78, loss 0.000
batch 80, loss 0.000
batch 82, loss 0.000
batch 84, loss 0.000
batch 86, loss 0.000
batch 88, loss 0.000
batch 90, loss 0.000
batch 92, loss 0.000
batch 94, loss 0.000
batch 96, loss 0.

In [19]:
import torch
from torch import nn

# Wrapper that lifts a 2-D input X to the 4-D layout (batch_size, channels, height, width)
def comp_conv2d(conv2d, X):
    """Apply ``conv2d`` to a 2-D tensor and hand back the result without its two leading dims.

    ``X`` gets leading batch and channel dimensions of size 1 before the
    call; the output is reshaped to its shape from dimension 2 onwards.
    """
    batched = X.reshape((1, 1) + X.shape)   # => shape (1, 1, H, W)
    result = conv2d(batched)
    spatial_shape = result.shape[2:]
    return result.reshape(spatial_shape)

# Conv layer with 1 input channel, 1 output channel, a 3x3 kernel and padding=1
conv2d = nn.Conv2d(1, 1, 3, padding=1)

# Random 8x8 input tensor
X = torch.rand(8, 8)

# Run the layer and report the output shape
output = comp_conv2d(conv2d, X)
print("输出形状:", output.shape)  # expected: torch.Size([8, 8])


输出形状: torch.Size([8, 8])


In [18]:
# 5x3 kernel with a different padding per dimension: 2 for height, 1 for width
conv2d = nn.Conv2d(1, 1, (5, 3), padding=(2, 1))
output = comp_conv2d(conv2d, X)
print("输出形状:", output.shape)

输出形状: torch.Size([8, 8])


In [None]:
conv2d = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, stride=2, padding=1)
comp_conv2d(conv2d, X).shape
# Output: torch.Size([4, 4]); per dimension the size follows floor((n - k + p + s) / s)
# NOTE(review): p presumably counts the padding added on both sides together - confirm


In [20]:
# Per-dimension settings: (height, width) for kernel size, stride and padding
conv2d = nn.Conv2d(
    in_channels=1,
    out_channels=1,
    kernel_size=(3, 5),
    stride=(3, 4),
    padding=(0, 1),
)
comp_conv2d(conv2d, X).shape
# Output: torch.Size([2, 2])


torch.Size([2, 2])

## 超参数--通道数

In [22]:
import torch

def corr2d_multi_in(X, K):
    """Cross-correlate each slice of ``X`` with the matching slice of ``K`` and add the results.

    ``X`` and ``K`` are iterated in lockstep along their first dimension.
    """
    total = 0
    for channel, kernel in zip(X, K):
        total = total + corr2d(channel, kernel)
    return total


In [27]:
# Three input channels of shape 2x3; channel c is the first channel plus 3 * c
X = torch.stack([
    torch.tensor([[0., 1., 2.], [1., 2., 3.]]) + 3 * c
    for c in range(3)
])

# Three 2x2 kernels; kernel c is the first kernel plus c
K = torch.stack([
    torch.tensor([[0., 1.], [2., 3.]]) + c
    for c in range(3)
])

# Equivalent to
# corr2d(X[0], K[0]) +
# corr2d(X[1], K[1]) +
# corr2d(X[2], K[2])
corr2d_multi_in(X, K).shape

torch.Size([1, 2])

In [28]:
# Multiple output channels: the kernel is 4-dimensional
def corr2d_multi_in_out(X, K):
    """Run one multi-input cross-correlation per entry of ``K``.

    Every slice along the first dimension of ``K`` is applied to the whole
    input ``X``; the results are stacked along a new dimension 0.
    """
    channel_outputs = []
    for kernel in K:
        channel_outputs.append(corr2d_multi_in(X, kernel))
    return torch.stack(channel_outputs, dim=0)

# Kernels for three output channels: K, K + 1 and K + 2, stacked along a new dimension 0
K4 = torch.stack([K, K + 1, K + 2], dim=0)

corr2d_multi_in_out(X, K4)


tensor([[[153., 183.]],

        [[201., 243.]],

        [[249., 303.]]])

## 1×1 卷积核（作用等价于全连接层）

In [30]:
# Only mixes information across channels; spatial positions are handled independently
def corr2d_multi_in_out_1x1(X, K):
    """Apply a 1x1 kernel as a single matrix multiplication.

    ``X`` is unpacked as (input channels, height, width) and ``K`` is
    reshaped to (output channels, input channels). The result has shape
    (output channels, height, width).
    """
    in_channels, height, width = X.shape
    out_channels = K.shape[0]

    # One row per input channel, one column per pixel.
    pixels = X.reshape((in_channels, height * width))
    # One row of channel weights per output channel.
    weights = K.reshape((out_channels, in_channels))

    # (c_o, c_i) @ (c_i, h * w) -> (c_o, h * w): a weighted sum over channels at each pixel.
    mixed = weights @ pixels
    return mixed.reshape((out_channels, height, width))


X = torch.normal(0, 1, (3, 3, 3))     # input: 3 channels, each 3x3
K = torch.normal(0, 1, (2, 3, 1, 1))  # 1x1 kernel: 2 output channels, 3 input channels

Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)

# Check that both implementations agree. The two code paths may round
# differently in float32, so compare element-wise with a tolerance instead of
# bounding the summed absolute difference of all elements by a fixed 1e-6.
assert torch.allclose(Y1, Y2, atol=1e-6)



## 池化层

In [32]:
import torch
from torch import nn
def pool2d(X, pool_size, mode='max'):
    """Apply 2-D max or average pooling with no padding and a step of one.

    Args:
        X: 2-D input tensor.
        pool_size: ``(height, width)`` of the pooling window.
        mode: ``'max'`` for max pooling or ``'avg'`` for average pooling.

    Returns:
        A tensor of shape ``(X.shape[0] - p_h + 1, X.shape[1] - p_w + 1)``
        holding the pooled value of every window position.

    Raises:
        ValueError: if ``mode`` is neither ``'max'`` nor ``'avg'``.
    """
    # Reject unknown modes up front; otherwise the loop below would leave the
    # output at its initial zeros and return a silently wrong result.
    if mode not in ('max', 'avg'):
        raise ValueError(f"mode must be 'max' or 'avg', got {mode!r}")

    # Height and width of the pooling window
    p_h, p_w = pool_size

    # Output size: input size - window size + 1 along each dimension
    Y = torch.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))

    for i in range(Y.shape[0]):          # window position along the height
        for j in range(Y.shape[1]):      # window position along the width
            # Slice of the input covered by the window at (i, j)
            window = X[i:i + p_h, j:j + p_w]
            if mode == 'max':
                Y[i, j] = window.max()
            else:
                Y[i, j] = window.mean()

    return Y



In [33]:
# 3x3 input holding the values 0..8, pooled with a 2x2 window (default mode: max)
x = torch.arange(9, dtype=torch.float32).reshape(3, 3)
pool2d(x, (2, 2))


tensor([[4., 5.],
        [7., 8.]])

In [35]:
# Build the input tensor with batch_size = 1 and channels = 1.
# Batch size: how many images are fed to the network at once.
# Channels: e.g. 1 for a grayscale image, 3 for an RGB image.
x = torch.arange(16, dtype=torch.float32).view(1, 1, 4, 4)


# From the MaxPool2d documentation:
# stride: the stride of the window. Default value is :attr:kernel_size
# padding: Implicit negative infinity padding to be added on both sides
# NOTE: this rebinds the name `pool2d`, which is also the name of the
# hand-written pooling function defined earlier in this file.
pool2d = nn.MaxPool2d(kernel_size=3)
pool2d(x)


tensor([[[[10.]]]])

In [37]:
# 3x3 window with explicit stride and padding
pool2d = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
pool2d(x)


tensor([[[[ 5.,  7.],
          [13., 15.]]]])

In [38]:
# Per-dimension settings: (height, width) for window size, stride and padding
pool2d = nn.MaxPool2d(kernel_size=(2, 3), stride=(2, 3), padding=(1, 1))
pool2d(x)

tensor([[[[ 1.,  3.],
          [ 9., 11.],
          [13., 15.]]]])

In [39]:
import torch
import torch.nn as nn

# Build a two-channel input: channel 0 holds 0..15, channel 1 holds the same values plus 1
X = torch.arange(0., 16.).view(1, 1, 4, 4)  # shape (1, 1, 4, 4)
X = torch.cat([X, X + 1], dim=1)            # append the (X + 1) channel -> shape (1, 2, 4, 4)
print(X)

# 2-D max pooling with kernel_size=3, stride=2, padding=1
pool2d = nn.MaxPool2d(3, stride=2, padding=1)
output = pool2d(X)
print(output)


tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]],

         [[ 1.,  2.,  3.,  4.],
          [ 5.,  6.,  7.,  8.],
          [ 9., 10., 11., 12.],
          [13., 14., 15., 16.]]]])
tensor([[[[ 5.,  7.],
          [13., 15.]],

         [[ 6.,  8.],
          [14., 16.]]]])
