# Convolutional Neural Networks

**Why CNN for Image?**

Use 1st layer as module to build classifiers. Use 2nd layer as module.
Can the network be simplified by considering the properties of images?

Some patterns are much smaller than the whole image. A neuron does not have to see the whole image to discover the pattern. Connecting it to a small region requires fewer parameters.

Subsampling the pixels will not change the object. We can subsample the pixels to make the image smaller $\Longrightarrow$ fewer parameters for the network to process the image.


In [3]:
# LeNet-5
import numpy as np
import time
import pickle  # 用于保存和加载模型

# =============================================================================
# 0. 工具函数 (Utility Functions)
# =============================================================================

def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    """Build the fancy-indexing arrays shared by im2col and col2im.

    im2col turns a convolution into a single matrix multiplication by
    gathering every receptive field of the (padded) input into one column.

    Args:
        x_shape: Input shape ``(N, C, H, W)``; only C, H and W are used.
        field_height: Kernel height.
        field_width: Kernel width.
        padding: Zero padding added on each side of H and W.
        stride: Step between neighbouring receptive fields.

    Returns:
        Tuple ``(k, i, j)`` of integer arrays. ``k`` has shape
        ``(C * field_height * field_width, 1)`` and holds channel indices;
        ``i`` and ``j`` have shape
        ``(C * field_height * field_width, out_height * out_width)`` and hold
        row / column indices into the padded input.

    Raises:
        AssertionError: If the kernel does not tile the padded input exactly
            for the given stride.
    """
    _, channels, height, width = x_shape
    span_h = height + 2 * padding - field_height
    span_w = width + 2 * padding - field_width
    assert span_h % stride == 0
    assert span_w % stride == 0
    out_height = span_h // stride + 1
    out_width = span_w // stride + 1

    # Offsets inside one kernel window, repeated once per channel.
    kernel_rows = np.tile(np.repeat(np.arange(field_height), field_width), channels)
    kernel_cols = np.tile(np.arange(field_width), field_height * channels)

    # Top-left corner of every window, enumerated row-major over the output.
    window_rows = stride * np.repeat(np.arange(out_height), out_width)
    window_cols = stride * np.tile(np.arange(out_width), out_height)

    # A column broadcast against a row yields every (offset, window) pair.
    i = kernel_rows[:, np.newaxis] + window_rows[np.newaxis, :]
    j = kernel_cols[:, np.newaxis] + window_cols[np.newaxis, :]
    k = np.repeat(np.arange(channels), field_height * field_width)[:, np.newaxis]

    return k, i, j

def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """Unfold every receptive field of ``x`` into a column (im2col).

    Args:
        x: Input array of shape ``(N, C, H, W)``.
        field_height: Kernel height.
        field_width: Kernel width.
        padding: Zero padding added on each side of H and W.
        stride: Step between neighbouring receptive fields.

    Returns:
        2-D array with ``C * field_height * field_width`` rows; each column is
        one flattened receptive field of one sample.
    """
    pad_spec = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    padded = np.pad(x, pad_spec, mode='constant')

    chan_idx, row_idx, col_idx = get_im2col_indices(
        x.shape, field_height, field_width, padding, stride
    )

    # Fancy indexing gives (N, rows, windows); moving the batch axis last
    # before the reshape keeps one receptive field per column.
    patches = padded[:, chan_idx, row_idx, col_idx]
    n_rows = field_height * field_width * x.shape[1]
    return patches.transpose(1, 2, 0).reshape(n_rows, -1)

def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
    """Scatter im2col columns back into image layout (used in backprop).

    Args:
        cols: Column matrix laid out as produced by ``im2col_indices``.
        x_shape: Shape ``(N, C, H, W)`` of the image batch to rebuild.
        field_height: Kernel height.
        field_width: Kernel width.
        padding: Zero padding that was used when the columns were built.
        stride: Stride that was used when the columns were built.

    Returns:
        Array of shape ``x_shape`` in which contributions of overlapping
        windows are summed.
    """
    batch, channels, height, width = x_shape
    padded_shape = (batch, channels, height + 2 * padding, width + 2 * padding)
    accum = np.zeros(padded_shape, dtype=cols.dtype)
    chan_idx, row_idx, col_idx = get_im2col_indices(
        x_shape, field_height, field_width, padding, stride
    )

    # Undo the im2col layout: (rows, windows * N) -> (N, rows, windows).
    n_rows = channels * field_height * field_width
    per_sample = cols.reshape(n_rows, -1, batch).transpose(2, 0, 1)
    # np.add.at accumulates on repeated indices, which is what overlapping
    # windows need; a plain indexed assignment would keep only one write.
    np.add.at(accum, (slice(None), chan_idx, row_idx, col_idx), per_sample)

    if padding != 0:
        return accum[:, :, padding:-padding, padding:-padding]
    return accum

def to_categorical_numpy(y, num_classes=10):
    """Convert a vector of integer class labels into a one-hot matrix.

    Args:
        y: Array of class labels; it is cast with ``astype(int)``.
        num_classes: Number of columns of the result.

    Returns:
        Float array of shape ``(len(y), num_classes)`` with a single 1 per row.
    """
    labels = y.astype(int)
    n_samples = len(labels)
    encoded = np.zeros((n_samples, num_classes))
    row_ids = np.arange(n_samples)
    encoded[row_ids, labels] = 1
    return encoded

# =============================================================================
# 1. 基础层定义 (Base Layer Definitions)
# =============================================================================

class Layer:
    """Common base class of all network layers.

    Subclasses override ``forward`` and ``backward``; trainable layers store
    their weights in ``params`` and the matching gradients in ``grads``.
    """

    def __init__(self):
        # Trainable parameters (e.g. weight 'W' and bias 'b'), keyed by name.
        self.params = dict()
        # Gradients for the entries of ``params``, under the same keys.
        self.grads = dict()

    def forward(self, inputs):
        """Compute the layer output; must be implemented by subclasses."""
        raise NotImplementedError

    def backward(self, grad_out):
        """Propagate ``grad_out`` to the input; must be implemented by subclasses."""
        raise NotImplementedError

# =============================================================================
# 2. 激活函数层 (Activation Layers)
# =============================================================================

class Tanh(Layer):
    """Hyperbolic-tangent activation layer."""

    def __init__(self):
        super().__init__()
        # Input of the most recent forward pass; read again by backward().
        self.cache = None

    def forward(self, x):
        """Remember ``x`` and return ``tanh(x)`` element-wise."""
        self.cache = x
        return np.tanh(x)

    def backward(self, grad_out):
        """Return ``grad_out * (1 - tanh(x)**2)`` for the cached input ``x``."""
        activated = np.tanh(self.cache)
        local_slope = 1 - activated**2
        return grad_out * local_slope

class Softmax(Layer):
    """Row-wise softmax, used as the output layer."""

    def __init__(self):
        super().__init__()
        # Probabilities produced by the most recent forward pass.
        self.cache = None

    def forward(self, x):
        """Return softmax probabilities computed along axis 1."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        probabilities = exps / np.sum(exps, axis=1, keepdims=True)
        self.cache = probabilities
        return probabilities

    def backward(self, grad_out):
        """Pass ``grad_out`` through unchanged.

        Placeholder: CrossEntropyLoss.backward in this file already returns
        ``(y_pred - y_true) / m``, so nothing is applied here.
        """
        return grad_out

# =============================================================================
# 3. 功能层 (Functional Layers)
# =============================================================================

class Dense(Layer):
    """Fully connected layer computing ``x . W + b``."""

    def __init__(self, input_dim, output_dim, name="dense"):
        super().__init__()
        self.name = name
        # Standard-normal weights scaled by sqrt(1 / fan_in); zero bias.
        init_scale = np.sqrt(1. / input_dim)
        self.params['W'] = np.random.randn(input_dim, output_dim) * init_scale
        self.params['b'] = np.zeros(output_dim)
        # Input of the most recent forward pass; needed for dW in backward().
        self.cache = None

    def forward(self, x):
        """Cache ``x`` and return the affine transform ``x . W + b``."""
        self.cache = x
        weights, bias = self.params['W'], self.params['b']
        return np.dot(x, weights) + bias

    def backward(self, grad_out):
        """Store dW and db in ``self.grads`` and return the input gradient."""
        cached_input = self.cache
        self.grads['W'] = np.dot(cached_input.T, grad_out)
        self.grads['b'] = np.sum(grad_out, axis=0)
        grad_input = np.dot(grad_out, self.params['W'].T)
        return grad_input

class Flatten(Layer):
    """Collapse every axis after the first into a single feature axis."""

    def __init__(self):
        super().__init__()
        # Shape of the most recent input, restored by backward().
        self.cache = None

    def forward(self, x):
        """Reshape ``x`` to ``(x.shape[0], -1)`` and remember its shape."""
        original_shape = x.shape
        self.cache = original_shape
        return x.reshape(original_shape[0], -1)

    def backward(self, grad_out):
        """Reshape ``grad_out`` back to the cached input shape."""
        original_shape = self.cache
        return grad_out.reshape(original_shape)
    
class Conv2D(Layer):
    """2-D convolution with a square kernel, implemented through im2col."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, name="conv"):
        super().__init__()
        self.name = name
        self.in_channels, self.out_channels = in_channels, out_channels
        self.kernel_size = kernel_size
        self.stride, self.padding = stride, padding
        # Standard-normal weights scaled by 0.1; zero bias.
        weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
        self.params['W'] = np.random.randn(*weight_shape) * 0.1
        self.params['b'] = np.zeros(out_channels)
        # Input of the most recent forward pass.
        self.cache = None

    def forward(self, x):
        """Convolve ``x`` of shape ``(N, C, H, W)``.

        Returns an array of shape ``(N, out_channels, out_h, out_w)`` and keeps
        ``x_col`` / ``W_col`` on the instance for the backward pass.
        """
        batch, _, height, width = x.shape
        ksize = self.kernel_size
        out_h = (height + 2 * self.padding - ksize) // self.stride + 1
        out_w = (width + 2 * self.padding - ksize) // self.stride + 1

        self.x_col = im2col_indices(x, ksize, ksize, self.padding, self.stride)
        self.W_col = self.params['W'].reshape(self.out_channels, -1)
        bias_column = self.params['b'].reshape(-1, 1)
        response = self.W_col @ self.x_col + bias_column

        # im2col columns are ordered (out_h, out_w, N); move N to the front.
        out = response.reshape(self.out_channels, out_h, out_w, batch).transpose(3, 0, 1, 2)
        self.cache = x
        return out

    def backward(self, grad_out):
        """Store dW and db in ``self.grads`` and return the input gradient."""
        ksize = self.kernel_size
        self.grads['b'] = np.sum(grad_out, axis=(0, 2, 3))

        # Bring the gradient into the same (out_channels, out_h*out_w*N)
        # layout that the forward matrix product produced.
        grad_cols = grad_out.transpose(1, 2, 3, 0).reshape(self.out_channels, -1)
        self.grads['W'] = (grad_cols @ self.x_col.T).reshape(self.params['W'].shape)

        dx_cols = self.W_col.T @ grad_cols
        return col2im_indices(dx_cols, self.cache.shape, ksize, ksize, self.padding, self.stride)

class AvgPool2D(Layer):
    """2-D average pooling with a square window."""

    def __init__(self, pool_size, stride, name="avg_pool"):
        super().__init__()
        self.name = name
        self.pool_size = pool_size
        self.stride = stride
        # Input of the most recent forward pass.
        self.cache = None

    def forward(self, x):
        """Average each pooling window of ``x`` (shape ``(N, C, H, W)``)."""
        self.cache = x
        batch, channels, height, width = x.shape
        size, step = self.pool_size, self.stride
        out_h = (height - size) // step + 1
        out_w = (width - size) // step + 1

        pooled = np.zeros((batch, channels, out_h, out_w))
        for row in range(out_h):
            top = row * step
            for col in range(out_w):
                left = col * step
                patch = x[:, :, top:top + size, left:left + size]
                pooled[:, :, row, col] = np.mean(patch, axis=(2, 3))
        return pooled

    def backward(self, grad_out):
        """Spread each output gradient evenly over its pooling window."""
        x = self.cache
        size, step = self.pool_size, self.stride
        out_h, out_w = grad_out.shape[2:4]
        window_area = size * size

        dx = np.zeros_like(x)
        for row in range(out_h):
            top = row * step
            for col in range(out_w):
                left = col * step
                # Every cell of the window contributed 1/area to the mean.
                share = grad_out[:, :, row, col][:, :, np.newaxis, np.newaxis] / window_area
                dx[:, :, top:top + size, left:left + size] += share
        return dx

# =============================================================================
# 4. 损失函数和优化器 (Loss Function and Optimizer)
# =============================================================================

class CrossEntropyLoss:
    """Mean cross-entropy between predicted probabilities and one-hot targets."""

    def __init__(self):
        # Arguments of the most recent forward() call, reused by backward().
        self.y_pred = None
        self.y_true = None

    def forward(self, y_pred, y_true):
        """Return ``-sum(y_true * log(y_pred + 1e-12)) / m`` with ``m = y_true.shape[0]``."""
        self.y_pred = y_pred
        self.y_true = y_true
        batch_size = y_true.shape[0]
        tiny = 1e-12  # keeps log() finite when a probability is exactly 0
        log_probs = np.log(y_pred + tiny)
        return -np.sum(y_true * log_probs) / batch_size

    def backward(self):
        """Return ``(y_pred - y_true) / m`` for the stored forward arguments."""
        batch_size = self.y_true.shape[0]
        return (self.y_pred - self.y_true) / batch_size

class SGD:
    """Plain stochastic-gradient-descent optimizer."""

    def __init__(self, layers, learning_rate=0.01):
        self.layers = layers
        self.lr = learning_rate

    def step(self):
        """Update every parameter in place: ``param -= lr * grad``."""
        for layer in self.layers:
            if not hasattr(layer, 'params'):
                continue
            params, grads = layer.params, layer.grads
            for key in params:
                params[key] -= self.lr * grads[key]

    def zero_grad(self):
        """Overwrite every stored gradient array with zeros, in place."""
        for layer in self.layers:
            if not hasattr(layer, 'grads'):
                continue
            for key in layer.grads:
                layer.grads[key].fill(0)

# =============================================================================
# 5. LeNet-5 模型定义
# =============================================================================

class LeNet5:
    """LeNet-5: two conv/tanh/avg-pool stages followed by three dense layers.

    The ``dense1`` input size of ``5*5*16`` corresponds to single-channel
    32x32 inputs (32 -> 28 -> 14 -> 10 -> 5 through the conv/pool stages).
    """

    def __init__(self):
        feature_stages = [
            Conv2D(in_channels=1, out_channels=6, kernel_size=5, padding=0, name="conv1"),
            Tanh(),
            AvgPool2D(pool_size=2, stride=2, name="pool1"),
            Conv2D(in_channels=6, out_channels=16, kernel_size=5, padding=0, name="conv2"),
            Tanh(),
            AvgPool2D(pool_size=2, stride=2, name="pool2"),
        ]
        classifier_stages = [
            Flatten(),
            Dense(input_dim=5*5*16, output_dim=120, name="dense1"),
            Tanh(),
            Dense(input_dim=120, output_dim=84, name="dense2"),
            Tanh(),
            Dense(input_dim=84, output_dim=10, name="dense3"),
            Softmax(),
        ]
        self.layers = feature_stages + classifier_stages

        # Only layers whose ``params`` dict is non-empty are trainable; these
        # are the ones handed to the optimizer and written to disk.
        self.param_layers = [
            layer for layer in self.layers
            if hasattr(layer, 'params') and layer.params
        ]

    def forward(self, x):
        """Feed ``x`` through all layers in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def backward(self, grad):
        """Propagate ``grad`` through the layers in reverse order."""
        upstream = grad
        for layer in self.layers[::-1]:
            upstream = layer.backward(upstream)
        return upstream

    def save_model(self, file_path):
        """Pickle ``{layer.name: layer.params}`` of all trainable layers."""
        snapshot = {layer.name: layer.params for layer in self.param_layers}
        with open(file_path, 'wb') as f:
            pickle.dump(snapshot, f)
        print(f"模型已保存至 {file_path}")

    def load_model(self, file_path):
        """Restore parameters from a file written by ``save_model``.

        Layers whose name is missing from the file keep their current
        parameters.
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(file_path, 'rb') as f:
            stored = pickle.load(f)
        for layer in self.param_layers:
            if layer.name not in stored:
                continue
            layer.params = stored[layer.name]
        print(f"模型已从 {file_path} 加载")


# =============================================================================
# 6. 训练过程 (Training Process)
# =============================================================================

# Training script: load MNIST, train LeNet-5 with mini-batch SGD, evaluate on
# the test split and save the learned parameters.
if __name__ == '__main__':
    print("正在加载 MNIST 数据集...")
    # scikit-learn is only needed for the dataset download, so it is imported
    # lazily and the script stops with an install hint when it is missing.
    try:
        from sklearn.datasets import fetch_openml
    except ImportError:
        print("请安装 scikit-learn 以加载数据集: pip install scikit-learn")
        exit()

    mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
    X, y = mnist["data"], mnist["target"]
    
    # The first 60000 rows are used for training, the remainder for testing.
    x_train, x_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    
    # Labels become small unsigned integers.
    y_train = y_train.astype(np.uint8)
    y_test = y_test.astype(np.uint8)

    # Scale pixel values by 1/255.
    x_train = x_train.astype('float32') / 255.0
    x_test = x_test.astype('float32') / 255.0
    
    # Flat rows -> 28x28 images.
    x_train = x_train.reshape((-1, 28, 28))
    x_test = x_test.reshape((-1, 28, 28))

    # Insert the channel axis to obtain the (N, C, H, W) layout.
    x_train = np.expand_dims(x_train, axis=1)
    x_test = np.expand_dims(x_test, axis=1)

    # Zero-pad 2 pixels on every side: 28x28 -> 32x32, the size implied by
    # LeNet5's dense1 input dimension.
    x_train = np.pad(x_train, ((0,0), (0,0), (2,2), (2,2)), 'constant')
    x_test = np.pad(x_test, ((0,0), (0,0), (2,2), (2,2)), 'constant')
    
    # One-hot targets for the cross-entropy loss.
    y_train_one_hot = to_categorical_numpy(y_train, 10)
    y_test_one_hot = to_categorical_numpy(y_test, 10)
    
    print("数据加载和预处理完成。")
    print("训练集图像形状:", x_train.shape)
    print("训练集标签形状:", y_train_one_hot.shape)
    print("测试集图像形状:", x_test.shape)
    print("测试集标签形状:", y_test_one_hot.shape)

    # The optimizer only receives the layers that own parameters.
    model = LeNet5()
    loss_fn = CrossEntropyLoss()
    optimizer = SGD(model.param_layers, learning_rate=0.1)

    epochs = 5
    batch_size = 64
    # Floor division: a trailing partial batch is skipped in every epoch.
    num_batches = x_train.shape[0] // batch_size
    
    print("\n开始训练...")
    for epoch in range(epochs):
        start_time = time.time()
        epoch_loss = 0.0
        
        # Draw a fresh sample order for every epoch.
        permutation = np.random.permutation(x_train.shape[0])
        x_train_shuffled = x_train[permutation]
        y_train_shuffled = y_train_one_hot[permutation]
        
        for i in range(num_batches):
            start_idx, end_idx = i * batch_size, (i + 1) * batch_size
            x_batch = x_train_shuffled[start_idx:end_idx]
            y_batch = y_train_shuffled[start_idx:end_idx]
            
            # One SGD step: forward, loss, backward, parameter update.
            optimizer.zero_grad()
            y_pred = model.forward(x_batch)
            loss = loss_fn.forward(y_pred, y_batch)
            epoch_loss += loss
            grad = loss_fn.backward()
            model.backward(grad)
            optimizer.step()
            
            # Progress report every 100 batches.
            if (i + 1) % 100 == 0:
                print(f"    批次 {i+1}/{num_batches} - 当前损失: {loss:.4f}")
        
        end_time = time.time()
        avg_loss = epoch_loss / num_batches
        print(f"** Epoch {epoch+1}/{epochs} - 平均损失: {avg_loss:.4f} - 耗时: {end_time - start_time:.2f}s **")

    print("\n训练完成。")

    print("\n正在评估模型...")
    # NOTE(review): the whole test split goes through a single forward pass,
    # so im2col materialises the columns for every test image at once;
    # consider evaluating in batches if memory is tight.
    y_test_pred_probs = model.forward(x_test)
    y_test_pred_labels = np.argmax(y_test_pred_probs, axis=1)
    
    # Accuracy = fraction of predicted labels equal to the integer labels.
    accuracy = np.mean(y_test_pred_labels == y_test)
    print(f"测试集准确率: {accuracy * 100:.2f}%")

    model.save_model('lenet5_numpy.pkl')

正在加载 MNIST 数据集...
数据加载和预处理完成。
训练集图像形状: (60000, 1, 32, 32)
训练集标签形状: (60000, 10)
测试集图像形状: (10000, 1, 32, 32)
测试集标签形状: (10000, 10)

开始训练...
    批次 100/937 - 当前损失: 0.4665
    批次 200/937 - 当前损失: 0.2899
    批次 300/937 - 当前损失: 0.3256
    批次 400/937 - 当前损失: 0.2282
    批次 500/937 - 当前损失: 0.3417
    批次 600/937 - 当前损失: 0.1903
    批次 700/937 - 当前损失: 0.1106
    批次 800/937 - 当前损失: 0.1167
    批次 900/937 - 当前损失: 0.2131
** Epoch 1/5 - 平均损失: 0.3166 - 耗时: 113.55s **
    批次 100/937 - 当前损失: 0.1171
    批次 200/937 - 当前损失: 0.0636
    批次 300/937 - 当前损失: 0.1013
    批次 400/937 - 当前损失: 0.1675
    批次 500/937 - 当前损失: 0.1098
    批次 600/937 - 当前损失: 0.1250
    批次 700/937 - 当前损失: 0.0361
    批次 800/937 - 当前损失: 0.0477
    批次 900/937 - 当前损失: 0.0844
** Epoch 2/5 - 平均损失: 0.1052 - 耗时: 55.09s **
    批次 100/937 - 当前损失: 0.0388
    批次 200/937 - 当前损失: 0.0832
    批次 300/937 - 当前损失: 0.0196
    批次 400/937 - 当前损失: 0.1256
    批次 500/937 - 当前损失: 0.0107
    批次 600/937 - 当前损失: 0.1346
    批次 700/937 - 当前损失: 0.0821
    批次 800/937 - 当前损失: 0

In [6]:
# AlexNet
import numpy as np
import time
import pickle  # 用于保存和加载模型

# =============================================================================
# 0. 工具函数 (Utility Functions)
# =============================================================================
# 这部分与LeNet-5实现中的工具函数相同，用于加速卷积
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    """Build the fancy-indexing arrays shared by im2col and col2im.

    Args:
        x_shape: Input shape ``(N, C, H, W)``; only C, H and W are used.
        field_height: Window height.
        field_width: Window width.
        padding: Zero padding added on each side of H and W.
        stride: Step between neighbouring windows.

    Returns:
        Tuple ``(k, i, j)`` of integer arrays. ``k`` has shape
        ``(C * field_height * field_width, 1)`` and holds channel indices;
        ``i`` and ``j`` have shape
        ``(C * field_height * field_width, out_height * out_width)`` and hold
        row / column indices into the padded input.
    """
    _, channels, height, width = x_shape
    # Floor division: rows/columns that do not fit a full window are ignored.
    out_height = (height + 2 * padding - field_height) // stride + 1
    out_width = (width + 2 * padding - field_width) // stride + 1

    # Offsets inside one window, repeated once per channel.
    kernel_rows = np.tile(np.repeat(np.arange(field_height), field_width), channels)
    kernel_cols = np.tile(np.arange(field_width), field_height * channels)

    # Top-left corner of every window, enumerated row-major over the output.
    window_rows = stride * np.repeat(np.arange(out_height), out_width)
    window_cols = stride * np.tile(np.arange(out_width), out_height)

    # A column broadcast against a row yields every (offset, window) pair.
    i = kernel_rows[:, np.newaxis] + window_rows[np.newaxis, :]
    j = kernel_cols[:, np.newaxis] + window_cols[np.newaxis, :]
    k = np.repeat(np.arange(channels), field_height * field_width)[:, np.newaxis]
    return k, i, j

def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """Unfold every window of ``x`` (shape ``(N, C, H, W)``) into a column.

    Returns:
        2-D array with ``C * field_height * field_width`` rows; each column is
        one flattened window of one sample.
    """
    pad_spec = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    padded = np.pad(x, pad_spec, mode='constant')
    chan_idx, row_idx, col_idx = get_im2col_indices(
        x.shape, field_height, field_width, padding, stride
    )
    # Fancy indexing gives (N, rows, windows); moving the batch axis last
    # before the reshape keeps one window per column.
    patches = padded[:, chan_idx, row_idx, col_idx]
    n_rows = field_height * field_width * x.shape[1]
    return patches.transpose(1, 2, 0).reshape(n_rows, -1)

def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
    """Scatter im2col columns back into an array of shape ``x_shape``.

    Contributions of overlapping windows are summed; the padding border is
    cropped from the result.
    """
    batch, channels, height, width = x_shape
    padded_shape = (batch, channels, height + 2 * padding, width + 2 * padding)
    accum = np.zeros(padded_shape, dtype=cols.dtype)
    chan_idx, row_idx, col_idx = get_im2col_indices(
        x_shape, field_height, field_width, padding, stride
    )
    # Undo the im2col layout: (rows, windows * N) -> (N, rows, windows).
    n_rows = channels * field_height * field_width
    per_sample = cols.reshape(n_rows, -1, batch).transpose(2, 0, 1)
    # np.add.at accumulates on repeated indices (overlapping windows).
    np.add.at(accum, (slice(None), chan_idx, row_idx, col_idx), per_sample)
    if padding != 0:
        return accum[:, :, padding:-padding, padding:-padding]
    return accum

# 新增：自己实现的 to_categorical 函数
def to_categorical_numpy(y, num_classes=10):
    """Convert a vector of integer class labels into a one-hot matrix.

    Args:
        y: Array of class labels; it is cast with ``astype(int)``.
        num_classes: Number of columns of the result.

    Returns:
        Float array of shape ``(len(y), num_classes)`` with a single 1 per row.
    """
    labels = y.astype(int)
    n_samples = len(labels)
    encoded = np.zeros((n_samples, num_classes))
    row_ids = np.arange(n_samples)
    encoded[row_ids, labels] = 1
    return encoded

# =============================================================================
# 1. 基础层及新层定义 (Base & New Layer Definitions)
# =============================================================================

class Layer:
    """Common base class of all network layers.

    Subclasses override ``forward`` and ``backward``; trainable layers store
    their weights in ``params`` and the matching gradients in ``grads``.
    """

    def __init__(self):
        # Trainable parameters and their gradients, keyed by the same names.
        self.params = dict()
        self.grads = dict()
        # Either 'train' or 'test'; layers such as Dropout branch on it.
        self.mode = 'train'

    def forward(self, inputs):
        """Compute the layer output; must be implemented by subclasses."""
        raise NotImplementedError

    def backward(self, grad_out):
        """Propagate ``grad_out`` to the input; must be implemented by subclasses."""
        raise NotImplementedError

# --- 激活函数 ---
class ReLU(Layer):
    """Rectified-linear activation layer."""

    def __init__(self):
        super().__init__()
        # Input of the most recent forward pass; its sign gates the gradient.
        self.cache = None

    def forward(self, x):
        """Cache ``x`` and return ``max(0, x)`` element-wise."""
        self.cache = x
        return np.maximum(0, x)

    def backward(self, grad_out):
        """Let the gradient through only where the cached input was positive."""
        positive = self.cache > 0
        return grad_out * positive

class Softmax(Layer):
    """Row-wise softmax, used as the output layer."""

    def __init__(self):
        super().__init__()
        # Probabilities produced by the most recent forward pass.
        self.cache = None

    def forward(self, x):
        """Return softmax probabilities computed along axis 1."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        probabilities = exps / np.sum(exps, axis=1, keepdims=True)
        self.cache = probabilities
        return probabilities

    def backward(self, grad_out):
        """Pass ``grad_out`` through unchanged.

        CrossEntropyLoss.backward in this file already returns
        ``(y_pred - y_true) / m``, so nothing is applied here.
        """
        return grad_out

# --- 功能层 ---
class Dense(Layer):
    """Fully connected layer computing ``x . W + b``."""

    def __init__(self, input_dim, output_dim, name="dense"):
        super().__init__()
        self.name = name
        # He initialisation: standard-normal weights scaled by sqrt(2 / fan_in).
        he_scale = np.sqrt(2. / input_dim)
        self.params['W'] = np.random.randn(input_dim, output_dim) * he_scale
        self.params['b'] = np.zeros(output_dim)
        # Input of the most recent forward pass; needed for dW in backward().
        self.cache = None

    def forward(self, x):
        """Cache ``x`` and return the affine transform ``x . W + b``."""
        self.cache = x
        weights, bias = self.params['W'], self.params['b']
        return np.dot(x, weights) + bias

    def backward(self, grad_out):
        """Store dW and db in ``self.grads`` and return the input gradient."""
        cached_input = self.cache
        self.grads['W'] = np.dot(cached_input.T, grad_out)
        self.grads['b'] = np.sum(grad_out, axis=0)
        grad_input = np.dot(grad_out, self.params['W'].T)
        return grad_input

class Flatten(Layer):
    """Collapse every axis after the first into a single feature axis."""

    def __init__(self):
        super().__init__()
        # Shape of the most recent input, restored by backward().
        self.cache = None

    def forward(self, x):
        """Reshape ``x`` to ``(x.shape[0], -1)`` and remember its shape."""
        original_shape = x.shape
        self.cache = original_shape
        return x.reshape(original_shape[0], -1)

    def backward(self, grad_out):
        """Reshape ``grad_out`` back to the cached input shape."""
        original_shape = self.cache
        return grad_out.reshape(original_shape)

class Conv2D(Layer):
    """2-D convolution with a square kernel, implemented through im2col."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, name="conv"):
        super().__init__()
        self.name = name
        # He initialisation: fan-in is in_channels * kernel_size**2.
        fan_in = in_channels * kernel_size * kernel_size
        weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
        self.params['W'] = np.random.randn(*weight_shape) * np.sqrt(2. / fan_in)
        self.params['b'] = np.zeros(out_channels)
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride, self.padding = stride, padding
        # Input of the most recent forward pass.
        self.cache = None

    def forward(self, x):
        """Convolve ``x`` of shape ``(N, C, H, W)``.

        Returns an array of shape ``(N, out_channels, out_h, out_w)`` and keeps
        ``x_col`` / ``W_col`` on the instance for the backward pass.
        """
        batch, _, height, width = x.shape
        ksize = self.kernel_size
        out_h = (height + 2 * self.padding - ksize) // self.stride + 1
        out_w = (width + 2 * self.padding - ksize) // self.stride + 1

        self.x_col = im2col_indices(x, ksize, ksize, self.padding, self.stride)
        self.W_col = self.params['W'].reshape(self.out_channels, -1)
        bias_column = self.params['b'].reshape(-1, 1)
        response = self.W_col @ self.x_col + bias_column

        # im2col columns are ordered (out_h, out_w, N); move N to the front.
        out = response.reshape(self.out_channels, out_h, out_w, batch).transpose(3, 0, 1, 2)
        self.cache = x
        return out

    def backward(self, grad_out):
        """Store dW and db in ``self.grads`` and return the input gradient."""
        ksize = self.kernel_size
        self.grads['b'] = np.sum(grad_out, axis=(0, 2, 3))

        # Same (out_channels, out_h*out_w*N) layout as the forward product.
        grad_cols = grad_out.transpose(1, 2, 3, 0).reshape(self.out_channels, -1)
        weight_grad = grad_cols @ self.x_col.T
        self.grads['W'] = weight_grad.reshape(self.params['W'].shape)

        dx_cols = self.W_col.T @ grad_cols
        return col2im_indices(dx_cols, self.cache.shape, ksize, ksize, self.padding, self.stride)
    
class MaxPool2D(Layer):
    """2-D max pooling with a square window, implemented through im2col."""

    def __init__(self, pool_size, stride, name="max_pool"):
        super().__init__()
        self.name = name
        self.pool_size = pool_size
        self.stride = stride
        # (input, argmax per window) of the most recent forward pass.
        self.cache = None

    def forward(self, x):
        """Return the maximum of each pooling window of ``x`` (``(N, C, H, W)``)."""
        batch, channels, height, width = x.shape
        size = self.pool_size
        out_h = (height - size) // self.stride + 1
        out_w = (width - size) // self.stride + 1

        # Treat every channel as its own single-channel image so that each
        # im2col column holds exactly one pooling window.
        planes = x.reshape(batch * channels, 1, height, width)
        self.x_col = im2col_indices(planes, size, size, padding=0, stride=self.stride)

        winners = np.argmax(self.x_col, axis=0)
        column_ids = np.arange(self.x_col.shape[1])
        pooled = self.x_col[winners, column_ids]
        # Columns are ordered (out_h, out_w, N, C); restore (N, C, out_h, out_w).
        pooled = pooled.reshape(out_h, out_w, batch, channels).transpose(2, 3, 0, 1)

        self.cache = (x, winners)
        return pooled

    def backward(self, grad_out):
        """Route each output gradient to the element that won its window."""
        x, winners = self.cache
        batch, channels, height, width = x.shape
        size = self.pool_size

        # Flatten in the same (out_h, out_w, N, C) order as the columns.
        flat_grad = grad_out.transpose(2, 3, 0, 1).ravel()
        dx_col = np.zeros_like(self.x_col)
        column_ids = np.arange(self.x_col.shape[1])
        dx_col[winners, column_ids] = flat_grad

        plane_shape = (batch * channels, 1, height, width)
        dx = col2im_indices(dx_col, plane_shape, size, size, padding=0, stride=self.stride)
        return dx.reshape(x.shape)

class Dropout(Layer):
    """Inverted dropout.

    In 'train' mode each activation is kept with probability ``p`` and the
    survivors are scaled by ``1 / p``; in any other mode the layer is the
    identity.

    Args:
        p: Probability of KEEPING a unit (not of dropping it); must satisfy
            ``0 < p <= 1``.
        name: Layer name.

    Raises:
        ValueError: If ``p`` is outside ``(0, 1]`` (``p == 0`` would divide by
            zero when the mask is scaled).
    """

    def __init__(self, p=0.5, name="dropout"):
        super().__init__()
        if not 0.0 < p <= 1.0:
            raise ValueError(f"keep probability p must be in (0, 1], got {p!r}")
        self.name = name
        self.p = p  # keep probability
        # Scaled keep-mask of the last training forward pass; None otherwise.
        self.mask = None

    def forward(self, x):
        """Apply a fresh random mask in 'train' mode, otherwise return ``x``."""
        if self.mode == 'train':
            self.mask = (np.random.rand(*x.shape) < self.p) / self.p
            return x * self.mask
        # Identity at inference time. Drop any mask left over from training so
        # that a later backward() cannot apply a mask that does not belong to
        # this forward pass.
        self.mask = None
        return x

    def backward(self, grad_out):
        """Apply the mask of the matching forward pass to ``grad_out``."""
        if self.mask is None:
            # The forward pass was the identity, so its gradient is too.
            return grad_out
        return grad_out * self.mask

class LocalResponseNorm(Layer):
    """Local response normalisation across neighbouring channels.

    Forward pass for channel ``i``::

        s_i = k + alpha * sum_{j in W(i)} x_j ** 2
        y_i = x_i / s_i ** beta

    where ``W(i)`` is the channel window ``[i - n // 2, i + n // 2]`` clipped
    to the valid channel range.

    Args:
        n: Window size parameter; ``n // 2`` neighbours are used on each side.
        alpha: Scale applied to the windowed sum of squares.
        beta: Exponent of the normalisation term.
        k: Additive constant of the normalisation term.
        name: Layer name.
    """

    def __init__(self, n=5, alpha=1e-4, beta=0.75, k=2, name="lrn"):
        super().__init__()
        self.n = n
        self.alpha = alpha
        self.beta = beta
        self.k = k
        self.name = name
        self.cache = None               # input x of the last forward pass
        self.norm_factor_cache = None   # s ** beta of the last forward pass
        self.scale_cache = None         # s of the last forward pass

    def _channel_window_sum(self, values):
        """Sum ``values`` over each channel's neighbourhood window (axis 1)."""
        half_n = self.n // 2
        channels = values.shape[1]
        summed = np.zeros_like(values)
        for c in range(channels):
            lo = max(0, c - half_n)
            hi = min(channels, c + half_n + 1)
            summed[:, c, :, :] = np.sum(values[:, lo:hi, :, :], axis=1)
        return summed

    def forward(self, x):
        """Normalise ``x`` (indexed as ``(N, C, H, W)``) across channels."""
        self.cache = x
        scale = self.k + self.alpha * self._channel_window_sum(x**2)
        norm_factor = scale**self.beta
        self.scale_cache = scale
        self.norm_factor_cache = norm_factor
        return x / norm_factor

    def backward(self, grad_out):
        """Return the exact gradient of the forward pass w.r.t. its input.

        Differentiating ``y_i = x_i * s_i ** -beta`` gives::

            dL/dx_j = g_j * s_j ** -beta
                      - 2 * alpha * beta * x_j
                        * sum_{i : j in W(i)} g_i * x_i * s_i ** (-beta - 1)

        The window is symmetric, so ``{i : j in W(i)}`` equals ``W(j)`` and the
        second term reuses the same windowed channel sum as the forward pass.
        """
        x = self.cache
        scale = self.scale_cache
        norm_factor = self.norm_factor_cache

        # Direct path: x_j appears in the numerator of y_j.
        direct = grad_out / norm_factor
        # Indirect path: x_j appears in the normaliser of every neighbour.
        # norm_factor * scale == s ** (beta + 1).
        weighted = grad_out * x / (norm_factor * scale)
        cross = 2.0 * self.alpha * self.beta * x * self._channel_window_sum(weighted)
        return direct - cross

# =============================================================================
# 2. 损失函数和优化器 (Loss & Optimizer)
# =============================================================================

class CrossEntropyLoss:
    """Mean cross-entropy between predicted probabilities and one-hot targets."""

    def __init__(self):
        # Arguments of the most recent forward() call, reused by backward().
        self.y_pred = None
        self.y_true = None

    def forward(self, y_pred, y_true):
        """Return ``-sum(y_true * log(y_pred + 1e-12)) / m`` with ``m = y_true.shape[0]``."""
        self.y_pred = y_pred
        self.y_true = y_true
        batch_size = y_true.shape[0]
        # The small constant keeps log() finite when a probability is exactly 0.
        log_probs = np.log(y_pred + 1e-12)
        return -np.sum(y_true * log_probs) / batch_size

    def backward(self):
        """Return ``(y_pred - y_true) / m`` for the stored forward arguments."""
        batch_size = self.y_true.shape[0]
        return (self.y_pred - self.y_true) / batch_size

class SGD:
    """Plain stochastic-gradient-descent optimizer."""

    def __init__(self, layers, learning_rate=0.01):
        self.layers = layers
        self.lr = learning_rate

    def step(self):
        """Update every parameter in place: ``param -= lr * grad``.

        Layers without a ``params`` attribute, or with an empty one, are
        skipped.
        """
        for layer in self.layers:
            if not (hasattr(layer, 'params') and layer.params):
                continue
            params, grads = layer.params, layer.grads
            for key in params:
                params[key] -= self.lr * grads[key]

# =============================================================================
# 3. AlexNet 模型定义 (AlexNet-style for CIFAR-10)
# =============================================================================

class AlexNet:
    """AlexNet-style network sized for CIFAR-10.

    The ``dense1`` input size of ``4*4*64`` corresponds to 3-channel 32x32
    inputs: the padded 3x3 convolutions keep the spatial size and each of the
    three max-pool layers halves it (32 -> 16 -> 8 -> 4).
    """

    def __init__(self):
        conv_stack = [
            Conv2D(in_channels=3, out_channels=32, kernel_size=3, padding=1, name="conv1"),
            ReLU(),
            MaxPool2D(pool_size=2, stride=2, name="pool1"),
            LocalResponseNorm(name="lrn1"),
            Conv2D(in_channels=32, out_channels=64, kernel_size=3, padding=1, name="conv2"),
            ReLU(),
            MaxPool2D(pool_size=2, stride=2, name="pool2"),
            LocalResponseNorm(name="lrn2"),
            Conv2D(in_channels=64, out_channels=128, kernel_size=3, padding=1, name="conv3"),
            ReLU(),
            Conv2D(in_channels=128, out_channels=128, kernel_size=3, padding=1, name="conv4"),
            ReLU(),
            Conv2D(in_channels=128, out_channels=64, kernel_size=3, padding=1, name="conv5"),
            ReLU(),
            MaxPool2D(pool_size=2, stride=2, name="pool3"),
        ]
        classifier_stack = [
            Flatten(),
            Dense(input_dim=4*4*64, output_dim=512, name="dense1"),
            ReLU(),
            Dropout(p=0.5, name="dropout1"),
            Dense(input_dim=512, output_dim=512, name="dense2"),
            ReLU(),
            Dropout(p=0.5, name="dropout2"),
            Dense(input_dim=512, output_dim=10, name="dense3"),
            Softmax(),
        ]
        self.layers = conv_stack + classifier_stack
        # Only layers whose ``params`` dict is non-empty are trainable.
        self.param_layers = [
            layer for layer in self.layers
            if hasattr(layer, 'params') and layer.params
        ]

    def set_mode(self, mode):
        """Set ``mode`` ('train' or 'test') on every layer."""
        for layer in self.layers:
            layer.mode = mode

    def forward(self, x):
        """Feed ``x`` through all layers in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def backward(self, grad):
        """Propagate ``grad`` through the layers in reverse order."""
        upstream = grad
        for layer in self.layers[::-1]:
            upstream = layer.backward(upstream)
        return upstream

    def save_model(self, file_path):
        """Pickle ``{layer.name: layer.params}`` of all trainable layers."""
        snapshot = {}
        for layer in self.param_layers:
            snapshot[layer.name] = layer.params
        with open(file_path, 'wb') as f:
            pickle.dump(snapshot, f)
        print(f"模型已保存至 {file_path}")

    def load_model(self, file_path):
        """Restore parameters from a file written by ``save_model``.

        Layers whose name is missing from the file keep their current
        parameters.
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(file_path, 'rb') as f:
            stored = pickle.load(f)
        for layer in self.param_layers:
            if layer.name not in stored:
                continue
            layer.params = stored[layer.name]
        print(f"模型已从 {file_path} 加载")

# =============================================================================
# 4. 训练过程 (Training Process)
# =============================================================================

# Training script: load CIFAR-10, train the AlexNet-style model on a small
# subset with mini-batch SGD, evaluate on 500 test samples and save the model.
if __name__ == '__main__':
    # --- Data loading and preprocessing (via scikit-learn) ---
    print("正在加载 CIFAR-10 数据集...")
    try:
        from sklearn.datasets import fetch_openml
    except ImportError:
        print("请安装 scikit-learn 以加载数据集: pip install scikit-learn")
        exit()

    # Fetch the data from OpenML; this can take a while (the original author
    # notes that the download is cached automatically).
    cifar = fetch_openml('CIFAR_10', version=1, as_frame=False, parser='auto')
    X, y = cifar.data, cifar.target

    # 1. Normalisation and type conversion
    X = X.astype('float32') / 255.0
    y = y.astype(np.uint8)

    # 2. Reshape the flattened rows to the (N, C, H, W) layout our layers expect.
    # NOTE(review): assumes each row stores 3072 values channel-major
    # (3 x 32 x 32) — confirm against the OpenML dataset description.
    X = X.reshape(-1, 3, 32, 32)
    
    # 3. Manual train/test split: first 50000 rows for training, the next
    # 10000 rows for testing.
    # NOTE(review): the original comment claimed 70000 samples and a
    # 50k/10k/10k split; CIFAR-10 is normally 60000 images (50k/10k) — TODO
    # confirm what fetch_openml actually returns.
    x_train, x_test = X[:50000], X[50000:60000]
    y_train, y_test = y[:50000], y[50000:60000]

    # 4. One-hot encode the labels
    y_train_one_hot = to_categorical_numpy(y_train, 10)
    y_test_one_hot = to_categorical_numpy(y_test, 10)
    
    # To get results quickly only a small part of the data is used.
    # To train on the full dataset, comment out the two lines below.
    print("!!! 警告: 为快速演示，仅使用一小部分数据进行训练。!!!")
    x_train, y_train_one_hot = x_train[:1280], y_train_one_hot[:1280]

    print("数据加载和预处理完成。")
    print("训练集图像形状:", x_train.shape)
    print("训练集标签形状:", y_train_one_hot.shape)

    # --- Model initialisation ---
    model = AlexNet()
    loss_fn = CrossEntropyLoss()
    optimizer = SGD(model.param_layers, learning_rate=0.01)

    # --- Training hyper-parameters ---
    epochs = 2
    batch_size = 32
    # Floor division: a trailing partial batch would be skipped.
    num_batches = x_train.shape[0] // batch_size
    
    print(f"\n开始训练... (Epochs={epochs}, Batch Size={batch_size})")
    for epoch in range(epochs):
        start_time = time.time()
        epoch_loss = 0.0
        # Enables the random mask in the Dropout layers.
        model.set_mode('train')
        
        # Draw a fresh sample order for every epoch.
        permutation = np.random.permutation(x_train.shape[0])
        x_train_shuffled = x_train[permutation]
        y_train_shuffled = y_train_one_hot[permutation]
        
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = start_idx + batch_size
            x_batch = x_train_shuffled[start_idx:end_idx]
            y_batch = y_train_shuffled[start_idx:end_idx]
            
            # Forward pass and loss.
            y_pred = model.forward(x_batch)
            loss = loss_fn.forward(y_pred, y_batch)
            epoch_loss += loss
            
            # Backward pass and parameter update. Every layer overwrites its
            # gradients in backward(), so no explicit zeroing is done here.
            grad = loss_fn.backward()
            model.backward(grad)
            optimizer.step()
            
            # Progress report every 10 batches.
            if (i + 1) % 10 == 0:
                print(f"    Epoch {epoch+1}, 批次 {i+1}/{num_batches} - 当前损失: {loss:.4f}")
        
        end_time = time.time()
        avg_loss = epoch_loss / num_batches
        print(f"** Epoch {epoch+1}/{epochs} - 平均损失: {avg_loss:.4f} - 耗时: {end_time - start_time:.2f}s **")

    print("\n训练完成。")

    # --- Model evaluation ---
    print("\n正在评估模型...")
    # Switches the Dropout layers to the identity.
    model.set_mode('test')
    
    # Only the first 500 test samples are evaluated.
    x_test_subset = x_test[:500]
    y_test_subset_labels = y_test[:500]

    y_test_pred_probs = model.forward(x_test_subset)
    y_test_pred_labels = np.argmax(y_test_pred_probs, axis=1)
    
    # Accuracy = fraction of predicted labels equal to the integer labels.
    accuracy = np.mean(y_test_pred_labels == y_test_subset_labels)
    print(f"在500个测试样本上的准确率: {accuracy * 100:.2f}%")

    model.save_model('alexnet_numpy.pkl')

正在加载 CIFAR-10 数据集...
!!! 警告: 为快速演示，仅使用一小部分数据进行训练。!!!
数据加载和预处理完成。
训练集图像形状: (1280, 3, 32, 32)
训练集标签形状: (1280, 10)

开始训练... (Epochs=2, Batch Size=32)
    Epoch 1, 批次 10/40 - 当前损失: 2.4539
    Epoch 1, 批次 20/40 - 当前损失: 2.3031
    Epoch 1, 批次 30/40 - 当前损失: 2.2378
    Epoch 1, 批次 40/40 - 当前损失: 2.2460
** Epoch 1/2 - 平均损失: 2.3283 - 耗时: 10.26s **
    Epoch 2, 批次 10/40 - 当前损失: 2.2842
    Epoch 2, 批次 20/40 - 当前损失: 2.3652
    Epoch 2, 批次 30/40 - 当前损失: 2.2850
    Epoch 2, 批次 40/40 - 当前损失: 2.3594
** Epoch 2/2 - 平均损失: 2.3033 - 耗时: 11.56s **

训练完成。

正在评估模型...
在500个测试样本上的准确率: 17.00%
模型已保存至 alexnet_numpy.pkl


In [7]:
# VGGNet
import numpy as np
import time
import pickle  # 用于保存和加载模型

# =============================================================================
# 0. 工具函数 (Utility Functions)
# =============================================================================
# 这部分与之前的实现完全相同，用于加速卷积和进行one-hot编码
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    """Build the fancy-index arrays that gather every receptive field.

    Args:
        x_shape: 4-tuple (N, C, H, W) describing the un-padded input.
        field_height: Height of the sliding window.
        field_width: Width of the sliding window.
        padding: Zero padding assumed on each spatial border.
        stride: Step of the sliding window.

    Returns:
        Tuple (k, i, j) of integer arrays: ``k`` has shape
        (C * field_height * field_width, 1) and holds channel indices;
        ``i`` and ``j`` have shape
        (C * field_height * field_width, out_height * out_width) and hold
        row / column indices into the padded input.
    """
    _, channels, height, width = x_shape
    out_height = (height + 2 * padding - field_height) // stride + 1
    out_width = (width + 2 * padding - field_width) // stride + 1

    # Offsets of each element inside one window, repeated for every channel.
    row_in_field = np.tile(np.repeat(np.arange(field_height), field_width), channels)
    col_in_field = np.tile(np.arange(field_width), field_height * channels)

    # Top-left corner of the window for every output position.
    row_start = stride * np.repeat(np.arange(out_height), out_width)
    col_start = stride * np.tile(np.arange(out_width), out_height)

    i = row_in_field[:, None] + row_start[None, :]
    j = col_in_field[:, None] + col_start[None, :]
    k = np.repeat(np.arange(channels), field_height * field_width)[:, None]
    return (k, i, j)

def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """Unfold the sliding windows of a 4-D array into the columns of a matrix.

    Args:
        x: Array of shape (N, C, H, W).
        field_height: Window height.
        field_width: Window width.
        padding: Zero padding added to both spatial borders.
        stride: Window step.

    Returns:
        2-D array with ``field_height * field_width * C`` rows; each column
        is one flattened window.
    """
    pad_spec = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    padded = np.pad(x, pad_spec, mode='constant')
    k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride)
    # Fancy indexing yields (N, C*fh*fw, number_of_windows).
    patches = padded[:, k, i, j]
    n_rows = field_height * field_width * x.shape[1]
    return patches.transpose(1, 2, 0).reshape(n_rows, -1)

def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
    """Scatter-add a column matrix back into a 4-D array (inverse of im2col).

    Values that map to the same input location are summed, which is what
    back-propagation through overlapping windows requires.

    Args:
        cols: Column matrix laid out like the output of ``im2col_indices``.
        x_shape: Target shape (N, C, H, W) of the un-padded array.
        field_height: Window height.
        field_width: Window width.
        padding: Padding that was used when the columns were produced.
        stride: Stride that was used when the columns were produced.

    Returns:
        Array of shape ``x_shape`` with dtype ``cols.dtype``.
    """
    N, C, H, W = x_shape
    padded_shape = (N, C, H + 2 * padding, W + 2 * padding)
    accumulator = np.zeros(padded_shape, dtype=cols.dtype)
    k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride)
    per_sample = cols.reshape(C * field_height * field_width, -1, N).transpose(2, 0, 1)
    # np.add.at accumulates correctly even when indices repeat.
    np.add.at(accumulator, (slice(None), k, i, j), per_sample)
    if padding != 0:
        return accumulator[:, :, padding:-padding, padding:-padding]
    return accumulator

def to_categorical_numpy(y, num_classes=10):
    """Convert a vector of integer class labels into a one-hot matrix.

    Args:
        y: NumPy array of class labels (cast to ``int`` internally).
        num_classes: Number of columns of the result.

    Returns:
        Float array of shape (len(y), num_classes) with a single 1 per row.
    """
    labels = y.astype(int)
    n_samples = len(labels)
    one_hot = np.zeros((n_samples, num_classes))
    rows = np.arange(n_samples)
    one_hot[rows, labels] = 1
    return one_hot

# =============================================================================
# 1. 基础层定义 (Layer Definitions)
# =============================================================================
# 这部分与AlexNet的实现基本相同，是构建任何CNN的基础
class Layer:
    """Common base class for all network layers.

    Attributes set by the constructor:
        params: dict of trainable arrays (empty for parameter-free layers).
        grads: dict of gradients, filled by subclasses during ``backward``.
        mode: string flag, initialised to ``'train'``.
    """

    def __init__(self):
        self.params, self.grads = {}, {}
        self.mode = 'train'

    def forward(self, inputs):
        """Compute the layer output; must be overridden."""
        raise NotImplementedError

    def backward(self, grad_out):
        """Propagate ``grad_out`` to the layer input; must be overridden."""
        raise NotImplementedError

class ReLU(Layer):
    """Element-wise rectified linear unit."""

    def __init__(self):
        super().__init__()
        self.cache = None  # input of the most recent forward pass

    def forward(self, x):
        """Return ``max(0, x)`` element-wise and remember ``x``."""
        self.cache = x
        return np.maximum(0, x)

    def backward(self, grad_out):
        """Let the gradient through only where the cached input was positive."""
        positive = self.cache > 0
        return grad_out * positive

class Softmax(Layer):
    """Row-wise softmax over axis 1."""

    def __init__(self):
        super().__init__()
        self.cache = None  # probabilities of the most recent forward pass

    def forward(self, x):
        """Return softmax probabilities for each row of ``x``."""
        # Subtracting the row maximum keeps exp() from overflowing.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.cache = exps / np.sum(exps, axis=1, keepdims=True)
        return self.cache

    def backward(self, grad_out):
        """Return ``grad_out`` unchanged.

        NOTE(review): this presumably relies on the loss supplying the
        gradient with respect to the logits directly - confirm against the
        loss used with this layer.
        """
        return grad_out

class Dense(Layer):
    """Fully connected layer computing ``x . W + b``.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output features.
        name: Identifier stored on the layer.
    """

    def __init__(self, input_dim, output_dim, name="dense"):
        super().__init__()
        self.name = name
        self.cache = None  # input of the most recent forward pass
        he_std = np.sqrt(2. / input_dim)  # He initialisation scale
        self.params['W'] = np.random.randn(input_dim, output_dim) * he_std
        self.params['b'] = np.zeros(output_dim)

    def forward(self, x):
        """Apply the affine map and remember ``x`` for the backward pass."""
        self.cache = x
        projected = np.dot(x, self.params['W'])
        return projected + self.params['b']

    def backward(self, grad_out):
        """Store gradients for W and b; return the gradient w.r.t. the input."""
        inputs = self.cache
        self.grads['W'] = np.dot(inputs.T, grad_out)
        self.grads['b'] = np.sum(grad_out, axis=0)
        return np.dot(grad_out, self.params['W'].T)

class Flatten(Layer):
    """Collapse every axis after the first into a single axis."""

    def __init__(self):
        super().__init__()
        self.cache = None  # shape of the most recent input

    def forward(self, x):
        """Reshape ``x`` to (x.shape[0], -1) and remember the original shape."""
        self.cache = x.shape
        batch = x.shape[0]
        return x.reshape(batch, -1)

    def backward(self, grad_out):
        """Restore the gradient to the cached input shape."""
        original_shape = self.cache
        return grad_out.reshape(original_shape)

class Conv2D(Layer):
    """2-D convolution with square kernels, implemented via im2col.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of filters.
        kernel_size: Side length of the square kernel.
        stride: Window step.
        padding: Zero padding on each spatial border.
        name: Identifier stored on the layer.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, name="conv"):
        super().__init__()
        self.name = name
        fan_in = in_channels * kernel_size * kernel_size
        weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
        # He initialisation
        self.params['W'] = np.random.randn(*weight_shape) * np.sqrt(2. / fan_in)
        self.params['b'] = np.zeros(out_channels)
        self.stride = stride
        self.padding = padding
        self.kernel_size = kernel_size
        self.out_channels = out_channels
        self.cache = None  # input of the most recent forward pass

    def forward(self, x):
        """Convolve ``x`` of shape (N, C, H, W); returns (N, out_channels, out_h, out_w)."""
        N, _, H, W = x.shape
        ksize, step, pad = self.kernel_size, self.stride, self.padding
        out_h = (H + 2 * pad - ksize) // step + 1
        out_w = (W + 2 * pad - ksize) // step + 1

        # Both matrices are kept on the instance because backward() reuses them.
        self.x_col = im2col_indices(x, ksize, ksize, pad, step)
        self.W_col = self.params['W'].reshape(self.out_channels, -1)

        scores = self.W_col @ self.x_col + self.params['b'].reshape(-1, 1)
        self.cache = x
        return scores.reshape(self.out_channels, out_h, out_w, N).transpose(3, 0, 1, 2)

    def backward(self, grad_out):
        """Store gradients for W and b; return the gradient w.r.t. the input."""
        self.grads['b'] = np.sum(grad_out, axis=(0, 2, 3))
        grad_matrix = grad_out.transpose(1, 2, 3, 0).reshape(self.out_channels, -1)
        weight_grad = grad_matrix @ self.x_col.T
        self.grads['W'] = weight_grad.reshape(self.params['W'].shape)
        dx_col = self.W_col.T @ grad_matrix
        return col2im_indices(
            dx_col, self.cache.shape,
            self.kernel_size, self.kernel_size, self.padding, self.stride,
        )
    
class MaxPool2D(Layer):
    """Max pooling over square windows, implemented via im2col.

    Args:
        pool_size: Side length of the pooling window.
        stride: Window step.
        name: Identifier stored on the layer.
    """

    def __init__(self, pool_size, stride, name="max_pool"):
        super().__init__()
        self.name = name
        self.pool_size = pool_size
        self.stride = stride
        self.cache = None  # (input, argmax rows) of the most recent forward pass

    def forward(self, x):
        """Pool ``x`` of shape (N, C, H, W); returns (N, C, out_h, out_w)."""
        N, C, H, W = x.shape
        size = self.pool_size
        out_h = (H - size) // self.stride + 1
        out_w = (W - size) // self.stride + 1

        # Fold channels into the batch axis so every channel is pooled on its own.
        as_single_channel = x.reshape(N * C, 1, H, W)
        self.x_col = im2col_indices(as_single_channel, size, size, padding=0, stride=self.stride)

        max_idx = np.argmax(self.x_col, axis=0)
        column_ids = np.arange(self.x_col.shape[1])
        pooled = self.x_col[max_idx, column_ids]
        self.cache = (x, max_idx)
        return pooled.reshape(out_h, out_w, N, C).transpose(2, 3, 0, 1)

    def backward(self, grad_out):
        """Route each output gradient to the position that held the maximum."""
        x, max_idx = self.cache
        N, C, H, W = x.shape
        size = self.pool_size

        dx_col = np.zeros_like(self.x_col)
        column_ids = np.arange(self.x_col.shape[1])
        dx_col[max_idx, column_ids] = grad_out.transpose(2, 3, 0, 1).ravel()

        dx = col2im_indices(dx_col, (N * C, 1, H, W), size, size, padding=0, stride=self.stride)
        return dx.reshape(x.shape)

# =============================================================================
# 2. 损失函数和优化器 (Loss & Optimizer)
# =============================================================================
class CrossEntropyLoss:
    """Mean cross-entropy between predicted probabilities and one-hot targets."""

    def __init__(self):
        self.y_pred = None
        self.y_true = None

    def forward(self, y_pred, y_true):
        """Return the loss averaged over the first axis of ``y_true``.

        Both arguments are remembered for ``backward``. A small constant is
        added inside the logarithm to avoid ``log(0)``.
        """
        self.y_pred = y_pred
        self.y_true = y_true
        batch = y_true.shape[0]
        log_likelihood = y_true * np.log(y_pred + 1e-12)
        return -np.sum(log_likelihood) / batch

    def backward(self):
        """Return ``(y_pred - y_true) / batch`` for the cached pair."""
        batch = self.y_true.shape[0]
        return (self.y_pred - self.y_true) / batch

class SGD:
    """Plain stochastic gradient descent.

    Args:
        layers: Iterable of layer objects exposing ``params`` and ``grads``.
        learning_rate: Step size applied to every gradient.
    """

    def __init__(self, layers, learning_rate=0.01):
        self.layers = layers
        self.lr = learning_rate

    def step(self):
        """Update every parameter in place: ``p -= lr * grad``.

        Layers without a ``params`` attribute or with an empty one are skipped.
        """
        for layer in self.layers:
            weights = getattr(layer, 'params', None)
            if not weights:
                continue
            for key in weights:
                weights[key] -= self.lr * layer.grads[key]

# =============================================================================
# 3. VGG-11 模型定义
# =============================================================================

class VGG11:
    """VGG-11-style network sized for CIFAR-10 inputs of shape (N, 3, 32, 32).

    The stack holds 8 convolutions (3x3 kernel, padding 1, each followed by
    ReLU) arranged in 5 blocks, with a 2x2 / stride-2 max-pool closing each
    block. The classifier is Flatten -> Dense(512, 512) -> ReLU ->
    Dense(512, 10) -> Softmax, i.e. two dense layers rather than the three
    of the original VGG design.

    Attributes:
        layers: Ordered list of all layers.
        param_layers: Subset of ``layers`` that own a non-empty ``params``.
    """

    def __init__(self):
        stack = []

        def add_conv(c_in, c_out, tag):
            # 3x3 convolution with padding 1 keeps the spatial size.
            stack.append(Conv2D(in_channels=c_in, out_channels=c_out, kernel_size=3, padding=1, name=tag))
            stack.append(ReLU())

        def add_pool(tag):
            # 2x2 pooling with stride 2 halves the spatial size.
            stack.append(MaxPool2D(pool_size=2, stride=2, name=tag))

        # Block 1: 32x32 -> 16x16
        add_conv(3, 64, "conv1")
        add_pool("pool1")

        # Block 2: 16x16 -> 8x8
        add_conv(64, 128, "conv2")
        add_pool("pool2")

        # Block 3: 8x8 -> 4x4
        add_conv(128, 256, "conv3_1")
        add_conv(256, 256, "conv3_2")
        add_pool("pool3")

        # Block 4: 4x4 -> 2x2
        add_conv(256, 512, "conv4_1")
        add_conv(512, 512, "conv4_2")
        add_pool("pool4")

        # Block 5: 2x2 -> 1x1
        add_conv(512, 512, "conv5_1")
        add_conv(512, 512, "conv5_2")
        add_pool("pool5")

        # Classifier on the flattened 1x1x512 feature map. The hidden width
        # is 512 instead of the original 4096 to keep the model small.
        stack.append(Flatten())
        stack.append(Dense(input_dim=1*1*512, output_dim=512, name="dense1"))
        stack.append(ReLU())
        stack.append(Dense(input_dim=512, output_dim=10, name="dense_out"))
        stack.append(Softmax())

        self.layers = stack
        self.param_layers = [
            layer for layer in self.layers
            if hasattr(layer, 'params') and layer.params
        ]

    def forward(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def backward(self, grad):
        """Back-propagate ``grad`` through the layers in reverse order."""
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def save_model(self, file_path):
        """Pickle a ``{layer.name: layer.params}`` mapping to ``file_path``."""
        snapshot = {}
        for layer in self.param_layers:
            snapshot[layer.name] = layer.params
        with open(file_path, 'wb') as f:
            pickle.dump(snapshot, f)
        print(f"模型已保存至 {file_path}")

    def load_model(self, file_path):
        """Replace each layer's ``params`` with the entry stored under its name.

        Layers whose name is absent from the file keep their current values.

        NOTE: ``pickle.load`` can execute arbitrary code; only load files
        that come from a trusted source.
        """
        with open(file_path, 'rb') as f:
            stored = pickle.load(f)
        for layer in self.param_layers:
            if layer.name not in stored:
                continue
            layer.params = stored[layer.name]
        print(f"模型已从 {file_path} 加载")

# =============================================================================
# 4. 训练过程 (Training Process)
# =============================================================================

if __name__ == '__main__':
    # --- Data loading and preprocessing (via scikit-learn) ---
    print("正在加载 CIFAR-10 数据集...")
    try:
        from sklearn.datasets import fetch_openml
    except ImportError:
        print("请安装 scikit-learn 以加载数据集: pip install scikit-learn")
        # `exit()` is a helper injected by the `site` module for interactive
        # use and is not meant for programs; raise SystemExit directly and
        # report the missing dependency with a non-zero status.
        raise SystemExit(1)

    cifar = fetch_openml('CIFAR_10', version=1, as_frame=False, parser='auto')
    X, y = cifar.data, cifar.target

    # Scale pixels by 1/255 and lay the flat rows out as (N, 3, 32, 32).
    # NOTE(review): assumes raw values are 0..255 and stored channel-first -
    # confirm against the OpenML dataset description.
    X = X.astype('float32') / 255.0
    y = y.astype(np.uint8)
    X = X.reshape(-1, 3, 32, 32)

    x_train, x_test = X[:50000], X[50000:60000]
    y_train, y_test = y[:50000], y[50000:60000]

    y_train_one_hot = to_categorical_numpy(y_train, 10)

    # VGG is large and this NumPy implementation is slow, so only a small
    # slice of the training set is used to let the demo finish quickly.
    num_train_samples = 512  # adjustable; larger means slower
    print(f"!!! 警告: 为快速演示，仅使用 {num_train_samples} 个样本进行训练。!!!")
    x_train, y_train_one_hot = x_train[:num_train_samples], y_train_one_hot[:num_train_samples]

    print("数据加载和预处理完成。")
    print("训练集图像形状:", x_train.shape)
    print("训练集标签形状:", y_train_one_hot.shape)

    # --- Model setup ---
    model = VGG11()
    loss_fn = CrossEntropyLoss()
    optimizer = SGD(model.param_layers, learning_rate=0.001)  # small learning rate for VGG

    # --- Training hyper-parameters ---
    epochs = 1  # more epochs noticeably increase the waiting time
    batch_size = 16  # small batches keep memory usage down
    # Integer division: samples beyond the last full batch are not used.
    num_batches = x_train.shape[0] // batch_size

    print(f"\n开始训练... (Epochs={epochs}, Batch Size={batch_size})")
    # --- Training loop ---
    for epoch in range(epochs):
        start_time = time.time()
        epoch_loss = 0.0

        # Reshuffle the training subset at the start of every epoch.
        permutation = np.random.permutation(x_train.shape[0])
        x_train_shuffled = x_train[permutation]
        y_train_shuffled = y_train_one_hot[permutation]

        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = start_idx + batch_size
            x_batch = x_train_shuffled[start_idx:end_idx]
            y_batch = y_train_shuffled[start_idx:end_idx]

            # Forward pass and loss.
            y_pred = model.forward(x_batch)
            loss = loss_fn.forward(y_pred, y_batch)
            epoch_loss += loss

            # Backward pass, then parameter update.
            grad = loss_fn.backward()
            model.backward(grad)

            optimizer.step()

            print(f"    Epoch {epoch+1}, 批次 {i+1}/{num_batches} - 当前损失: {loss:.4f}")

        end_time = time.time()
        avg_loss = epoch_loss / num_batches
        print(f"** Epoch {epoch+1}/{epochs} - 平均损失: {avg_loss:.4f} - 耗时: {end_time - start_time:.2f}s **")

    print("\n训练完成。")

    # --- Evaluation ---
    print("\n正在评估模型...")
    # Evaluate on a subset of the test split to save time.
    num_test_samples = 200
    x_test_subset = x_test[:num_test_samples]
    y_test_subset_labels = y_test[:num_test_samples]

    y_test_pred_probs = model.forward(x_test_subset)
    y_test_pred_labels = np.argmax(y_test_pred_probs, axis=1)

    accuracy = np.mean(y_test_pred_labels == y_test_subset_labels)
    print(f"在 {num_test_samples} 个测试样本上的准确率: {accuracy * 100:.2f}%")

    model.save_model('vgg11_numpy.pkl')

正在加载 CIFAR-10 数据集...
!!! 警告: 为快速演示，仅使用 512 个样本进行训练。!!!
数据加载和预处理完成。
训练集图像形状: (512, 3, 32, 32)
训练集标签形状: (512, 10)

开始训练... (Epochs=1, Batch Size=16)
    Epoch 1, 批次 1/32 - 当前损失: 2.4863
    Epoch 1, 批次 2/32 - 当前损失: 2.6632
    Epoch 1, 批次 3/32 - 当前损失: 2.4926
    Epoch 1, 批次 4/32 - 当前损失: 2.3549
    Epoch 1, 批次 5/32 - 当前损失: 2.2613
    Epoch 1, 批次 6/32 - 当前损失: 2.4613
    Epoch 1, 批次 7/32 - 当前损失: 2.2503
    Epoch 1, 批次 8/32 - 当前损失: 2.3767
    Epoch 1, 批次 9/32 - 当前损失: 2.3326
    Epoch 1, 批次 10/32 - 当前损失: 2.3408
    Epoch 1, 批次 11/32 - 当前损失: 2.2883
    Epoch 1, 批次 12/32 - 当前损失: 2.3356
    Epoch 1, 批次 13/32 - 当前损失: 2.3310
    Epoch 1, 批次 14/32 - 当前损失: 2.2095
    Epoch 1, 批次 15/32 - 当前损失: 2.2351
    Epoch 1, 批次 16/32 - 当前损失: 2.2691
    Epoch 1, 批次 17/32 - 当前损失: 2.4499
    Epoch 1, 批次 18/32 - 当前损失: 2.2512
    Epoch 1, 批次 19/32 - 当前损失: 2.2001
    Epoch 1, 批次 20/32 - 当前损失: 2.4192
    Epoch 1, 批次 21/32 - 当前损失: 2.2683
    Epoch 1, 批次 22/32 - 当前损失: 2.2919
    Epoch 1, 批次 23/32 - 当前损失: 2.2672
    Epoch 1

In [10]:
# GoogLeNet(Inception)
import numpy as np
import time
import pickle

# =============================================================================
# 0. 工具函数 (Utility Functions)
# =============================================================================
# (这部分代码无变化，保持不变)
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    """Return index arrays (k, i, j) selecting every sliding window.

    Args:
        x_shape: 4-tuple (N, C, H, W) of the un-padded input.
        field_height: Window height.
        field_width: Window width.
        padding: Padding assumed on each spatial border.
        stride: Window step.

    Returns:
        ``k`` (channel indices, one column) and ``i`` / ``j`` (row / column
        indices into the padded input, one column per window position).
    """
    _, C, H, W = x_shape
    out_height = (H + 2 * padding - field_height) // stride + 1
    out_width = (W + 2 * padding - field_width) // stride + 1

    # Row component: offset inside the window plus the window's top edge.
    rows_within = np.repeat(np.arange(field_height), field_width)
    rows_within = np.tile(rows_within, C)
    rows_origin = stride * np.repeat(np.arange(out_height), out_width)
    i = rows_within.reshape(-1, 1) + rows_origin.reshape(1, -1)

    # Column component: offset inside the window plus the window's left edge.
    cols_within = np.tile(np.arange(field_width), field_height * C)
    cols_origin = stride * np.tile(np.arange(out_width), out_height)
    j = cols_within.reshape(-1, 1) + cols_origin.reshape(1, -1)

    k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1)
    return (k, i, j)

def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """Turn every sliding window of ``x`` into one column of a 2-D matrix.

    Args:
        x: Array of shape (N, C, H, W).
        field_height: Window height.
        field_width: Window width.
        padding: Zero padding added to both spatial borders.
        stride: Window step.

    Returns:
        Matrix with ``field_height * field_width * C`` rows.
    """
    border = (padding, padding)
    x_padded = np.pad(x, ((0, 0), (0, 0), border, border), mode='constant')
    k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride)
    windows = x_padded[:, k, i, j]
    channels = x.shape[1]
    windows = windows.transpose(1, 2, 0)
    return windows.reshape(field_height * field_width * channels, -1)

def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
    """Fold a column matrix back into an array of shape ``x_shape``.

    Contributions that land on the same location are added together.

    Args:
        cols: Column matrix in the layout produced by ``im2col_indices``.
        x_shape: Target shape (N, C, H, W).
        field_height: Window height.
        field_width: Window width.
        padding: Padding used when the columns were created.
        stride: Stride used when the columns were created.

    Returns:
        Array of shape ``x_shape`` and dtype ``cols.dtype``.
    """
    N, C, H, W = x_shape
    canvas = np.zeros((N, C, H + 2 * padding, W + 2 * padding), dtype=cols.dtype)
    k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride)
    stacked = cols.reshape(C * field_height * field_width, -1, N)
    stacked = stacked.transpose(2, 0, 1)
    # Unbuffered add so repeated indices accumulate instead of overwriting.
    np.add.at(canvas, (slice(None), k, i, j), stacked)
    if padding == 0:
        return canvas
    inner = slice(padding, -padding)
    return canvas[:, :, inner, inner]

def to_categorical_numpy(y, num_classes=10):
    """One-hot encode a NumPy vector of class labels.

    Args:
        y: Array of labels; values are cast to ``int``.
        num_classes: Width of the encoding.

    Returns:
        Float array of shape (len(y), num_classes).
    """
    class_ids = y.astype(int)
    count = len(class_ids)
    encoded = np.zeros((count, num_classes))
    encoded[np.arange(count), class_ids] = 1
    return encoded

# =============================================================================
# 1. 基础层定义 (Layer Definitions)
# =============================================================================
class Layer:
    """Base class for layers: owns ``params`` / ``grads`` dicts."""

    def __init__(self):
        self.params = {}
        self.grads = {}

    def forward(self, inputs):
        """Compute the layer output; must be overridden."""
        raise NotImplementedError

    def backward(self, grad_out):
        """Propagate the gradient to the input; must be overridden."""
        raise NotImplementedError

class ReLU(Layer):
    """Element-wise rectified linear unit."""

    def forward(self, x):
        """Return ``max(0, x)`` element-wise and remember ``x``."""
        self.cache = x
        return np.maximum(0, x)

    def backward(self, grad_out):
        """Zero the gradient wherever the cached input was not positive."""
        active = self.cache > 0
        return grad_out * active

class Softmax(Layer):
    """Row-wise softmax over axis 1."""

    def forward(self, x):
        """Return softmax probabilities; the result is also kept in ``cache``."""
        # Shift by the row maximum for numerical stability.
        row_max = np.max(x, axis=1, keepdims=True)
        exps = np.exp(x - row_max)
        self.cache = exps / np.sum(exps, axis=1, keepdims=True)
        return self.cache

    def backward(self, grad_out):
        """Return ``grad_out`` unchanged.

        NOTE(review): presumably the loss already supplies the gradient with
        respect to the logits - confirm against the loss used with this layer.
        """
        return grad_out

class Dense(Layer):
    """Fully connected layer computing ``x @ W + b``.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output features.
        name: Identifier stored on the layer.
    """

    def __init__(self, input_dim, output_dim, name):
        super().__init__()
        self.name = name
        scale = np.sqrt(2. / input_dim)  # He initialisation
        self.params['W'] = np.random.randn(input_dim, output_dim) * scale
        self.params['b'] = np.zeros(output_dim)

    def forward(self, x):
        """Apply the affine map and remember ``x`` for the backward pass."""
        self.cache = x
        return x @ self.params['W'] + self.params['b']

    def backward(self, grad_out):
        """Store gradients for W and b; return the gradient w.r.t. the input."""
        inputs = self.cache
        self.grads['W'] = inputs.T @ grad_out
        self.grads['b'] = np.sum(grad_out, axis=0)
        return grad_out @ self.params['W'].T

class Flatten(Layer):
    """Collapse every axis after the first into a single axis."""

    def forward(self, x):
        """Reshape ``x`` to (x.shape[0], -1); the input shape is cached."""
        self.cache = x.shape
        return x.reshape(x.shape[0], -1)

    def backward(self, grad_out):
        """Reshape the gradient back to the cached input shape."""
        return grad_out.reshape(self.cache)

class Conv2D(Layer):
    """2-D convolution with square kernels, implemented via im2col.

    Args:
        in_c: Number of input channels.
        out_c: Number of filters.
        kernel: Side length of the square kernel.
        stride: Window step.
        padding: Zero padding on each spatial border.
        name: Identifier stored on the layer.
    """

    def __init__(self, in_c, out_c, kernel, stride=1, padding=0, name=""):
        super().__init__()
        self.name = name
        self.stride = stride
        self.padding = padding
        self.kernel = kernel
        self.out_c = out_c
        fan_in = in_c * kernel * kernel
        # He initialisation
        self.params['W'] = np.random.randn(out_c, in_c, kernel, kernel) * np.sqrt(2./fan_in)
        self.params['b'] = np.zeros(out_c)

    def _weight_matrix(self):
        """Return the filters flattened to shape (out_c, in_c * kernel * kernel)."""
        return self.params['W'].reshape(self.out_c, -1)

    def forward(self, x):
        """Convolve ``x`` of shape (N, C, H, W); returns (N, out_c, h_out, w_out)."""
        N, _, H, W = x.shape
        span = 2 * self.padding - self.kernel
        h_out = (H + span) // self.stride + 1
        w_out = (W + span) // self.stride + 1
        # The unfolded input is kept because backward() needs it for dW.
        self.x_col = im2col_indices(x, self.kernel, self.kernel, self.padding, self.stride)
        out = (self._weight_matrix() @ self.x_col) + self.params['b'].reshape(-1, 1)
        self.cache = x
        return out.reshape(self.out_c, h_out, w_out, N).transpose(3, 0, 1, 2)

    def backward(self, grad_out):
        """Store gradients for W and b; return the gradient w.r.t. the input."""
        self.grads['b'] = np.sum(grad_out, axis=(0, 2, 3))
        grad_matrix = grad_out.transpose(1, 2, 3, 0).reshape(self.out_c, -1)
        self.grads['W'] = (grad_matrix @ self.x_col.T).reshape(self.params['W'].shape)
        dx_col = self._weight_matrix().T @ grad_matrix
        return col2im_indices(dx_col, self.cache.shape, self.kernel, self.kernel, self.padding, self.stride)

class MaxPool2D(Layer):
    """Max pooling over square windows, implemented via im2col.

    Args:
        size: Side length of the pooling window.
        stride: Window step.
        name: Identifier stored on the layer.
    """

    def __init__(self, size, stride, name=""):
        super().__init__()
        self.name = name
        self.size = size
        self.stride = stride

    def forward(self, x):
        """Pool ``x`` of shape (N, C, H, W); returns (N, C, h_out, w_out)."""
        N, C, H, W = x.shape
        h_out = (H - self.size) // self.stride + 1
        w_out = (W - self.size) // self.stride + 1
        # Fold channels into the batch axis so each channel is pooled alone.
        per_channel = x.reshape(N * C, 1, H, W)
        self.x_col = im2col_indices(per_channel, self.size, self.size, 0, self.stride)
        max_idx = np.argmax(self.x_col, axis=0)
        window_ids = np.arange(self.x_col.shape[1])
        pooled = self.x_col[max_idx, window_ids]
        self.cache = (x, max_idx)
        return pooled.reshape(h_out, w_out, N, C).transpose(2, 3, 0, 1)

    def backward(self, grad_out):
        """Route each output gradient to the position that held the maximum."""
        x, max_idx = self.cache
        N, C, H, W = x.shape
        dx_col = np.zeros_like(self.x_col)
        window_ids = np.arange(self.x_col.shape[1])
        dx_col[max_idx, window_ids] = grad_out.transpose(2, 3, 0, 1).ravel()
        dx = col2im_indices(dx_col, (N*C, 1, H, W), self.size, self.size, 0, self.stride)
        return dx.reshape(x.shape)

class AvgPool2D(Layer):
    """Average pooling over square windows, implemented via im2col.

    Args:
        size: Side length of the pooling window.
        stride: Window step.
        name: Identifier stored on the layer.
    """

    def __init__(self, size, stride, name=""):
        super().__init__()
        self.name = name
        self.size = size
        self.stride = stride
        self.cache = None  # input of the most recent forward pass

    def _output_hw(self, H, W):
        """Return (h_out, w_out) for an input of spatial size (H, W)."""
        h_out = (H - self.size) // self.stride + 1
        w_out = (W - self.size) // self.stride + 1
        return h_out, w_out

    def forward(self, x):
        """Average ``x`` of shape (N, C, H, W); returns (N, C, h_out, w_out)."""
        self.cache = x
        N, C, H, W = x.shape
        h_out, w_out = self._output_hw(H, W)
        # Fold channels into the batch axis so each channel is pooled alone.
        per_channel = x.reshape(N * C, 1, H, W)
        x_col = im2col_indices(per_channel, self.size, self.size, 0, self.stride)
        averaged = np.mean(x_col, axis=0)
        return averaged.reshape(h_out, w_out, N, C).transpose(2, 3, 0, 1)

    def backward(self, grad_out):
        """Spread each output gradient evenly over its pooling window."""
        x = self.cache
        N, C, H, W = x.shape
        h_out, w_out = self._output_hw(H, W)
        pool_area = self.size * self.size
        # Every cell of a window receives grad / pool_area.
        share = grad_out.transpose(2, 3, 0, 1).ravel() / pool_area
        dx_col = np.ones((pool_area, N * C * h_out * w_out))
        dx_col *= share
        dx = col2im_indices(dx_col, (N * C, 1, H, W), self.size, self.size, 0, self.stride)
        return dx.reshape(x.shape)

# =============================================================================
# 2. Inception 模块定义 (无变化)
# =============================================================================
class InceptionModule(Layer):
    """Four-branch Inception block; branch outputs are concatenated on axis 1.

    Branches (each keeps the spatial size of the input):
        1. 1x1 conv
        2. 1x1 conv -> ReLU -> 3x3 conv (padding 1)
        3. 1x1 conv -> ReLU -> 5x5 conv (padding 2)
        4. 3x3 max-pool (stride 1, input padded by 1) -> 1x1 conv

    Args:
        in_c: Number of input channels.
        c_1x1: Output channels of branch 1.
        c_3x3_r: Channels of the 1x1 reduction in branch 2.
        c_3x3: Output channels of branch 2.
        c_5x5_r: Channels of the 1x1 reduction in branch 3.
        c_5x5: Output channels of branch 3.
        c_pool: Output channels of branch 4.
        name: Prefix used for the names of the sub-layers.

    Attributes:
        layers: All sub-layers, in construction order.
        channel_counts: Output channels of the four branches, used to split
            the incoming gradient in ``backward``.
    """

    def __init__(self, in_c, c_1x1, c_3x3_r, c_3x3, c_5x5_r, c_5x5, c_pool, name=""):
        super().__init__()
        self.name = name
        # Branch 1
        self.b1_conv = Conv2D(in_c, c_1x1, 1, name=f"{name}_b1_conv")
        # Branch 2
        self.b2_conv_r = Conv2D(in_c, c_3x3_r, 1, name=f"{name}_b2_conv_r")
        self.b2_relu_r = ReLU()
        self.b2_conv = Conv2D(c_3x3_r, c_3x3, 3, padding=1, name=f"{name}_b2_conv")
        # Branch 3
        self.b3_conv_r = Conv2D(in_c, c_5x5_r, 1, name=f"{name}_b3_conv_r")
        self.b3_relu_r = ReLU()
        self.b3_conv = Conv2D(c_5x5_r, c_5x5, 5, padding=2, name=f"{name}_b3_conv")
        # Branch 4
        self.b4_pool = MaxPool2D(3, stride=1, name=f"{name}_b4_pool")
        self.b4_conv = Conv2D(in_c, c_pool, 1, name=f"{name}_b4_conv")

        self.layers = [
            self.b1_conv,
            self.b2_conv_r, self.b2_relu_r, self.b2_conv,
            self.b3_conv_r, self.b3_relu_r, self.b3_conv,
            self.b4_pool, self.b4_conv,
        ]
        self.channel_counts = [c_1x1, c_3x3, c_5x5, c_pool]

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate along channels.

        ``x`` is expected to be a floating-point array of shape (N, C, H, W).
        """
        out1 = self.b1_conv.forward(x)

        t = self.b2_conv_r.forward(x)
        t = self.b2_relu_r.forward(t)
        out2 = self.b2_conv.forward(t)

        t = self.b3_conv_r.forward(x)
        t = self.b3_relu_r.forward(t)
        out3 = self.b3_conv.forward(t)

        # Pad with -inf, not 0: a zero border would win the max whenever a
        # border window holds only negative activations, producing a wrong
        # output and sending the gradient into the padding. Every 3x3 window
        # contains at least one real pixel, so -inf is never selected.
        x_pad = np.pad(x, ((0, 0), (0, 0), (1, 1), (1, 1)), 'constant',
                       constant_values=-np.inf)
        t = self.b4_pool.forward(x_pad)
        out4 = self.b4_conv.forward(t)

        return np.concatenate([out1, out2, out3, out4], axis=1)

    def backward(self, grad_out):
        """Split ``grad_out`` per branch, back-propagate, and sum the results."""
        c1, c2, c3, _ = self.channel_counts
        end1, end2, end3 = c1, c1 + c2, c1 + c2 + c3
        grad1 = grad_out[:, :end1, :, :]
        grad2 = grad_out[:, end1:end2, :, :]
        grad3 = grad_out[:, end2:end3, :, :]
        grad4 = grad_out[:, end3:, :, :]

        dx1 = self.b1_conv.backward(grad1)

        t = self.b2_conv.backward(grad2)
        t = self.b2_relu_r.backward(t)
        dx2 = self.b2_conv_r.backward(t)

        t = self.b3_conv.backward(grad3)
        t = self.b3_relu_r.backward(t)
        dx3 = self.b3_conv_r.backward(t)

        t = self.b4_conv.backward(grad4)
        dx4_pad = self.b4_pool.backward(t)
        # Drop the one-pixel border that forward() added before pooling.
        dx4 = dx4_pad[:, :, 1:-1, 1:-1]

        # The same input feeds every branch, so the gradients add up.
        return dx1 + dx2 + dx3 + dx4

# =============================================================================
# 3. GoogLeNet 模型定义 (无变化)
# =============================================================================
def get_param_layers(layers):
    """Collect every layer that owns a non-empty ``params`` dict.

    ``InceptionModule`` instances are expanded recursively through their
    ``layers`` attribute instead of being returned themselves.

    Args:
        layers: Iterable of layer objects.

    Returns:
        Flat list of layers with trainable parameters, in traversal order.
    """
    collected = []
    for layer in layers:
        if isinstance(layer, InceptionModule):
            collected += get_param_layers(layer.layers)
            continue
        if hasattr(layer, 'params') and layer.params:
            collected.append(layer)
    return collected

class GoogLeNet:
    """Small GoogLeNet-style network: conv stem, two Inception modules, classifier.

    Attributes:
        stem: Two 3x3 conv + ReLU pairs followed by a 3x3 / stride-2 max-pool.
        inception_trunk: Two Inception modules followed by a max-pool.
        classifier: 7x7 average pool, Flatten, Dense(224, 10), Softmax.
        layers: Concatenation of the three parts, in execution order.
        param_layers: Layers with trainable parameters (Inception sub-layers
            included).
    """

    def __init__(self):
        self.stem = [
            Conv2D(3, 32, 3, padding=1, name="stem_conv1"),
            ReLU(),
            Conv2D(32, 64, 3, padding=1, name="stem_conv2"),
            ReLU(),
            MaxPool2D(3, stride=2, name="stem_pool"),
        ]
        self.inception_trunk = [
            # 64 -> 32 + 64 + 16 + 16 = 128 channels
            InceptionModule(64,  32, 16, 64, 8, 16, 16, name="incep1"),
            # 128 -> 64 + 96 + 32 + 32 = 224 channels
            InceptionModule(128, 64, 32, 96, 16, 32, 32, name="incep2"),
            MaxPool2D(3, stride=2, name="pool2"),
        ]
        self.classifier = [
            AvgPool2D(7, stride=1, name="final_pool"),
            Flatten(),
            Dense(224, 10, name="final_dense"),
            Softmax(),
        ]
        self.layers = self.stem + self.inception_trunk + self.classifier
        self.param_layers = get_param_layers(self.layers)

    def forward(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def backward(self, grad):
        """Back-propagate ``grad`` through the layers in reverse order."""
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

# =============================================================================
# 4. 损失函数和优化器 (无变化)
# =============================================================================
class CrossEntropyLoss:
    """Mean cross-entropy between predicted probabilities and one-hot targets."""

    def forward(self, y_pred, y_true):
        """Return the loss averaged over the first axis of ``y_true``.

        Both arguments are stored on the instance for ``backward``. A small
        constant inside the logarithm avoids ``log(0)``.
        """
        self.y_pred = y_pred
        self.y_true = y_true
        m = y_true.shape[0]
        weighted_log = y_true * np.log(y_pred + 1e-12)
        return -np.sum(weighted_log) / m

    def backward(self):
        """Return ``(y_pred - y_true) / m`` for the stored pair."""
        m = self.y_true.shape[0]
        difference = self.y_pred - self.y_true
        return difference / m

class SGD:
    """Plain stochastic gradient descent.

    Args:
        layers: Iterable of layer objects exposing ``params`` and ``grads``.
        learning_rate: Step size applied to every gradient.
    """

    def __init__(self, layers, learning_rate=0.01):
        self.layers = layers
        self.lr = learning_rate

    def step(self):
        """Apply ``p -= lr * grad`` in place for every entry of ``grads``.

        Layers without a ``params`` attribute or with an empty one are skipped.
        """
        for layer in self.layers:
            if not (hasattr(layer, 'params') and layer.params):
                continue
            for key, gradient in layer.grads.items():
                layer.params[key] -= self.lr * gradient

# =============================================================================
# 5. 训练过程 (无变化)
# =============================================================================
if __name__ == '__main__':
    # --- Data loading and preprocessing (via scikit-learn) ---
    print("正在加载 CIFAR-10 数据集...")
    try:
        from sklearn.datasets import fetch_openml
    except ImportError:
        print("请安装 scikit-learn: pip install scikit-learn")
        # `exit()` is a helper injected by the `site` module for interactive
        # use and is not meant for programs; raise SystemExit directly and
        # report the missing dependency with a non-zero status.
        raise SystemExit(1)

    cifar = fetch_openml('CIFAR_10', version=1, as_frame=False, parser='auto')
    # Scale pixels by 1/255 and lay the flat rows out as (N, 3, 32, 32).
    # NOTE(review): assumes raw values are 0..255 and stored channel-first -
    # confirm against the OpenML dataset description.
    X, y = cifar.data.astype('float32') / 255.0, cifar.target.astype(np.uint8)
    X = X.reshape(-1, 3, 32, 32)

    x_train, x_test = X[:50000], X[50000:60000]
    y_train, y_test = y[:50000], y[50000:60000]
    y_train_one_hot = to_categorical_numpy(y_train, 10)

    # Only a tiny slice of the training set is used so the demo ends quickly.
    num_train_samples = 64
    print(f"!!! 警告: 为快速演示，仅使用 {num_train_samples} 个样本进行训练。!!!")
    x_train, y_train_one_hot = x_train[:num_train_samples], y_train_one_hot[:num_train_samples]

    print("数据加载和预处理完成。")

    # --- Model setup ---
    model = GoogLeNet()
    loss_fn = CrossEntropyLoss()
    optimizer = SGD(model.param_layers, learning_rate=0.001)

    # --- Training hyper-parameters ---
    epochs = 1
    batch_size = 8
    # Integer division: samples beyond the last full batch are not used.
    num_batches = x_train.shape[0] // batch_size

    print(f"\n开始训练... (Epochs={epochs}, Batch Size={batch_size})")
    # --- Training loop ---
    for epoch in range(epochs):
        start_time = time.time()
        epoch_loss = 0.0

        # Reshuffle the training subset at the start of every epoch.
        permutation = np.random.permutation(x_train.shape[0])
        x_train_shuffled = x_train[permutation]
        y_train_shuffled = y_train_one_hot[permutation]

        for i in range(num_batches):
            batch_start_time = time.time()
            start_idx, end_idx = i * batch_size, (i+1) * batch_size
            x_batch, y_batch = x_train_shuffled[start_idx:end_idx], y_train_shuffled[start_idx:end_idx]

            # Forward pass, loss, backward pass, parameter update.
            y_pred = model.forward(x_batch)
            loss = loss_fn.forward(y_pred, y_batch)
            epoch_loss += loss
            grad = loss_fn.backward()
            model.backward(grad)
            optimizer.step()

            print(f"    Epoch {epoch+1}, 批次 {i+1}/{num_batches} - 损失: {loss:.4f} - 耗时: {time.time() - batch_start_time:.2f}s")

        avg_loss = epoch_loss / num_batches
        print(f"** Epoch {epoch+1}/{epochs} - 平均损失: {avg_loss:.4f} - 总耗时: {time.time() - start_time:.2f}s **")

    print("\n训练完成。")

    # --- Evaluation on a small test subset ---
    print("\n正在评估模型...")
    num_test_samples = 50
    x_test_subset, y_test_subset_labels = x_test[:num_test_samples], y_test[:num_test_samples]
    y_test_pred_probs = model.forward(x_test_subset)
    y_test_pred_labels = np.argmax(y_test_pred_probs, axis=1)
    accuracy = np.mean(y_test_pred_labels == y_test_subset_labels)
    print(f"在 {num_test_samples} 个测试样本上的准确率: {accuracy * 100:.2f}%")

正在加载 CIFAR-10 数据集...
!!! 警告: 为快速演示，仅使用 64 个样本进行训练。!!!
数据加载和预处理完成。

开始训练... (Epochs=1, Batch Size=8)
    Epoch 1, 批次 1/8 - 损失: 3.1695 - 耗时: 0.24s
    Epoch 1, 批次 2/8 - 损失: 3.2050 - 耗时: 0.24s
    Epoch 1, 批次 3/8 - 损失: 2.4973 - 耗时: 0.23s
    Epoch 1, 批次 4/8 - 损失: 2.8459 - 耗时: 0.25s
    Epoch 1, 批次 5/8 - 损失: 3.1532 - 耗时: 0.23s
    Epoch 1, 批次 6/8 - 损失: 3.4507 - 耗时: 0.25s
    Epoch 1, 批次 7/8 - 损失: 2.9522 - 耗时: 0.24s
    Epoch 1, 批次 8/8 - 损失: 3.2886 - 耗时: 0.25s
** Epoch 1/1 - 平均损失: 3.0703 - 总耗时: 1.94s **

训练完成。

正在评估模型...
在 50 个测试样本上的准确率: 8.00%


### ResNet

Hypothesis: the problem is an optimization problem; deeper models are harder to optimize.
- The deeper model should be able to perform at least as well as the shallower model.
- A solution by construction is copying the learned layers from the shallower model and setting additional layers to identity mapping.

Direct mappings are hard to learn. So instead of learning the mapping between a layer's input and its output, learn the difference between them, i.e. learn the residual.

![ResNet](images/image3-1.png)

Experimental Results:
- Able to train very deep networks without degradation (152 layers on ImageNet, 1202 on CIFAR).
- Deeper networks now achieve lower training error, as expected.
- Swept 1st place in all ILSVRC and COCO 2015 competitions.

In [11]:
# ResNet
import numpy as np
import time
import pickle

# =============================================================================
# 0. 工具函数 (Utility Functions)
# =============================================================================
# (复用之前的函数)
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    """Compute the (k, i, j) index arrays used to gather sliding windows.

    Args:
        x_shape: 4-tuple (N, C, H, W) of the un-padded input.
        field_height: Window height.
        field_width: Window width.
        padding: Padding assumed on each spatial border.
        stride: Window step.

    Returns:
        Tuple ``(k, i, j)``: channel, row and column indices into the padded
        input. ``k`` is a single column; ``i`` and ``j`` have one column per
        window position.
    """
    _, n_channels, in_h, in_w = x_shape
    out_height = (in_h + 2 * padding - field_height) // stride + 1
    out_width = (in_w + 2 * padding - field_width) // stride + 1
    window_area = field_height * field_width

    # Offsets inside a single window (tiled over channels) ...
    di = np.tile(np.repeat(np.arange(field_height), field_width), n_channels)
    dj = np.tile(np.arange(field_width), field_height * n_channels)
    # ... plus the origin of each window in the padded input.
    oi = stride * np.repeat(np.arange(out_height), out_width)
    oj = stride * np.tile(np.arange(out_width), out_height)

    i = di.reshape(-1, 1) + oi.reshape(1, -1)
    j = dj.reshape(-1, 1) + oj.reshape(1, -1)
    k = np.repeat(np.arange(n_channels), window_area).reshape(-1, 1)
    return (k, i, j)

def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """Rearrange the sliding windows of ``x`` into matrix columns.

    Args:
        x: Array of shape (N, C, H, W).
        field_height: Window height.
        field_width: Window width.
        padding: Zero padding added to both spatial borders.
        stride: Window step.

    Returns:
        Matrix with ``field_height * field_width * C`` rows.
    """
    spatial_pad = (padding, padding)
    x_padded = np.pad(x, ((0, 0), (0, 0), spatial_pad, spatial_pad), mode='constant')
    k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride)
    gathered = x_padded[:, k, i, j]
    rows = field_height * field_width * x.shape[1]
    return gathered.transpose(1, 2, 0).reshape(rows, -1)

def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
    """Scatter-add im2col columns back into an image-shaped array.

    Args:
        cols: 2-D array laid out like the output of ``im2col_indices``.
        x_shape: ``(N, C, H, W)`` of the unpadded target array.
        field_height: Window height used when the columns were built.
        field_width: Window width used when the columns were built.
        padding: Padding used when the columns were built.
        stride: Stride used when the columns were built.

    Returns:
        Array of shape ``x_shape`` in which contributions of overlapping
        windows have been summed.
    """
    n_imgs, channels, height, width = x_shape
    padded_shape = (n_imgs, channels, height + 2 * padding, width + 2 * padding)
    canvas = np.zeros(padded_shape, dtype=cols.dtype)

    chan_idx, row_idx, col_idx = get_im2col_indices(
        x_shape, field_height, field_width, padding, stride
    )

    # Undo the (rows, positions, N) flattening done by im2col, then put N first.
    per_image = cols.reshape(channels * field_height * field_width, -1, n_imgs)
    per_image = per_image.transpose(2, 0, 1)

    # np.add.at accumulates repeated indices, which plain fancy assignment would not.
    np.add.at(canvas, (slice(None), chan_idx, row_idx, col_idx), per_image)

    if padding != 0:
        return canvas[:, :, padding:-padding, padding:-padding]
    return canvas

def to_categorical_numpy(y, num_classes=10):
    """One-hot encode a vector of class labels.

    Args:
        y: NumPy array of labels; values are cast to ``int``.
        num_classes: Width of the one-hot encoding.

    Returns:
        Float array of shape ``(len(y), num_classes)`` with a single 1 per row.
    """
    labels = y.astype(int)
    count = len(labels)
    encoded = np.zeros((count, num_classes))
    rows = np.arange(count)
    encoded[rows, labels] = 1
    return encoded

# =============================================================================
# 1. 基础层定义 (Layer Definitions)
# =============================================================================
# (复用之前已经验证过的基础层)
class Layer:
    """Base class for network layers.

    Every layer owns a ``params`` dict and a ``grads`` dict (both empty by
    default); subclasses must override ``forward`` and ``backward``.
    """

    def __init__(self):
        self.params = {}
        self.grads = {}

    def forward(self, inputs):
        """Compute the layer output; must be overridden."""
        raise NotImplementedError

    def backward(self, grad_out):
        """Propagate ``grad_out`` to the layer input; must be overridden."""
        raise NotImplementedError

class ReLU(Layer):
    """Element-wise rectified linear unit."""

    def forward(self, x):
        """Return ``max(0, x)`` and remember ``x`` for the backward pass."""
        self.cache = x
        return np.maximum(0, x)

    def backward(self, grad_out):
        """Pass the gradient only where the cached input was strictly positive."""
        active = self.cache > 0
        return grad_out * active

class Softmax(Layer):
    """Row-wise softmax over axis 1."""

    def forward(self, x):
        """Return softmax probabilities; the result is also stored in ``self.cache``."""
        # Subtract the row maximum before exponentiating for numerical stability.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=1, keepdims=True)
        self.cache = probs
        return probs

    def backward(self, grad_out):
        """Return ``grad_out`` unchanged.

        ``CrossEntropyLoss.backward`` in this file already returns
        ``(y_pred - y_true) / m``, so no Jacobian is applied here.
        """
        return grad_out

class Dense(Layer):
    """Fully connected layer computing ``x @ W + b``."""

    def __init__(self, input_dim, output_dim, name):
        """Create weights of shape ``(input_dim, output_dim)`` and a zero bias.

        Weights are standard-normal samples scaled by ``sqrt(2 / input_dim)``.
        """
        super().__init__()
        self.name = name
        init_scale = np.sqrt(2. / input_dim)
        self.params['W'] = np.random.randn(input_dim, output_dim) * init_scale
        self.params['b'] = np.zeros(output_dim)

    def forward(self, x):
        """Apply the affine map and cache ``x`` for the backward pass."""
        self.cache = x
        return x @ self.params['W'] + self.params['b']

    def backward(self, grad_out):
        """Store gradients for ``W`` and ``b`` and return the gradient w.r.t. the input."""
        self.grads['W'] = self.cache.T @ grad_out
        self.grads['b'] = np.sum(grad_out, axis=0)
        return grad_out @ self.params['W'].T

class Flatten(Layer):
    """Collapse every axis after the first into one."""

    def forward(self, x):
        """Reshape ``x`` to ``(x.shape[0], -1)`` and remember the original shape."""
        self.cache = x.shape
        leading = x.shape[0]
        return x.reshape(leading, -1)

    def backward(self, grad_out):
        """Reshape the gradient back to the shape seen in ``forward``."""
        original_shape = self.cache
        return grad_out.reshape(original_shape)

class Conv2D(Layer):
    """2-D convolution with a square kernel, implemented via im2col."""

    def __init__(self, in_c, out_c, kernel, stride=1, padding=0, name=""):
        """Create ``out_c`` filters of shape ``(in_c, kernel, kernel)`` and a zero bias.

        Weights are standard-normal samples scaled by
        ``sqrt(2 / (in_c * kernel * kernel))``.
        """
        super().__init__()
        self.name = name
        self.stride = stride
        self.padding = padding
        self.kernel = kernel
        self.out_c = out_c
        init_scale = np.sqrt(2./(in_c*kernel*kernel))
        self.params['W'] = np.random.randn(out_c, in_c, kernel, kernel) * init_scale
        self.params['b'] = np.zeros(out_c)

    def _flat_weights(self):
        """Return the filters as a 2-D ``(out_c, in_c * kernel * kernel)`` matrix."""
        return self.params['W'].reshape(self.out_c, -1)

    def forward(self, x):
        """Convolve ``x`` (indexed ``(N, C, H, W)``) and cache what backward needs."""
        n_imgs, _, height, width = x.shape
        h_out = (height + 2*self.padding - self.kernel)//self.stride + 1
        w_out = (width + 2*self.padding - self.kernel)//self.stride + 1

        self.x_col = im2col_indices(x, self.kernel, self.kernel, self.padding, self.stride)
        bias_col = self.params['b'].reshape(-1, 1)
        scores = (self._flat_weights() @ self.x_col) + bias_col

        # Columns are ordered (h_out, w_out, N); move N to the front.
        result = scores.reshape(self.out_c, h_out, w_out, n_imgs).transpose(3, 0, 1, 2)
        self.cache = x
        return result

    def backward(self, grad_out):
        """Store gradients for ``W`` and ``b`` and return the gradient w.r.t. the input."""
        self.grads['b'] = np.sum(grad_out, axis=(0, 2, 3))

        # Match the column ordering produced in forward.
        grad_cols = grad_out.transpose(1, 2, 3, 0).reshape(self.out_c, -1)
        weight_grad = grad_cols @ self.x_col.T
        self.grads['W'] = weight_grad.reshape(self.params['W'].shape)

        dx_col = self._flat_weights().T @ grad_cols
        return col2im_indices(
            dx_col, self.cache.shape, self.kernel, self.kernel, self.padding, self.stride
        )

class GlobalAvgPool2D(Layer):
    """Global average pooling over the last two axes."""

    def forward(self, x):
        """Average over axes 2 and 3, keeping the first two axes."""
        self.cache = x.shape
        return np.mean(x, axis=(2, 3))

    def backward(self, grad_out):
        """Spread each gradient value evenly over the pooled H x W positions."""
        _, _, height, width = self.cache
        spread = grad_out[:, :, np.newaxis, np.newaxis]
        spread = np.repeat(spread, height, axis=2)
        spread = np.repeat(spread, width, axis=3)
        return spread / (height * width)

# =============================================================================
# 2. 残差块定义 (The Star of the Show: Residual Block)
# =============================================================================
class _IdentityShortcut(Layer):
    """Parameter-free shortcut that passes activations and gradients through unchanged.

    Defined as a real module-level class (rather than lambdas patched onto a
    ``Layer`` instance) so that a model containing it can be pickled.
    """

    def forward(self, x):
        return x

    def backward(self, grad_out):
        return grad_out


class ResidualBlock(Layer):
    """Residual block: ``ReLU(F(x) + shortcut(x))``.

    ``F`` is conv3x3 -> ReLU -> conv3x3. The shortcut is the identity when the
    block keeps stride 1 and the channel count, otherwise a 1x1 convolution
    with the same stride that projects ``x`` to the shape of ``F(x)``.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        stride: Stride of the first convolution (and of the projection shortcut).
        name: Prefix used for the names of the sub-layers.
    """

    def __init__(self, in_channels, out_channels, stride=1, name=""):
        super().__init__()
        self.name = name

        # Main path F(x).
        self.conv1 = Conv2D(in_channels, out_channels, 3, stride=stride, padding=1, name=f"{name}_conv1")
        self.relu1 = ReLU()
        self.conv2 = Conv2D(out_channels, out_channels, 3, stride=1, padding=1, name=f"{name}_conv2")

        # Shortcut path.
        if stride != 1 or in_channels != out_channels:
            # Shapes differ: project with a 1x1 convolution.
            self.shortcut = Conv2D(in_channels, out_channels, 1, stride=stride, name=f"{name}_shortcut")
        else:
            # Shapes match: plain identity mapping.
            self.shortcut = _IdentityShortcut()

        # Activation applied after the addition.
        self.relu2 = ReLU()

        # Sub-layers exposed for parameter collection.
        self.layers = [self.conv1, self.relu1, self.conv2, self.shortcut]

    def forward(self, x):
        """Return ``ReLU(F(x) + shortcut(x))``."""
        # Main path F(x).
        out = self.conv1.forward(x)
        out = self.relu1.forward(out)
        out = self.conv2.forward(out)

        # Shortcut path operates on the untouched block input.
        shortcut_out = self.shortcut.forward(x)

        # Core of the block: F(x) + x.
        out += shortcut_out

        return self.relu2.forward(out)

    def backward(self, grad_out):
        """Back-propagate through both branches and sum their input gradients."""
        # The gradient first passes through the final activation.
        grad = self.relu2.backward(grad_out)

        # The addition fans the same gradient out to both branches.
        grad_shortcut = self.shortcut.backward(grad)

        grad_main = self.conv2.backward(grad)
        grad_main = self.relu1.backward(grad_main)
        grad_main = self.conv1.backward(grad_main)

        return grad_main + grad_shortcut

# =============================================================================
# 3. ResNet 模型定义
# =============================================================================
def get_param_layers(layers):
    """Collect, in order, every layer that owns a non-empty ``params`` dict.

    ``ResidualBlock`` instances are expanded recursively through their
    ``layers`` attribute instead of being returned themselves.
    """
    collected = []
    for candidate in layers:
        if isinstance(candidate, ResidualBlock):
            collected += get_param_layers(candidate.layers)
            continue
        if getattr(candidate, 'params', None):
            collected.append(candidate)
    return collected

class ResNet:
    """A miniature ResNet tailored to CIFAR-10.

    Layout: 3x3 stem convolution -> three stages of two residual blocks
    (16, 32 and 64 channels) -> global average pooling -> dense layer with
    10 outputs -> softmax.
    """

    def _make_layer(self, block, in_channels, out_channels, num_blocks, stride, name):
        """Build one stage of ``num_blocks`` blocks.

        Only the first block may change the stride and channel count; the
        remaining ones keep ``out_channels`` and use stride 1.
        """
        stage = [block(in_channels, out_channels, stride, name=f"{name}_block1")]
        for number in range(2, num_blocks + 1):
            stage.append(block(out_channels, out_channels, 1, name=f"{name}_block{number}"))
        return stage

    def __init__(self):
        # Stem: initial convolution.
        self.stem = [Conv2D(3, 16, 3, padding=1, name="stem_conv")]

        # Residual stages; the trailing sizes are the spatial resolution for 32x32 inputs.
        self.layer1 = self._make_layer(ResidualBlock, 16, 16, 2, stride=1, name="res_layer1")  # 32x32
        self.layer2 = self._make_layer(ResidualBlock, 16, 32, 2, stride=2, name="res_layer2")  # 16x16
        self.layer3 = self._make_layer(ResidualBlock, 32, 64, 2, stride=2, name="res_layer3")  # 8x8

        # Classifier head.
        self.classifier = [
            GlobalAvgPool2D(),
            Flatten(),
            Dense(64, 10, name="final_dense"),
            Softmax(),
        ]

        # Flat, ordered list of everything the forward pass runs through.
        self.layers = self.stem + self.layer1 + self.layer2 + self.layer3 + self.classifier
        self.param_layers = get_param_layers(self.layers)

    def forward(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def backward(self, grad):
        """Run ``grad`` through every layer in reverse order and return the input gradient."""
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad

# =============================================================================
# 4. 损失函数和优化器
# =============================================================================
class CrossEntropyLoss:
    """Mean cross-entropy between predicted probabilities and one-hot targets."""

    def forward(self, y_pred, y_true):
        """Return the loss averaged over ``y_true.shape[0]`` samples.

        Both arguments are stored on the instance for ``backward``. A small
        constant (1e-12) is added inside the log to avoid ``log(0)``.
        """
        self.y_pred = y_pred
        self.y_true = y_true
        batch = y_true.shape[0]
        weighted_logs = y_true * np.log(y_pred + 1e-12)
        return -np.sum(weighted_logs) / batch

    def backward(self):
        """Return ``(y_pred - y_true) / m`` for the values stored by ``forward``."""
        batch = self.y_true.shape[0]
        return (self.y_pred - self.y_true) / batch

class SGD:
    """Plain stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, layers, learning_rate=0.01):
        self.layers = layers
        self.lr = learning_rate

    def step(self):
        """Update, in place, every parameter that has an entry in its layer's ``grads``."""
        for layer in self.layers:
            # Skip objects without parameters (or with an empty params dict).
            if not getattr(layer, 'params', None):
                continue
            for key, gradient in layer.grads.items():
                layer.params[key] -= self.lr * gradient

# =============================================================================
# 5. 训练过程
# =============================================================================
# Script entry point: load CIFAR-10, train the ResNet on a tiny subset for one
# epoch, then report accuracy on a few test images.
if __name__ == '__main__':
    print("正在加载 CIFAR-10 数据集...")
    # scikit-learn is only needed for downloading the dataset.
    # NOTE(review): exit() is the interactive-shell helper; confirm it behaves
    # as intended in the environment this cell runs in.
    try: from sklearn.datasets import fetch_openml
    except ImportError: print("请安装 scikit-learn: pip install scikit-learn"); exit()

    cifar = fetch_openml('CIFAR_10', version=1, as_frame=False, parser='auto')
    # Scale pixel values by 1/255 and convert the labels to small integers.
    X, y = cifar.data.astype('float32') / 255.0, cifar.target.astype(np.uint8)
    # Reshape the flat feature rows to (N, 3, 32, 32).
    X = X.reshape(-1, 3, 32, 32)
    
    # First 50000 rows are used for training, the next 10000 for testing.
    x_train, x_test = X[:50000], X[50000:60000]
    y_train, y_test = y[:50000], y[50000:60000]
    y_train_one_hot = to_categorical_numpy(y_train, 10)
    
    # !!! Strong warning: this ResNet is very expensive to run, so only a tiny subset of the data is used !!!
    num_train_samples = 128
    print(f"!!! 警告: 为快速演示，仅使用 {num_train_samples} 个样本进行训练。!!!")
    x_train, y_train_one_hot = x_train[:num_train_samples], y_train_one_hot[:num_train_samples]

    print("数据加载和预处理完成。")
    model = ResNet()
    loss_fn = CrossEntropyLoss()
    optimizer = SGD(model.param_layers, learning_rate=0.01)

    epochs = 1
    batch_size = 16
    # Integer division: a trailing partial batch (if any) is not used.
    num_batches = x_train.shape[0] // batch_size
    
    print(f"\n开始训练... (Epochs={epochs}, Batch Size={batch_size})")
    for epoch in range(epochs):
        start_time = time.time()
        
        # Reshuffle the training subset at the start of every epoch.
        permutation = np.random.permutation(x_train.shape[0])
        x_train_shuffled, y_train_shuffled = x_train[permutation], y_train_one_hot[permutation]
        
        for i in range(num_batches):
            batch_start_time = time.time()
            start_idx, end_idx = i * batch_size, (i+1) * batch_size
            x_batch, y_batch = x_train_shuffled[start_idx:end_idx], y_train_shuffled[start_idx:end_idx]
            
            # One optimisation step: forward, loss, backward, parameter update.
            y_pred = model.forward(x_batch)
            loss = loss_fn.forward(y_pred, y_batch)
            grad = loss_fn.backward()
            model.backward(grad)
            optimizer.step()
            
            print(f"    Epoch {epoch+1}, 批次 {i+1}/{num_batches} - 损失: {loss:.4f} - 耗时: {time.time() - batch_start_time:.2f}s")
        
        print(f"** Epoch {epoch+1}/{epochs} - 总耗时: {time.time() - start_time:.2f}s **")

    print("\n训练完成。")
    print("\n正在评估模型...")
    # Evaluate on the first few test images only.
    num_test_samples = 100
    x_test_subset, y_test_subset_labels = x_test[:num_test_samples], y_test[:num_test_samples]
    y_test_pred_probs = model.forward(x_test_subset)
    # Predicted class = index of the largest probability in each row.
    y_test_pred_labels = np.argmax(y_test_pred_probs, axis=1)
    accuracy = np.mean(y_test_pred_labels == y_test_subset_labels)
    print(f"在 {num_test_samples} 个测试样本上的准确率: {accuracy * 100:.2f}%")

正在加载 CIFAR-10 数据集...
!!! 警告: 为快速演示，仅使用 128 个样本进行训练。!!!
数据加载和预处理完成。

开始训练... (Epochs=1, Batch Size=16)
    Epoch 1, 批次 1/8 - 损失: 6.7784 - 耗时: 0.27s
    Epoch 1, 批次 2/8 - 损失: 5.8389 - 耗时: 0.29s
    Epoch 1, 批次 3/8 - 损失: 3.1065 - 耗时: 0.29s
    Epoch 1, 批次 4/8 - 损失: 2.3059 - 耗时: 0.29s
    Epoch 1, 批次 5/8 - 损失: 2.4728 - 耗时: 0.28s
    Epoch 1, 批次 6/8 - 损失: 2.4723 - 耗时: 0.27s
    Epoch 1, 批次 7/8 - 损失: 2.2864 - 耗时: 0.27s
    Epoch 1, 批次 8/8 - 损失: 2.3574 - 耗时: 0.28s
** Epoch 1/1 - 总耗时: 2.24s **

训练完成。

正在评估模型...
在 100 个测试样本上的准确率: 8.00%


In [13]:
# DenseNet
import numpy as np
import time
import pickle

# =============================================================================
# 0. 工具函数 (Utility Functions)
# =============================================================================
# (这部分代码无变化，保持不变)
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    """Build the fancy-index arrays that im2col / col2im use.

    Args:
        x_shape: 4-tuple ``(N, C, H, W)`` describing the unpadded input.
        field_height: Height of the sliding window.
        field_width: Width of the sliding window.
        padding: Zero padding assumed on each side of H and W.
        stride: Step between consecutive window positions.

    Returns:
        Tuple ``(k, i, j)`` of integer arrays: ``k`` has shape
        ``(C * field_height * field_width, 1)`` and holds channel indices;
        ``i`` and ``j`` have shape
        ``(C * field_height * field_width, out_height * out_width)`` and hold
        row / column indices into the padded input.
    """
    _, channels, height, width = x_shape
    out_height = (height + 2 * padding - field_height) // stride + 1
    out_width = (width + 2 * padding - field_width) // stride + 1

    # Offsets of every cell inside one window, repeated once per channel.
    row_in_field = np.tile(np.repeat(np.arange(field_height), field_width), channels)
    col_in_field = np.tile(np.arange(field_width), field_height * channels)

    # Top-left corner of the window for every output position (row-major).
    row_start = stride * np.repeat(np.arange(out_height), out_width)
    col_start = stride * np.tile(np.arange(out_width), out_height)

    # Broadcast "offset inside window" against "window origin".
    i = row_in_field.reshape(-1, 1) + row_start.reshape(1, -1)
    j = col_in_field.reshape(-1, 1) + col_start.reshape(1, -1)

    cells_per_channel = field_height * field_width
    k = np.repeat(np.arange(channels), cells_per_channel).reshape(-1, 1)

    return (k, i, j)

def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """Unroll every sliding window of ``x`` into a column.

    Args:
        x: 4-D array indexed as ``(N, C, H, W)``.
        field_height: Window height.
        field_width: Window width.
        padding: Zero padding added on each side of the last two axes.
        stride: Step between consecutive windows.

    Returns:
        2-D array with ``C * field_height * field_width`` rows; the columns
        enumerate window positions with the image index varying fastest.
    """
    pad_spec = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    padded = np.pad(x, pad_spec, mode='constant')

    chan_idx, row_idx, col_idx = get_im2col_indices(
        x.shape, field_height, field_width, padding, stride
    )

    # Fancy indexing yields (N, C*fh*fw, positions); move N last before flattening.
    patches = padded[:, chan_idx, row_idx, col_idx]
    n_rows = field_height * field_width * x.shape[1]
    return patches.transpose(1, 2, 0).reshape(n_rows, -1)

def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, stride=1):
    """Scatter-add im2col columns back into an image-shaped array.

    Args:
        cols: 2-D array laid out like the output of ``im2col_indices``.
        x_shape: ``(N, C, H, W)`` of the unpadded target array.
        field_height: Window height used when the columns were built.
        field_width: Window width used when the columns were built.
        padding: Padding used when the columns were built.
        stride: Stride used when the columns were built.

    Returns:
        Array of shape ``x_shape`` in which contributions of overlapping
        windows have been summed.
    """
    n_imgs, channels, height, width = x_shape
    padded_shape = (n_imgs, channels, height + 2 * padding, width + 2 * padding)
    canvas = np.zeros(padded_shape, dtype=cols.dtype)

    chan_idx, row_idx, col_idx = get_im2col_indices(
        x_shape, field_height, field_width, padding, stride
    )

    # Undo the (rows, positions, N) flattening done by im2col, then put N first.
    per_image = cols.reshape(channels * field_height * field_width, -1, n_imgs)
    per_image = per_image.transpose(2, 0, 1)

    # np.add.at accumulates repeated indices, which plain fancy assignment would not.
    np.add.at(canvas, (slice(None), chan_idx, row_idx, col_idx), per_image)

    if padding != 0:
        return canvas[:, :, padding:-padding, padding:-padding]
    return canvas

def to_categorical_numpy(y, num_classes=10):
    """One-hot encode a vector of class labels.

    Args:
        y: NumPy array of labels; values are cast to ``int``.
        num_classes: Width of the one-hot encoding.

    Returns:
        Float array of shape ``(len(y), num_classes)`` with a single 1 per row.
    """
    labels = y.astype(int)
    count = len(labels)
    encoded = np.zeros((count, num_classes))
    rows = np.arange(count)
    encoded[rows, labels] = 1
    return encoded

# =============================================================================
# 1. 基础层定义 (Layer Definitions)
# =============================================================================
# (这部分代码无变化，保持不变)
class Layer:
    """Base class for network layers.

    Every layer owns a ``params`` dict and a ``grads`` dict (both empty by
    default); subclasses must override ``forward`` and ``backward``.
    """

    def __init__(self):
        self.params = {}
        self.grads = {}

    def forward(self, inputs):
        """Compute the layer output; must be overridden."""
        raise NotImplementedError

    def backward(self, grad_out):
        """Propagate ``grad_out`` to the layer input; must be overridden."""
        raise NotImplementedError

class ReLU(Layer):
    """Element-wise rectified linear unit."""

    def forward(self, x):
        """Return ``max(0, x)`` and remember ``x`` for the backward pass."""
        self.cache = x
        return np.maximum(0, x)

    def backward(self, grad_out):
        """Pass the gradient only where the cached input was strictly positive."""
        active = self.cache > 0
        return grad_out * active

class Softmax(Layer):
    """Row-wise softmax over axis 1."""

    def forward(self, x):
        """Return softmax probabilities; the result is also stored in ``self.cache``."""
        # Subtract the row maximum before exponentiating for numerical stability.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=1, keepdims=True)
        self.cache = probs
        return probs

    def backward(self, grad_out):
        """Return ``grad_out`` unchanged.

        ``CrossEntropyLoss.backward`` in this file already returns
        ``(y_pred - y_true) / m``, so no Jacobian is applied here.
        """
        return grad_out

class Dense(Layer):
    """Fully connected layer computing ``x @ W + b``."""

    def __init__(self, input_dim, output_dim, name):
        """Create weights of shape ``(input_dim, output_dim)`` and a zero bias.

        Weights are standard-normal samples scaled by ``sqrt(2 / input_dim)``.
        """
        super().__init__()
        self.name = name
        init_scale = np.sqrt(2. / input_dim)
        self.params['W'] = np.random.randn(input_dim, output_dim) * init_scale
        self.params['b'] = np.zeros(output_dim)

    def forward(self, x):
        """Apply the affine map and cache ``x`` for the backward pass."""
        self.cache = x
        return x @ self.params['W'] + self.params['b']

    def backward(self, grad_out):
        """Store gradients for ``W`` and ``b`` and return the gradient w.r.t. the input."""
        self.grads['W'] = self.cache.T @ grad_out
        self.grads['b'] = np.sum(grad_out, axis=0)
        return grad_out @ self.params['W'].T

class Flatten(Layer):
    """Collapse every axis after the first into one."""

    def forward(self, x):
        """Reshape ``x`` to ``(x.shape[0], -1)`` and remember the original shape."""
        self.cache = x.shape
        leading = x.shape[0]
        return x.reshape(leading, -1)

    def backward(self, grad_out):
        """Reshape the gradient back to the shape seen in ``forward``."""
        original_shape = self.cache
        return grad_out.reshape(original_shape)

class Conv2D(Layer):
    """2-D convolution with a square kernel, implemented via im2col."""

    def __init__(self, in_c, out_c, kernel, stride=1, padding=0, name=""):
        """Create ``out_c`` filters of shape ``(in_c, kernel, kernel)`` and a zero bias.

        Weights are standard-normal samples scaled by
        ``sqrt(2 / (in_c * kernel * kernel))``.
        """
        super().__init__()
        self.name = name
        self.stride = stride
        self.padding = padding
        self.kernel = kernel
        self.out_c = out_c
        init_scale = np.sqrt(2./(in_c*kernel*kernel))
        self.params['W'] = np.random.randn(out_c, in_c, kernel, kernel) * init_scale
        self.params['b'] = np.zeros(out_c)

    def _flat_weights(self):
        """Return the filters as a 2-D ``(out_c, in_c * kernel * kernel)`` matrix."""
        return self.params['W'].reshape(self.out_c, -1)

    def forward(self, x):
        """Convolve ``x`` (indexed ``(N, C, H, W)``) and cache what backward needs."""
        n_imgs, _, height, width = x.shape
        h_out = (height + 2*self.padding - self.kernel)//self.stride + 1
        w_out = (width + 2*self.padding - self.kernel)//self.stride + 1

        self.x_col = im2col_indices(x, self.kernel, self.kernel, self.padding, self.stride)
        bias_col = self.params['b'].reshape(-1, 1)
        scores = (self._flat_weights() @ self.x_col) + bias_col

        # Columns are ordered (h_out, w_out, N); move N to the front.
        result = scores.reshape(self.out_c, h_out, w_out, n_imgs).transpose(3, 0, 1, 2)
        self.cache = x
        return result

    def backward(self, grad_out):
        """Store gradients for ``W`` and ``b`` and return the gradient w.r.t. the input."""
        self.grads['b'] = np.sum(grad_out, axis=(0, 2, 3))

        # Match the column ordering produced in forward.
        grad_cols = grad_out.transpose(1, 2, 3, 0).reshape(self.out_c, -1)
        weight_grad = grad_cols @ self.x_col.T
        self.grads['W'] = weight_grad.reshape(self.params['W'].shape)

        dx_col = self._flat_weights().T @ grad_cols
        return col2im_indices(
            dx_col, self.cache.shape, self.kernel, self.kernel, self.padding, self.stride
        )

class AvgPool2D(Layer):
    """Average pooling with a square window and no padding, built on im2col."""

    def __init__(self, size, stride, name=""):
        super().__init__()
        self.name = name
        self.size = size
        self.stride = stride

    def _out_dims(self, height, width):
        """Return the pooled ``(h_out, w_out)`` for an input of the given size."""
        h_out = (height-self.size)//self.stride+1
        w_out = (width-self.size)//self.stride+1
        return h_out, w_out

    def forward(self, x):
        """Average every ``size x size`` window of ``x`` (indexed ``(N, C, H, W)``)."""
        self.cache = x
        n_imgs, channels, height, width = x.shape
        h_out, w_out = self._out_dims(height, width)

        # Treat every channel as its own single-channel image so windows never mix channels.
        as_single_channel = x.reshape(n_imgs * channels, 1, height, width)
        windows = im2col_indices(as_single_channel, self.size, self.size, 0, self.stride)
        pooled = np.mean(windows, axis=0)
        return pooled.reshape(h_out, w_out, n_imgs, channels).transpose(2, 3, 0, 1)

    def backward(self, grad_out):
        """Distribute each output gradient equally over the cells of its window."""
        x = self.cache
        n_imgs, channels, height, width = x.shape
        h_out, w_out = self._out_dims(height, width)
        pool_area = self.size * self.size

        # Same (h_out, w_out, N, C) ordering as the columns produced in forward.
        grad_flat = grad_out.transpose(2, 3, 0, 1).ravel()
        num_patches = n_imgs * channels * h_out * w_out
        dx_col = np.ones((pool_area, num_patches)) * (grad_flat / pool_area)

        dx = col2im_indices(
            dx_col, (n_imgs * channels, 1, height, width), self.size, self.size, 0, self.stride
        )
        return dx.reshape(x.shape)

class GlobalAvgPool2D(Layer):
    """Global average pooling over the last two axes."""

    def forward(self, x):
        """Average over axes 2 and 3, keeping the first two axes."""
        self.cache = x.shape
        return np.mean(x, axis=(2, 3))

    def backward(self, grad_out):
        """Spread each gradient value evenly over the pooled H x W positions."""
        _, _, height, width = self.cache
        spread = grad_out[:, :, np.newaxis, np.newaxis]
        spread = np.repeat(spread, height, axis=2)
        spread = np.repeat(spread, width, axis=3)
        return spread / (height * width)

# =============================================================================
# 2. DenseNet 核心组件
# =============================================================================
class DenseLayer(Layer):
    """One DenseNet layer: ReLU -> 1x1 conv (bottleneck) -> ReLU -> 3x3 conv.

    The bottleneck has ``4 * growth_rate`` channels and the layer emits
    ``growth_rate`` new feature maps.
    """

    def __init__(self, in_channels, growth_rate, name=""):
        super().__init__()
        self.name = name
        bottleneck_channels = 4 * growth_rate

        self.relu1 = ReLU()
        self.conv1 = Conv2D(in_channels, bottleneck_channels, 1, name=f"{name}_conv1")
        self.relu2 = ReLU()
        self.conv2 = Conv2D(bottleneck_channels, growth_rate, 3, padding=1, name=f"{name}_conv2")

        # Sub-layers in execution order.
        self.layers = [self.relu1, self.conv1, self.relu2, self.conv2]

    def forward(self, x):
        """Run ``x`` through the four sub-layers in order."""
        out = x
        for stage in (self.relu1, self.conv1, self.relu2, self.conv2):
            out = stage.forward(out)
        return out

    def backward(self, grad_out):
        """Back-propagate through the four sub-layers in reverse order."""
        grad = grad_out
        for stage in (self.conv2, self.relu2, self.conv1, self.relu1):
            grad = stage.backward(grad)
        return grad

# ******** 代码修正处 ********
class DenseBlock(Layer):
    """Densely connected block.

    Layer ``i`` receives the channel-wise concatenation of the block input
    and the outputs of all earlier layers, and contributes ``growth_rate``
    new channels to the block output.
    """

    def __init__(self, num_layers, in_channels, growth_rate, name=""):
        super().__init__()
        self.name = name
        self.growth_rate = growth_rate
        # Each successive layer sees growth_rate more input channels than the previous one.
        self.dense_layers = [
            DenseLayer(in_channels + index * growth_rate, growth_rate, name=f"{name}_layer{index+1}")
            for index in range(num_layers)
        ]
        self.layers = self.dense_layers

    def forward(self, x):
        """Return the input concatenated (axis 1) with every layer's new features."""
        stacked = np.concatenate([x], axis=1)
        for layer in self.dense_layers:
            fresh = layer.forward(stacked)
            stacked = np.concatenate([stacked, fresh], axis=1)
        return stacked

    def backward(self, grad_out):
        """Peel the layers off in reverse order.

        At each step the last ``growth_rate`` channels of the running
        gradient belong to the current layer's output, and the remaining
        channels belong to the concatenated features that layer consumed.
        The layer's input gradient has the same shape as that remainder, so
        the two are summed to form the running gradient for the next step.
        """
        growth = self.growth_rate
        running = grad_out
        for layer in self.dense_layers[::-1]:
            own_part = running[:, -growth:, :, :]
            earlier_part = running[:, :-growth, :, :]
            running = earlier_part + layer.backward(own_part)

        # What remains is the gradient w.r.t. the block's original input.
        return running
# ******** 修正结束 ********

class TransitionLayer(Layer):
    """Transition between dense blocks: ReLU -> 1x1 conv -> 2x2 average pool (stride 2)."""

    def __init__(self, in_channels, out_channels, name=""):
        super().__init__()
        self.name = name
        self.relu = ReLU()
        self.conv = Conv2D(in_channels, out_channels, 1, name=f"{name}_conv")
        self.pool = AvgPool2D(2, 2, name=f"{name}_pool")
        # Sub-layers in execution order.
        self.layers = [self.relu, self.conv, self.pool]

    def forward(self, x):
        """Run ``x`` through ReLU, the 1x1 convolution and the pooling layer."""
        out = x
        for stage in (self.relu, self.conv, self.pool):
            out = stage.forward(out)
        return out

    def backward(self, grad_out):
        """Back-propagate through pooling, convolution and ReLU, in that order."""
        grad = grad_out
        for stage in (self.pool, self.conv, self.relu):
            grad = stage.backward(grad)
        return grad

# =============================================================================
# 3. DenseNet 模型定义 (无变化)
# =============================================================================
def get_param_layers(layers):
    """Collect, in order, every layer that owns a non-empty ``params`` dict.

    ``DenseBlock``, ``TransitionLayer`` and ``DenseLayer`` instances are
    expanded recursively through their ``layers`` attribute instead of being
    returned themselves.
    """
    containers = (DenseBlock, TransitionLayer, DenseLayer)
    collected = []
    for candidate in layers:
        if isinstance(candidate, containers):
            collected += get_param_layers(candidate.layers)
            continue
        if getattr(candidate, 'params', None):
            collected.append(candidate)
    return collected

class DenseNet:
    """Small DenseNet: stem conv -> dense block -> transition -> dense block -> classifier.

    Args:
        growth_rate: Channels added by every dense layer.
        compression: Factor applied to the channel count in the transition layer.
    """

    def __init__(self, growth_rate=12, compression=0.5):
        self.growth_rate = growth_rate
        channels = 16

        self.stem = [Conv2D(3, channels, 3, padding=1, name="stem_conv")]

        self.block1 = DenseBlock(num_layers=3, in_channels=channels, growth_rate=growth_rate, name="block1")
        channels += 3 * growth_rate

        # The transition shrinks the channel count by the compression factor.
        compressed = int(channels * compression)
        self.trans1 = TransitionLayer(channels, compressed, name="trans1")
        channels = compressed

        self.block2 = DenseBlock(num_layers=3, in_channels=channels, growth_rate=growth_rate, name="block2")
        channels += 3 * growth_rate

        self.final_relu = ReLU()
        self.classifier = [
            GlobalAvgPool2D(),
            Flatten(),
            Dense(channels, 10, name="final_dense"),
            Softmax(),
        ]

        body = [self.block1, self.trans1, self.block2, self.final_relu]
        self.layers = self.stem + body + self.classifier
        self.param_layers = get_param_layers(self.layers)

    def forward(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def backward(self, grad):
        """Run ``grad`` through every layer in reverse order and return the input gradient."""
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return grad

# =============================================================================
# 4. 损失函数和优化器 (无变化)
# =============================================================================
class CrossEntropyLoss:
    """Mean cross-entropy between predicted probabilities and one-hot targets."""

    def forward(self, y_pred, y_true):
        """Return the loss averaged over ``y_true.shape[0]`` samples.

        Both arguments are stored on the instance for ``backward``. A small
        constant (1e-12) is added inside the log to avoid ``log(0)``.
        """
        self.y_pred = y_pred
        self.y_true = y_true
        batch = y_true.shape[0]
        weighted_logs = y_true * np.log(y_pred + 1e-12)
        return -np.sum(weighted_logs) / batch

    def backward(self):
        """Return ``(y_pred - y_true) / m`` for the values stored by ``forward``."""
        batch = self.y_true.shape[0]
        return (self.y_pred - self.y_true) / batch

class SGD:
    """Plain stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, layers, learning_rate=0.01):
        self.layers = layers
        self.lr = learning_rate

    def step(self):
        """Update, in place, every parameter that has an entry in its layer's ``grads``."""
        for layer in self.layers:
            # Skip objects without parameters (or with an empty params dict).
            if not getattr(layer, 'params', None):
                continue
            for key, gradient in layer.grads.items():
                layer.params[key] -= self.lr * gradient

# =============================================================================
# 5. 训练过程 (无变化)
# =============================================================================
# Script entry point: load CIFAR-10, train the DenseNet on a tiny subset for
# one epoch, then report accuracy on a few test images.
if __name__ == '__main__':
    print("正在加载 CIFAR-10 数据集...")
    # scikit-learn is only needed for downloading the dataset.
    # NOTE(review): exit() is the interactive-shell helper; confirm it behaves
    # as intended in the environment this cell runs in.
    try: from sklearn.datasets import fetch_openml
    except ImportError: print("请安装 scikit-learn: pip install scikit-learn"); exit()

    cifar = fetch_openml('CIFAR_10', version=1, as_frame=False, parser='auto')
    # Scale pixel values by 1/255 and convert the labels to small integers.
    X, y = cifar.data.astype('float32') / 255.0, cifar.target.astype(np.uint8)
    # Reshape the flat feature rows to (N, 3, 32, 32).
    X = X.reshape(-1, 3, 32, 32)
    
    # First 50000 rows are used for training, the next 10000 for testing.
    x_train, x_test = X[:50000], X[50000:60000]
    y_train, y_test = y[:50000], y[50000:60000]
    y_train_one_hot = to_categorical_numpy(y_train, 10)
    
    # Only a very small training subset is used so the demo finishes quickly.
    num_train_samples = 32
    print(f"!!! 警告: 为快速演示，仅使用 {num_train_samples} 个样本进行训练。!!!")
    x_train, y_train_one_hot = x_train[:num_train_samples], y_train_one_hot[:num_train_samples]

    print("数据加载和预处理完成。")
    model = DenseNet()
    loss_fn = CrossEntropyLoss()
    optimizer = SGD(model.param_layers, learning_rate=0.01)

    epochs = 1
    batch_size = 8
    # Integer division: a trailing partial batch (if any) is not used.
    num_batches = x_train.shape[0] // batch_size
    
    print(f"\n开始训练... (Epochs={epochs}, Batch Size={batch_size})")
    for epoch in range(epochs):
        start_time = time.time()
        
        # Reshuffle the training subset at the start of every epoch.
        permutation = np.random.permutation(x_train.shape[0])
        x_train_shuffled, y_train_shuffled = x_train[permutation], y_train_one_hot[permutation]
        
        for i in range(num_batches):
            batch_start_time = time.time()
            start_idx, end_idx = i * batch_size, (i+1) * batch_size
            x_batch, y_batch = x_train_shuffled[start_idx:end_idx], y_train_shuffled[start_idx:end_idx]
            
            # One optimisation step: forward, loss, backward, parameter update.
            y_pred = model.forward(x_batch)
            loss = loss_fn.forward(y_pred, y_batch)
            grad = loss_fn.backward()
            model.backward(grad)
            optimizer.step()
            
            print(f"    Epoch {epoch+1}, 批次 {i+1}/{num_batches} - 损失: {loss:.4f} - 耗时: {time.time() - batch_start_time:.2f}s")
        
        print(f"** Epoch {epoch+1}/{epochs} - 总耗时: {time.time() - start_time:.2f}s **")

    print("\n训练完成。")
    print("\n正在评估模型...")
    # Evaluate on the first few test images only.
    num_test_samples = 50
    x_test_subset, y_test_subset_labels = x_test[:num_test_samples], y_test[:num_test_samples]
    y_test_pred_probs = model.forward(x_test_subset)
    # Predicted class = index of the largest probability in each row.
    y_test_pred_labels = np.argmax(y_test_pred_probs, axis=1)
    accuracy = np.mean(y_test_pred_labels == y_test_subset_labels)
    print(f"在 {num_test_samples} 个测试样本上的准确率: {accuracy * 100:.2f}%")

正在加载 CIFAR-10 数据集...
!!! 警告: 为快速演示，仅使用 32 个样本进行训练。!!!
数据加载和预处理完成。

开始训练... (Epochs=1, Batch Size=8)
    Epoch 1, 批次 1/4 - 损失: 2.3461 - 耗时: 0.31s
    Epoch 1, 批次 2/4 - 损失: 2.4282 - 耗时: 0.29s
    Epoch 1, 批次 3/4 - 损失: 2.3762 - 耗时: 0.30s
    Epoch 1, 批次 4/4 - 损失: 2.2435 - 耗时: 0.27s
** Epoch 1/1 - 总耗时: 1.17s **

训练完成。

正在评估模型...
在 50 个测试样本上的准确率: 18.00%


### Graph Convolution

### Spherical Convolution

### DeConv

**What is wrong with CNNs**

- The pooling operation used in convolutional neural networks is a big mistake and the fact that it works so well is a disaster ---Hinton.

Computer graphics takes an internal representation of objects and produces an image (rendering). The human brain does the opposite, i.e., inverse graphics.

### Capsule Network

In [None]:
# Capsule Network

### Kolmogorov-Arnold Network

In [None]:
# Kolmogorov-Arnold Network

### Convolutional KAN(CKAN)

In [None]:
# CKAN