<font color="red">注</font>: 使用 tensorboard 可视化需要安装 tensorflow (TensorBoard依赖于tensorflow库，可以任意安装tensorflow的gpu/cpu版本)

```shell
pip install tensorflow-cpu
```

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F

print(sys.version_info)
for module in mpl, np, pd, sklearn, torch:
    print(module.__name__, module.__version__)
    
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

seed = 42


sys.version_info(major=3, minor=12, micro=3, releaselevel='final', serial=0)
matplotlib 3.10.0
numpy 2.0.2
pandas 2.2.3
sklearn 1.6.0
torch 2.5.1+cpu
cpu


## 数据准备

```shell
$ tree -L 1 cifar-10                                    
cifar-10
├── sampleSubmission.csv
├── test
├── train
└── trainLabels.csv
```

In [2]:
from pathlib import Path  # 导入Path类，用于处理文件路径

# 定义数据目录路径
DATA_DIR = Path("E:/BaiduNetdiskDownload/cifar-10")

# 定义训练标签文件路径
train_lables_file = DATA_DIR / "trainLabels.csv"
# 定义测试集模板CSV文件路径
test_csv_file = DATA_DIR / "sampleSubmission.csv"
# 定义训练集图片文件夹路径
train_folder = DATA_DIR / "train"
# 定义测试集图片文件夹路径
test_folder = DATA_DIR / "test"

# 所有的类别名称
class_names = [
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]

def parse_csv_file(filepath, folder):
    """解析CSV文件，将其转换为包含文件路径和标签的元组列表"""
    results = []  # 初始化结果列表
    with open(filepath, 'r') as f:  # 以只读模式打开CSV文件
        lines = f.readlines()[1:]  # 读取所有行，并去掉第一行（标题行）
    for line in lines:  # 遍历每一行
        image_id, label_str = line.strip('\n').split(',')  # 去除换行符并按逗号分割，获取图片ID和标签
        image_full_path = folder / f"{image_id}.png"  # 构建图片的完整路径
        results.append((image_full_path, label_str))  # 将图片路径和标签作为元组添加到结果列表
    return results  # 返回结果列表

# 解析训练集标签文件
train_labels_info = parse_csv_file(train_lables_file, train_folder)
# 解析测试集CSV文件
test_csv_info = parse_csv_file(test_csv_file, test_folder)

# 打印训练集和测试集的前5条信息
import pprint
pprint.pprint(train_labels_info[0:5])
pprint.pprint(test_csv_info[0:5])

# 打印训练集和测试集的总长度
print(len(train_labels_info), len(test_csv_info))


[(WindowsPath('E:/BaiduNetdiskDownload/cifar-10/train/1.png'), 'frog'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/train/2.png'), 'truck'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/train/3.png'), 'truck'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/train/4.png'), 'deer'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/train/5.png'), 'automobile')]
[(WindowsPath('E:/BaiduNetdiskDownload/cifar-10/test/1.png'), 'cat'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/test/2.png'), 'cat'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/test/3.png'), 'cat'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/test/4.png'), 'cat'),
 (WindowsPath('E:/BaiduNetdiskDownload/cifar-10/test/5.png'), 'cat')]
50000 300000


In [3]:
# train_df = pd.DataFrame(train_labels_info)
train_df = pd.DataFrame(train_labels_info[0:45000]) # 取前45000张图片作为训练集
valid_df = pd.DataFrame(train_labels_info[45000:]) # 取后5000张图片作为验证集
test_df = pd.DataFrame(test_csv_info)

train_df.columns = ['filepath', 'class']
valid_df.columns = ['filepath', 'class']
test_df.columns = ['filepath', 'class']

print(train_df.head())
print(valid_df.head())
print(test_df.head())

                                       filepath       class
0  E:\BaiduNetdiskDownload\cifar-10\train\1.png        frog
1  E:\BaiduNetdiskDownload\cifar-10\train\2.png       truck
2  E:\BaiduNetdiskDownload\cifar-10\train\3.png       truck
3  E:\BaiduNetdiskDownload\cifar-10\train\4.png        deer
4  E:\BaiduNetdiskDownload\cifar-10\train\5.png  automobile
                                           filepath       class
0  E:\BaiduNetdiskDownload\cifar-10\train\45001.png       horse
1  E:\BaiduNetdiskDownload\cifar-10\train\45002.png  automobile
2  E:\BaiduNetdiskDownload\cifar-10\train\45003.png        deer
3  E:\BaiduNetdiskDownload\cifar-10\train\45004.png  automobile
4  E:\BaiduNetdiskDownload\cifar-10\train\45005.png    airplane
                                      filepath class
0  E:\BaiduNetdiskDownload\cifar-10\test\1.png   cat
1  E:\BaiduNetdiskDownload\cifar-10\test\2.png   cat
2  E:\BaiduNetdiskDownload\cifar-10\test\3.png   cat
3  E:\BaiduNetdiskDownload\cifar-10\test\4.p

In [4]:
from PIL import Image  # 导入PIL库中的Image模块，用于图像处理
from torch.utils.data import Dataset, DataLoader  # 从PyTorch的torch.utils.data模块导入Dataset和DataLoader类，用于数据集的处理和加载
from torchvision import transforms  # 从torchvision库导入transforms模块，用于图像预处理和数据增强

class Cifar10Dataset(Dataset):
    # 定义一个字典，将模式映射到对应的数据集
    df_map = {
        "train": train_df,  # 训练集数据
        "eval": valid_df,   # 验证集数据
        "test": test_df     # 测试集数据
    }
    # 定义类别到索引的映射字典
    label_to_idx = {label: idx for idx, label in enumerate(class_names)}
    # 定义索引到类别的映射字典
    idx_to_label = {idx: label for idx, label in enumerate(class_names)}

    def __init__(self, mode, transform=None):
        """
        初始化Cifar10Dataset类的实例
        :param mode: 数据集模式，'train'、'eval'或'test'
        :param transform: 可选的图像预处理或数据增强操作
        """
        self.df = self.df_map.get(mode, None)  # 根据模式获取对应的数据集
        if self.df is None:
            raise ValueError(f"mode should be one of train, eval, test, but got {mode}")
        self.transform = transform  # 存储传入的预处理或数据增强操作

    def __getitem__(self, index):
        """
        获取指定索引的样本
        :param index: 样本索引
        :return: 处理后的图像和对应的标签
        """
        img_path, label = self.df.iloc[index]  # 获取图像路径和标签
        img = Image.open(img_path).convert('RGB')  # 打开图像并转换为RGB模式
        if self.transform:
            img = self.transform(img)  # 如果定义了预处理或数据增强操作，则应用之
        label = self.label_to_idx[label]  # 将标签转换为对应的索引
        return img, label  # 返回处理后的图像和标签索引

    def __len__(self):
        """
        获取数据集的大小
        :return: 数据集中的样本数量
        """
        return len(self.df)  # 返回数据集的样本数量

# 定义图像的目标尺寸
IMAGE_SIZE = 32
# 定义图像的均值和标准差，用于标准化
mean, std = [0.4914, 0.4822, 0.4465], [0.247, 0.243, 0.261]

# 定义训练集的预处理和数据增强操作
transforms_train = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),  # 调整图像大小
    transforms.RandomRotation(40),  # 随机旋转图像
    transforms.RandomHorizontalFlip(),  # 随机水平翻转图像
    transforms.ToTensor(),  # 将图像转换为Tensor
    # transforms.Normalize(mean, std)  # 标准化图像（此行被注释掉）
])

# 定义验证集和测试集的预处理操作
transforms_eval = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),  # 调整图像大小
    transforms.ToTensor(),  # 将图像转换为Tensor
    transforms.Normalize(mean, std)  # 标准化图像
])

# 创建训练集数据集实例
train_ds = Cifar10Dataset("train", transforms_train)
# 创建验证集数据集实例
eval_ds = Cifar10Dataset("eval", transforms_eval)


In [5]:
train_ds[0][0].shape # 图片的shape,输入

torch.Size([3, 32, 32])

In [6]:
print(train_ds.idx_to_label)  # 类别映射为idx
train_ds.label_to_idx # idx映射为类别

{0: 'airplane', 1: 'automobile', 2: 'bird', 3: 'cat', 4: 'deer', 5: 'dog', 6: 'frog', 7: 'horse', 8: 'ship', 9: 'truck'}


{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [7]:
batch_size = 64
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)   
eval_dl = DataLoader(eval_ds, batch_size=batch_size, shuffle=False)

In [8]:
# 遍历train_ds得到每张图片，计算每个通道的均值和方差
def cal_mean_std(ds):
    mean = 0.
    std = 0.
    for img, _ in ds:
        mean += img.mean(dim=(1, 2))
        std += img.std(dim=(1, 2))
    mean /= len(ds)
    std /= len(ds)
    return mean, std

# 经过 normalize 后 均值为0，方差为1
print(cal_mean_std(train_ds))

(tensor([0.4369, 0.4268, 0.3947]), tensor([0.2465, 0.2420, 0.2360]))


## 定义模型

In [9]:
# 定义一个卷积神经网络 (CNN) 类，继承自 nn.Module
class CNN(nn.Module):
    def __init__(self, num_classes):
        # 调用父类构造函数
        super().__init__()

        # 定义网络的模型结构
        self.model = nn.Sequential(
            # 第一层卷积层：输入通道数3（RGB图片），输出通道数128，卷积核大小3x3，padding="same"保持输入输出的大小一致
            nn.Conv2d(in_channels=3, out_channels=128, kernel_size=3, padding="same"),
            nn.ReLU(),  # 激活函数ReLU，增加非线性
            nn.BatchNorm2d(128),  # 批标准化，用于对卷积结果进行归一化，减少训练时的内部协变量偏移
            
            # 第二层卷积层：输入通道数128，输出通道数128，卷积核大小3x3，padding="same"
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding="same"), 
            nn.ReLU(),  # 激活函数ReLU
            nn.BatchNorm2d(128),  # 批标准化

            # 最大池化层：kernel_size=2，表示池化窗口大小为2x2，步长为2，输出尺寸减半
            nn.MaxPool2d(kernel_size=2),  # 输出尺寸 (128, 16, 16)

            # 第三层卷积层：输入通道数128，输出通道数256，卷积核大小3x3，padding="same"
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding="same"),
            nn.ReLU(),  # 激活函数ReLU
            nn.BatchNorm2d(256),  # 批标准化

            # 第四层卷积层：输入通道数256，输出通道数256，卷积核大小3x3，padding="same"
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding="same"),
            nn.ReLU(),  # 激活函数ReLU
            nn.BatchNorm2d(256),  # 批标准化

            # 最大池化层：kernel_size=2，表示池化窗口大小为2x2，步长为2，输出尺寸减半
            nn.MaxPool2d(kernel_size=2),  # 输出尺寸 (256, 8, 8)

            # 第五层卷积层：输入通道数256，输出通道数512，卷积核大小3x3，padding="same"
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding="same"),
            nn.ReLU(),  # 激活函数ReLU
            nn.BatchNorm2d(512),  # 批标准化

            # 第六层卷积层：输入通道数512，输出通道数512，卷积核大小3x3，padding="same"
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding="same"),
            nn.ReLU(),  # 激活函数ReLU
            nn.BatchNorm2d(512),  # 批标准化

            # 最大池化层：kernel_size=2，表示池化窗口大小为2x2，步长为2，输出尺寸减半
            nn.MaxPool2d(kernel_size=2),  # 输出尺寸 (512, 4, 4)

            # 将多维的特征图展平成一维向量，准备输入全连接层
            nn.Flatten(),  # 展平

            # 第一层全连接层：输入特征为4*4*512=8192，输出特征数为512
            nn.Linear(8192, 512),
            nn.ReLU(),  # 激活函数ReLU

            # 第二层全连接层：输入特征数为512，输出为num_classes（类别数），对应最终分类任务
            nn.Linear(512, num_classes),
        )

    # 前向传播函数，定义数据如何流经模型
    def forward(self, x):
        return self.model(x)

# 打印模型的每个参数的名称和参数数量
for key, value in CNN(len(class_names)).named_parameters():
    print(f"{key:^40}paramerters num: {np.prod(value.shape)}")


             model.0.weight             paramerters num: 3456
              model.0.bias              paramerters num: 128
             model.2.weight             paramerters num: 128
              model.2.bias              paramerters num: 128
             model.3.weight             paramerters num: 147456
              model.3.bias              paramerters num: 128
             model.5.weight             paramerters num: 128
              model.5.bias              paramerters num: 128
             model.7.weight             paramerters num: 294912
              model.7.bias              paramerters num: 256
             model.9.weight             paramerters num: 256
              model.9.bias              paramerters num: 256
            model.10.weight             paramerters num: 589824
             model.10.bias              paramerters num: 256
            model.12.weight             paramerters num: 256
             model.12.bias              paramerters num: 256
            mo

In [10]:
total_params = sum(p.numel() for p in CNN(len(class_names)).parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params}")

Total trainable parameters: 8779914


In [11]:
512*4*4

8192

In [12]:
input_4d = torch.randn(32, 3, 64, 64)  # 32 个样本，3 个通道，图像大小为 64x64
bn2d = nn.BatchNorm2d(3)              # 对 3 个通道进行归一化
output_4d = bn2d(input_4d)
output_4d.shape

torch.Size([32, 3, 64, 64])

In [13]:
output_4d

tensor([[[[ 2.0220e+00,  3.4402e-01, -4.4450e-01,  ...,  1.4650e+00,
            1.0845e-01,  1.6322e+00],
          [ 7.9437e-01, -5.8831e-01, -8.5833e-01,  ..., -5.0367e-01,
           -4.7779e-01,  4.9912e-01],
          [-2.2494e+00,  1.0997e+00,  1.5307e+00,  ..., -6.4616e-01,
           -4.3801e-01,  8.3926e-01],
          ...,
          [ 5.2208e-02,  1.2663e+00,  6.7109e-01,  ..., -1.0686e+00,
            4.4737e-01,  1.0966e+00],
          [-1.2508e+00, -8.2019e-02,  1.3695e+00,  ...,  7.1213e-01,
           -8.1538e-01, -6.8265e-01],
          [-2.4799e-01, -3.6358e-01,  1.5454e-01,  ..., -1.1900e+00,
           -1.6888e+00, -9.0077e-01]],

         [[-1.2966e+00,  1.3186e+00, -2.0587e+00,  ...,  4.0129e-02,
           -1.3070e+00,  1.1656e+00],
          [-1.5719e-01, -1.4689e+00, -1.0365e+00,  ...,  2.0640e+00,
           -6.7039e-01, -4.2742e-01],
          [-9.0844e-01, -1.3823e+00,  1.2226e+00,  ..., -5.6536e-01,
           -1.1478e-01, -3.7578e-01],
          ...,
     

In [14]:
bn2d = nn.BatchNorm2d(1)              # 对 3 个通道进行归一化
output_4d1 = bn2d(input_4d[:, 0:1, :, :])
output_4d1

tensor([[[[ 2.0220,  0.3440, -0.4445,  ...,  1.4650,  0.1084,  1.6322],
          [ 0.7944, -0.5883, -0.8583,  ..., -0.5037, -0.4778,  0.4991],
          [-2.2494,  1.0997,  1.5307,  ..., -0.6462, -0.4380,  0.8393],
          ...,
          [ 0.0522,  1.2663,  0.6711,  ..., -1.0686,  0.4474,  1.0966],
          [-1.2508, -0.0820,  1.3695,  ...,  0.7121, -0.8154, -0.6827],
          [-0.2480, -0.3636,  0.1545,  ..., -1.1900, -1.6888, -0.9008]]],


        [[[-0.4516, -0.6221, -0.4244,  ..., -1.3023, -1.0611, -0.9143],
          [ 0.0976, -2.1890, -0.3804,  ..., -0.7832, -1.4822,  0.1446],
          [ 0.3591,  0.2114,  1.9337,  ..., -0.0600, -0.2099,  0.6248],
          ...,
          [ 1.4630, -2.2503,  0.7333,  ...,  1.1931,  0.4984, -0.1197],
          [ 1.3181,  0.4229,  1.6649,  ..., -0.0367, -0.7852, -1.0072],
          [-1.3880,  1.2997, -1.3212,  ...,  0.6075,  0.6304,  0.5862]]],


        [[[ 0.8346, -0.7580,  0.1383,  ..., -1.1670, -1.5761, -0.8985],
          [-0.0305,  1.724

## 训练

pytorch的训练需要自行实现，包括
1. 定义损失函数
2. 定义优化器
3. 定义训练步
4. 训练

In [15]:
from sklearn.metrics import accuracy_score

@torch.no_grad()
def evaluating(model, dataloader, loss_fct):
    loss_list = []
    pred_list = []
    label_list = []
    for datas, labels in dataloader:
        datas = datas.to(device)
        labels = labels.to(device)
        # 前向计算
        logits = model(datas)
        loss = loss_fct(logits, labels)         # 验证集损失
        loss_list.append(loss.item())
        
        preds = logits.argmax(axis=-1)    # 验证集预测
        pred_list.extend(preds.cpu().numpy().tolist())
        label_list.extend(labels.cpu().numpy().tolist())
        
    acc = accuracy_score(label_list, pred_list)
    return np.mean(loss_list), acc


### TensorBoard 可视化


训练过程中可以使用如下命令启动tensorboard服务。

```shell
tensorboard \
    --logdir=runs \     # log 存放路径
    --host 0.0.0.0 \    # ip
    --port 8848         # 端口
```

In [16]:
from torch.utils.tensorboard import SummaryWriter


class TensorBoardCallback:
    def __init__(self, log_dir, flush_secs=10):
        """
        Args:
            log_dir (str): dir to write log.
            flush_secs (int, optional): write to dsk each flush_secs seconds. Defaults to 10.
        """
        self.writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    def draw_model(self, model, input_shape):
        self.writer.add_graph(model, input_to_model=torch.randn(input_shape))
        
    def add_loss_scalars(self, step, loss, val_loss):
        self.writer.add_scalars(
            main_tag="training/loss", 
            tag_scalar_dict={"loss": loss, "val_loss": val_loss},
            global_step=step,
            )
        
    def add_acc_scalars(self, step, acc, val_acc):
        self.writer.add_scalars(
            main_tag="training/accuracy",
            tag_scalar_dict={"accuracy": acc, "val_accuracy": val_acc},
            global_step=step,
        )
        
    def add_lr_scalars(self, step, learning_rate):
        self.writer.add_scalars(
            main_tag="training/learning_rate",
            tag_scalar_dict={"learning_rate": learning_rate},
            global_step=step,
            
        )
    
    def __call__(self, step, **kwargs):
        # add loss
        loss = kwargs.pop("loss", None)
        val_loss = kwargs.pop("val_loss", None)
        if loss is not None and val_loss is not None:
            self.add_loss_scalars(step, loss, val_loss)
        # add acc
        acc = kwargs.pop("acc", None)
        val_acc = kwargs.pop("val_acc", None)
        if acc is not None and val_acc is not None:
            self.add_acc_scalars(step, acc, val_acc)
        # add lr
        learning_rate = kwargs.pop("lr", None)
        if learning_rate is not None:
            self.add_lr_scalars(step, learning_rate)


### Save Best


In [17]:
class SaveCheckpointsCallback:
    def __init__(self, save_dir, save_step=5000, save_best_only=True):
        """
        Save checkpoints each save_epoch epoch. 
        We save checkpoint by epoch in this implementation.
        Usually, training scripts with pytorch evaluating model and save checkpoint by step.

        Args:
            save_dir (str): dir to save checkpoint
            save_epoch (int, optional): the frequency to save checkpoint. Defaults to 1.
            save_best_only (bool, optional): If True, only save the best model or save each model at every epoch.
        """
        self.save_dir = save_dir
        self.save_step = save_step
        self.save_best_only = save_best_only
        self.best_metrics = -1
        
        # mkdir
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)
        
    def __call__(self, step, state_dict, metric=None):
        if step % self.save_step > 0:
            return
        
        if self.save_best_only:
            assert metric is not None
            if metric >= self.best_metrics:
                # save checkpoints
                torch.save(state_dict, os.path.join(self.save_dir, "best.ckpt"))
                # update best metrics
                self.best_metrics = metric
        else:
            torch.save(state_dict, os.path.join(self.save_dir, f"{step}.ckpt"))



### Early Stop

In [18]:
class EarlyStopCallback:
    def __init__(self, patience=5, min_delta=0.01):
        """

        Args:
            patience (int, optional): Number of epochs with no improvement after which training will be stopped.. Defaults to 5.
            min_delta (float, optional): Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute 
                change of less than min_delta, will count as no improvement. Defaults to 0.01.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_metric = -1
        self.counter = 0
        
    def __call__(self, metric):
        if metric >= self.best_metric + self.min_delta:
            # update best metric
            self.best_metric = metric
            # reset counter 
            self.counter = 0
        else: 
            self.counter += 1
            
    @property
    def early_stop(self):
        return self.counter >= self.patience


In [19]:
# 训练
def training(
    model, 
    train_loader, 
    val_loader, 
    epoch, 
    loss_fct, 
    optimizer, 
    tensorboard_callback=None,
    save_ckpt_callback=None,
    early_stop_callback=None,
    eval_step=500,
    ):
    record_dict = {
        "train": [],
        "val": []
    }
    
    global_step = 0
    model.train()
    with tqdm(total=epoch * len(train_loader)) as pbar:
        for epoch_id in range(epoch):
            # training
            for datas, labels in train_loader:
                datas = datas.to(device)
                labels = labels.to(device)
                # 梯度清空
                optimizer.zero_grad()
                # 模型前向计算
                logits = model(datas)
                # 计算损失
                loss = loss_fct(logits, labels)
                # 梯度回传
                loss.backward()
                # 调整优化器，包括学习率的变动等
                optimizer.step()
                preds = logits.argmax(axis=-1) #最大值的索引
            
                acc = accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())     # 计算准确率
                loss = loss.cpu().item() # 计算损失
                # record
                
                record_dict["train"].append({
                    "loss": loss, "acc": acc, "step": global_step # 记录每一步的损失和准确率
                })
                
                # evaluating
                if global_step % eval_step == 0:
                    model.eval()
                    val_loss, val_acc = evaluating(model, val_loader, loss_fct)
                    record_dict["val"].append({
                        "loss": val_loss, "acc": val_acc, "step": global_step
                    })
                    model.train()
                    
                    # 1. 使用 tensorboard 可视化
                    if tensorboard_callback is not None:
                        tensorboard_callback(
                            global_step, 
                            loss=loss, val_loss=val_loss,
                            acc=acc, val_acc=val_acc,
                            lr=optimizer.param_groups[0]["lr"],
                            )
                
                    # 2. 保存模型权重 save model checkpoint
                    if save_ckpt_callback is not None:
                        save_ckpt_callback(global_step, model.state_dict(), metric=val_acc)

                    # 3. 早停 Early Stop
                    if early_stop_callback is not None:
                        early_stop_callback(val_acc)
                        if early_stop_callback.early_stop:
                            print(f"Early stop at epoch {epoch_id} / global_step {global_step}")
                            return record_dict
                    
                # udate step
                global_step += 1
                pbar.update(1)
                pbar.set_postfix({"epoch": epoch_id})
        
    return record_dict
        

epoch = 20

model = CNN(num_classes=10)

# 1. 定义损失函数 采用交叉熵损失
loss_fct = nn.CrossEntropyLoss()
# 2. 定义优化器 采用 adam
# Optimizers specified in the torch.optim package
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 1. tensorboard 可视化
if not os.path.exists("runs"):
    os.mkdir("runs")
tensorboard_callback = TensorBoardCallback("runs/cifar-10")
tensorboard_callback.draw_model(model, [1, 3, IMAGE_SIZE, IMAGE_SIZE])
# 2. save best
if not os.path.exists("checkpoints"):
    os.makedirs("checkpoints")
save_ckpt_callback = SaveCheckpointsCallback("checkpoints/cifar-10", save_step=len(train_dl), save_best_only=True)
# 3. early stop
early_stop_callback = EarlyStopCallback(patience=5)

model = model.to(device)


In [None]:
record = training(
    model,
    train_dl,
    eval_dl,
    epoch,
    loss_fct,
    optimizer,
    tensorboard_callback=None,
    save_ckpt_callback=save_ckpt_callback,
    early_stop_callback=early_stop_callback,
    eval_step=len(train_dl)
    )

In [None]:
#画线要注意的是损失是不一定在零到1之间的
def plot_learning_curves(record_dict, sample_step=500):
    # build DataFrame
    train_df = pd.DataFrame(record_dict["train"]).set_index("step").iloc[::sample_step]
    val_df = pd.DataFrame(record_dict["val"]).set_index("step")

    # plot
    fig_num = len(train_df.columns)
    fig, axs = plt.subplots(1, fig_num, figsize=(5 * fig_num, 5))
    for idx, item in enumerate(train_df.columns):    
        axs[idx].plot(train_df.index, train_df[item], label=f"train_{item}")
        axs[idx].plot(val_df.index, val_df[item], label=f"val_{item}")
        axs[idx].grid()
        axs[idx].legend()
        # axs[idx].set_xticks(range(0, train_df.index[-1], 5000))
        # axs[idx].set_xticklabels(map(lambda x: f"{int(x/1000)}k", range(0, train_df.index[-1], 5000)))
        axs[idx].set_xlabel("step")
    
    plt.show()

plot_learning_curves(record, sample_step=10)  #横坐标是 steps

In [None]:
# dataload for evaluating

# load checkpoints
model.load_state_dict(torch.load("checkpoints/cifar-10/best.ckpt", map_location="cpu"))

model.eval()
loss, acc = evaluating(model, eval_dl, loss_fct)
print(f"loss:     {loss:.4f}\naccuracy: {acc:.4f}")

# 评估

In [20]:
# dataload for evaluating

# load checkpoints
model.load_state_dict(torch.load("checkpoints/cifar-10/best.ckpt", map_location="cpu"))

model.eval()
loss, acc = evaluating(model, eval_dl, loss_fct)
print(f"loss:     {loss:.4f}\naccuracy: {acc:.4f}")

  model.load_state_dict(torch.load("checkpoints/cifar-10/best.ckpt", map_location="cpu"))


FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints/cifar-10/best.ckpt'

# 推理

In [None]:
# test_df
test_ds = Cifar10Dataset("test", transform=transforms_eval)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, drop_last=False)

preds_collect = [] # 预测结果收集器
model.eval()
for data, fake_label in tqdm(test_dl):
    data = data.to(device=device)
    logits = model(data) #得到预测结果
    preds = [test_ds.idx_to_label[idx] for idx in logits.argmax(axis=-1).cpu().tolist()] # 得到预测类别，idx_to_label是id到字符串类别的映射
    preds_collect.extend(preds)
    
test_df["label"] = preds_collect # 增加预测类别列,比赛要求这一列是label
test_df.head()

In [None]:
64*4688

In [None]:
# 导出 submission.csv
test_df.to_csv("submission.csv", index=False)