In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.utils import make_grid
import time
import os
import numpy as np
import torch.nn.functional as F

class FCN(nn.Module):
    def __init__(self, num_classes=10):
        super(FCN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        # GAP前卷积层，输出通道数应与类别数(10个数字)相同
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=num_classes, kernel_size=3, padding=1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        #x = self.dropout(x)
        x = F.relu(self.conv2(x))
        #x = self.dropout(x)
        x = F.relu(self.conv3(x))
        #x = self.dropout(x)
        x = F.relu(self.conv4(x))
        #x = self.dropout(x)
        x = self.conv5(x)  # 最后一层卷积不使用ReLU激活
        
        # 使用全局平均池化
        x = nn.functional.adaptive_avg_pool2d(x, (1, 1))  # 输出形状变为(B, C, 1, 1)
        
        # 将四维张量展平为二维，以便进行损失计算
        x = x.view(x.size(0), -1)
        return x

# 定义训练函数
def train(model:FCN, device:torch.device, writer_epoch:SummaryWriter, data_loader:DataLoader, optimizer:optim, criterion, epoch)->None:
    watch_batch_size = 100
    model.train()
    avg_loss = 0.0
    for batch_idx, (data, target) in enumerate(data_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()
        if batch_idx % watch_batch_size == (watch_batch_size - 1):  # 每100个batch打印一次
            avg_loss =  avg_loss / watch_batch_size
            writer_epoch.add_scalars('training loss pre batch', {f'avg_loss{epoch+1}': avg_loss}, batch_idx + 1)
            print(f'Epoch {epoch + 1}, Batch [{batch_idx + 1}/{len(data_loader)}], AvgLoss: {avg_loss:.4f}')
            avg_loss = 0.0

# 定义验证函数
def evaluate(model:FCN, device:torch.device, data_loader:DataLoader, criterion) ->tuple[float, float]:
    data_len = 0
    model.eval()
    loss = 0.0
    correct_count = 0.0
    first_batch = next(iter(data_loader))[0]
    # print('first_batch.shape', first_batch.shape)
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            # print(target.shape) # 一个64维的行向量，代表整个64张图的batch中每个图的代表的数字
            # print(data.shape)
            output = model(data)
            # print(output.shape) # 64行10列表示这个batch中的64张图对应到每个数字（0-9）的概率。
            loss += criterion(output, target).item()  # 累积损失
            data_len += 1
            pred = output.argmax(dim=1, keepdim=True)
            correct_count += pred.eq(target.view_as(pred)).sum().item()
    loss = loss / data_len
    accuracy = correct_count / len(data_loader.dataset)
    return accuracy, loss, first_batch

def predict4HandWritten(model:FCN, validate_loader:DataLoader, device:torch.device):
    predict_list = []
    with torch.no_grad():
        for data, target in validate_loader:
            i = 0
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            predict_res = [data[i], target[i]]
    return

def main():
    # 设置设备
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Using device: {device}')

    # 实例化模型
    # model = FCN().to(device)
    # checkpoint = torch.load('mnist_fcn.pth')
    # model.load_state_dict(checkpoint)
    
    # 加载MNIST数据集
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, pin_memory = True, pin_memory_device=device.type, num_workers=4)

    validate_dataset = datasets.MNIST('./data', train=False, transform=transform)
    validate_loader = DataLoader(validate_dataset, batch_size=64, shuffle=True, pin_memory = True, pin_memory_device=device.type, num_workers=4)

    model = FCN().to(device)
    model_name = 'FCN'
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    
    # 获取当前时间戳
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    log_dir = f"runs/{model_name}_experiment_{timestamp}"
    os.makedirs(log_dir, exist_ok=True)
    
    # 添加模型结构到TensorBoard
    writer_model = SummaryWriter(log_dir)
    data_iter = iter(train_loader)
    images, _ = next(data_iter)
    writer_model.add_graph(model, images.to(device))
    
    # 训练和验证循环
    num_epochs = 5
    for epoch in range(num_epochs):
        # 初始化writer_epoch
        writer_epoch = SummaryWriter(log_dir)
        
        epoch_start_time = time.time()
        
        train(model, device, writer_epoch, train_loader, optimizer, criterion, epoch)

        epoch_end_time = time.time()
        epoch_duration = epoch_end_time - epoch_start_time
        writer_epoch.add_scalar('epoch_duration (seconds)', epoch_duration, epoch+1)
        
        train_accuracy, train_loss, batch0 = evaluate(model, device, train_loader, criterion)
        validate_accuracy, validate_loss, _ = evaluate(model, device, validate_loader, criterion)
    
        writer_epoch.add_scalars('accurancy', {'train accurancy': train_accuracy, 
                                                'validate accurancy':validate_accuracy},global_step=epoch+1)
        writer_epoch.add_scalars('loss', {'train loss': train_loss, 
                                                    'validate loss':validate_loss},global_step=epoch+1)

        img_grid = make_grid(batch0, nrow=8, normalize=False, scale_each=False)
        writer_epoch.add_image(f'batch0_imgs{epoch+1}', img_grid, global_step=None)
    # 保存训练模型和参数
    torch.save(model.state_dict(), 'mnist_fcn.pth')
    print(f"Model save to mnist_fcn")
    
    input_shape = (1, 1, 28, 28)  # MNIST图像为28x28像素，单通道
    dummy_input = torch.randn(input_shape).to(device)  # 创建一个虚拟输入张量
    onnx_file_path = "mnist_fcn_onnx.onnx"
    torch.onnx.export(model, dummy_input, onnx_file_path, export_params=True, opset_version=17, do_constant_folding=True)

    print(f"Train finished, Export Model to {onnx_file_path}")

    writer_epoch.close()
    
if __name__ == "__main__":
    main()

Using device: cuda
Epoch 1, Batch [100/938], AvgLoss: 1.8812
Epoch 1, Batch [200/938], AvgLoss: 1.2475
Epoch 1, Batch [300/938], AvgLoss: 0.6816
Epoch 1, Batch [400/938], AvgLoss: 0.4705
Epoch 1, Batch [500/938], AvgLoss: 0.3747
Epoch 1, Batch [600/938], AvgLoss: 0.3031
Epoch 1, Batch [700/938], AvgLoss: 0.2860
Epoch 1, Batch [800/938], AvgLoss: 0.2253
Epoch 1, Batch [900/938], AvgLoss: 0.2196
Model save to mnist_fcn
Train finished, Export Model to mnist_fcn_onnx.onnx
