In [None]:
!nvidia-smi

Wed Apr 12 10:47:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# 数据增广

> 在开始之前记得把`models.py`扔进来

由于之前 0 到 3 的小 demo 最终得到的准确率都不理想（测试过的模型中最好的 AlexNet 也只有 0.62），这里考虑可能是数据量太少的问题，于是通过数据增广来增加数据量。


## ask chatgpt

> pytorch CIFAR10任务，我有train_data（50000，3，32，32）,train_labels（50000）,test_data（10000，3，32，32）,test_labels（10000）四个tensor，你可以告诉我如何使用transform来进行数据增广吗，比如重塑宽高、翻转、张量化、规范化

当我们进行深度学习任务时，使用数据增强技术可以提高模型的泛化能力。在 PyTorch 中，我们可以使用 transforms 模块来对数据进行增强。下面是一个使用 transforms 对 CIFAR10 数据集进行数据增强的示例：

在下面的代码中，transform_train 定义了训练数据的增强方式，包括随机水平翻转、随机裁剪、张量化、规范化等步骤。transform_test 定义了测试数据的预处理方式，只包括张量化和规范化，不做任何随机增强。

通过 transforms.Compose 将多个 transform 组合起来，形成一个 transform 序列。在加载数据集时，将这个 transform 序列传递给 transform 参数即可实现数据增强。

使用 transforms.RandomCrop() 可以进行裁剪操作，使用 transforms.RandomHorizontalFlip() 可以进行随机水平翻转。对于张量化和规范化，使用 transforms.ToTensor() 将数据转换成 tensor 格式（像素值缩放到 [0, 1]），并使用 transforms.Normalize() 按给定的均值和标准差逐通道规范化；这里均值和标准差都取 0.5，相当于把 [0, 1] 的像素值线性映射到 [-1, 1]。

在下面的代码中，训练数据和测试数据使用了不同的 transform，因为测试数据不需要进行数据增强，只需要进行张量化和规范化即可。

最后，使用 DataLoader 加载数据集时，可以设置 batch_size 和 shuffle 等参数，进行批处理和打乱数据的操作。


In [None]:
import torch
import torchvision
import torchvision.transforms as transforms

# Training pipeline: random horizontal flip and random 32x32 crop (after 4 px
# of padding), then tensor conversion and per-channel normalisation.
transform_train = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Test pipeline: no random augmentation, only tensor conversion and the same
# normalisation as the training pipeline.
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# CIFAR10 training split, shuffled, served in batches of 128.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
)

# CIFAR10 test split, kept in dataset order.
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:13<00:00, 12989298.57it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# some test here
trainset.data[0].shape

(32, 32, 3)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _gather_batches(loader, device):
    """Materialise every batch of ``loader`` as one (inputs, targets) pair.

    Each batch is moved to ``device`` and kept in a list; the lists are
    concatenated along dim 0 once at the end. Concatenating inside the loop
    would re-copy the growing tensor on every iteration.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        device: Device the gathered tensors are placed on.

    Returns:
        Tuple ``(all_inputs, all_targets)`` holding every batch in iteration
        order.
    """
    input_chunks, target_chunks = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            input_chunks.append(inputs.to(device))
            target_chunks.append(targets.to(device))
    return torch.cat(input_chunks, dim=0), torch.cat(target_chunks, dim=0)


whole_test_inputs, whole_test_targets = _gather_batches(testloader, device)
# trainloader shuffles and applies the random flip/crop of transform_train, so
# this is one randomly augmented, randomly ordered pass over the training set.
whole_train_inputs, whole_train_targets = _gather_batches(trainloader, device)

whole_train_inputs.shape,whole_test_inputs.shape

(torch.Size([50000, 3, 32, 32]), torch.Size([10000, 3, 32, 32]))

tensor([[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         ...,
         [ 0.6706,  0.5529,  0.4588,  ..., -1.0000, -1.0000, -1.0000],
         [ 0.6706,  0.4510,  0.3176,  ..., -1.0000, -1.0000, -1.0000],
         [ 0.5843,  0.3882,  0.2314,  ..., -1.0000, -1.0000, -1.0000]],

        [[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         ...,
         [ 0.3647,  0.2314,  0.1294,  ..., -1.0000, -1.0000, -1.0000],
         [ 0.3725,  0.1294, -0.0039,  ..., -1.0000, -1.0000, -1.0000],
         [ 0.2863,  0.0667, -0.0824,  ..., -1.0000, -1.0000, -1.0000]],

        [[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
         [-1.0000, -1.0000, -1.0000,  ..., -1

## 定义模型

In [None]:
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.2


In [None]:
from models import AlexNet
from torchinfo import summary
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build the network and place it on the selected device.
model = AlexNet()
model.to(device)

# Layer-by-layer summary for a batch of 10 inputs of size 3x32x32.
summary(model, input_size=(10, 3, 32, 32))

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                   Output Shape              Param #
AlexNet                                  [10, 10]                  --
├─Sequential: 1-1                        [10, 256, 1, 1]           --
│    └─Conv2d: 2-1                       [10, 64, 17, 17]          1,792
│    └─ReLU: 2-2                         [10, 64, 17, 17]          --
│    └─MaxPool2d: 2-3                    [10, 64, 8, 8]            --
│    └─Conv2d: 2-4                       [10, 192, 8, 8]           307,392
│    └─ReLU: 2-5                         [10, 192, 8, 8]           --
│    └─MaxPool2d: 2-6                    [10, 192, 3, 3]           --
│    └─Conv2d: 2-7                       [10, 384, 3, 3]           663,936
│    └─ReLU: 2-8                         [10, 384, 3, 3]           --
│    └─Conv2d: 2-9                       [10, 256, 3, 3]           884,992
│    └─ReLU: 2-10                        [10, 256, 3, 3]           --
│    └─Conv2d: 2-11                      [10, 256, 3, 3]           

## 开始训练

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from models import LeNet, AlexNet
from tqdm import tqdm
import os
import numpy as np


class ArgumentConfig:
    """Plain container for the run settings read by the rest of the notebook.

    Attributes:
        lr: Learning rate handed to the optimizer.
        resume: Whether to restore the model from the saved checkpoint.
        model: Model name; used as the TensorBoard run sub-directory.
        epochs: Number of epochs the training loop runs.
        is_print: Whether ``train`` prints progress lines.
        print_every: Print a progress line every this many batches.
    """

    def __init__(self, lr=0.001, resume=False, model='AlexNet',
                 epochs=20, is_print=True, print_every=100):
        # Optimisation and checkpoint settings.
        self.lr, self.resume = lr, resume
        # What to train and for how long.
        self.model, self.epochs = model, epochs
        # Console logging behaviour.
        self.is_print, self.print_every = is_print, print_every


# Notebook-wide configuration instance using the defaults above.
args = ArgumentConfig()


In [None]:
# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# Learning-rate schedule: multiply the LR by 0.1 every 20 scheduler steps
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

# TensorBoard writer; one run directory per model name
writer = SummaryWriter(log_dir='runs/' + args.model)

# Use the GPU when available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Defaults so these names exist whether or not a checkpoint is restored.
best_acc = 0.0
start_epoch = 0

# Optionally restore the model from the saved checkpoint
if args.resume:
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU runtime can still be loaded on a CPU-only one.
    checkpoint = torch.load('./checkpoint/ckpt.pth', map_location=device)
    model.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']

# 定义训练函数
def train(epoch):
    """Run one optimisation pass over ``trainloader`` and log the epoch metrics.

    Uses the notebook-level ``model``, ``trainloader``, ``optimizer``,
    ``criterion``, ``writer``, ``args`` and ``device``.

    Args:
        epoch: Index of the current epoch; shown in the progress lines and
            used as the step for the two scalars written to ``writer``.
    """
    model.train()
    running_loss, n_correct, n_seen = 0, 0, 0

    for step, (inputs, targets) in enumerate(trainloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard update: clear gradients, forward, backward, step.
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Running totals for the epoch-level loss and accuracy.
        running_loss += batch_loss.item()
        n_seen += targets.size(0)
        n_correct += (logits.argmax(dim=1) == targets).sum().item()

        if step % args.print_every == 0 and args.is_print:
            progress_line = 'Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.3f}%'.format(
                epoch,
                step * len(inputs),
                len(trainloader.dataset),
                100. * step / len(trainloader),
                batch_loss.item(),
                100. * n_correct / n_seen,
            )
            print(progress_line)

    # step is the index of the last batch, so step + 1 is the batch count.
    writer.add_scalar('Train/Loss', running_loss / (step + 1), epoch)
    writer.add_scalar('Train/Acc', 100. * n_correct / n_seen, epoch)

# 定义测试函数
def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            predicted = torch.argmax(outputs, dim=1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    writer.add_scalar('Test/Loss', test_loss / (batch_idx + 1), epoch)
    writer.add_scalar('Test/Acc', 100. * correct / total, epoch)
    return 100. * correct / total


In [None]:
# Create the checkpoint directory once, before training starts; exist_ok makes
# this a no-op when the directory is already there.
os.makedirs('checkpoint', exist_ok=True)

# NOTE(review): the loop always starts at epoch 0, so re-running this cell logs
# over the same TensorBoard steps while the model keeps training.
for epoch in range(args.epochs):
    train(epoch)
    acc = test(epoch)
    scheduler.step()
    # Save the model: the single checkpoint file is overwritten after every
    # epoch, so it always holds the most recent weights and test accuracy.
    state = {
        'net': model.state_dict(),
        'acc': acc,
        'epoch': epoch,
    }
    torch.save(state, './checkpoint/ckpt.pth')



> 在7分钟的训练之后达到了64.732%的准确率

## 训练训练训练

In [None]:
# for epoch in range(args.epochs):
#     train(epoch)
#     acc = test(epoch)
#     scheduler.step()
#     # 保存模型
#     state = {
#         'net': model.state_dict(),
#         'acc': acc,
#         'epoch': epoch,
#     }
#     if not os.path.isdir('checkpoint'):
#         os.mkdir('checkpoint')
#     torch.save(state, './checkpoint/ckpt.pth')

> 在CPU上整整花了1小时45分钟2秒来跑10个epoch，好想吐槽一句果然还是没有gpu不行，哭死。
> 在GPU上用了7分41秒来跑20个epoch，准确率达到了68.882%

In [None]:
# for epoch in range(args.epochs):
#     train(epoch)
#     acc = test(epoch)
#     scheduler.step()
#     # 保存模型
#     state = {
#         'net': model.state_dict(),
#         'acc': acc,
#         'epoch': epoch,
#     }
#     if not os.path.isdir('checkpoint'):
#         os.mkdir('checkpoint')
#     torch.save(state, './checkpoint/ckpt.pth')

In [None]:
# for epoch in range(args.epochs):
#     train(epoch)
#     acc = test(epoch)
#     scheduler.step()
#     # 保存模型
#     state = {
#         'net': model.state_dict(),
#         'acc': acc,
#         'epoch': epoch,
#     }
#     if not os.path.isdir('checkpoint'):
#         os.mkdir('checkpoint')
#     torch.save(state, './checkpoint/ckpt.pth')

## tensorboard可视化

In [None]:
# %load_ext tensorboard

In [None]:
# %tensorboard --logdir runs --port 6666

## 查看训练集和测试集上的准确率

In [None]:
# Switch to inference mode explicitly so the count does not depend on whether
# the model was last left in train or eval mode by an earlier cell.
model.eval()

total = 0
correct = 0
with torch.no_grad():
    for inputs, targets in testloader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        predicted = torch.argmax(outputs, dim=1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
total,correct

(10000, 6755)

In [None]:
# Switch to inference mode explicitly so the count does not depend on whether
# the model was last left in train or eval mode by an earlier cell.
model.eval()

# trainloader applies the random flip/crop of transform_train, so this counts
# correct predictions on randomly augmented training images.
total = 0
correct = 0
with torch.no_grad():
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        # predicted / targets keep the values of the last batch after the loop.
        predicted = torch.argmax(outputs, dim=1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
total,correct

(50000, 33480)

In [None]:
predicted

tensor([6, 5, 2, 0, 3, 0, 4, 0, 4, 1, 1, 8, 9, 9, 5, 1, 9, 6, 9, 8, 0, 8, 8, 1,
        4, 7, 8, 9, 5, 5, 9, 3, 9, 8, 1, 4, 3, 5, 6, 9, 9, 9, 5, 8, 5, 2, 0, 7,
        4, 8, 1, 3, 0, 4, 1, 6, 8, 3, 2, 4, 1, 0, 5, 3, 7, 3, 7, 3, 5, 6, 2, 2,
        3, 1, 5, 0, 3, 9, 6, 9], device='cuda:0')

In [None]:
targets

tensor([6, 5, 2, 8, 2, 0, 2, 1, 7, 1, 1, 8, 2, 1, 5, 1, 9, 6, 9, 8, 0, 8, 9, 1,
        4, 5, 8, 9, 3, 3, 0, 4, 9, 8, 1, 7, 6, 5, 4, 9, 1, 9, 2, 8, 3, 0, 0, 7,
        3, 8, 1, 3, 8, 7, 0, 3, 1, 3, 2, 4, 1, 2, 5, 5, 7, 3, 7, 7, 3, 6, 4, 2,
        7, 9, 5, 0, 6, 9, 6, 0], device='cuda:0')