## basics & summary

- 回顾下 CNN 结构与输入 shape 的适配
    - https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
- `torch.cuda.amp` 怎么用的；
    - fp16: [loss scaling](https://moocaholic.medium.com/fp64-fp32-fp16-bfloat16-tf32-and-other-members-of-the-zoo-a1ca7897d407)
        - https://github.com/mli/transformers-benchmarks/blob/main/transformers.ipynb
    - 极大地提升 batch_size

```
# basic usages

scaler = torch.cuda.amp.GradScaler()
```

## cnn pipeline

In [1]:
import torch
import torch.nn.functional as F  
import torchvision.datasets as datasets  
import torchvision.transforms as transforms  
from torch import optim  
from torch import nn  
from torch.utils.data import DataLoader
from tqdm import tqdm  

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Model

In [8]:
# Simple CNN
class CNN(nn.Module):
    def __init__(self, in_channels=1, num_classes=10):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=5120,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        # /2, downsampling
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(
            in_channels=5120,
            out_channels=10240,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        # (channels*w*h)
            # w, h: 取决于初始的 width, height
        self.fc1 = nn.Linear(10240 * 7 * 7, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        # /2
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        # /2
        x = self.pool(x)
        # 4d => 2d, (bs, features)
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        return x

In [5]:
from torchsummary import summary

In [7]:
model = CNN(in_channels=3)
summary(model, input_size=(3, 224, 224), batch_size=32, device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [32, 8, 224, 224]             224
         MaxPool2d-2          [32, 8, 112, 112]               0
            Conv2d-3         [32, 64, 112, 112]           4,672
         MaxPool2d-4           [32, 64, 56, 56]               0
            Linear-5                   [32, 10]       2,007,050
Total params: 2,011,946
Trainable params: 2,011,946
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 18.38
Forward/backward pass size (MB): 367.50
Params size (MB): 7.67
Estimated Total Size (MB): 393.55
----------------------------------------------------------------


### training pipeline

In [9]:
# 对齐 mnist
in_channels = 1
num_classes = 10

learning_rate = 3e-4 # karpathy's constant
batch_size = 32
num_epochs = 3

In [10]:
train_dataset = datasets.MNIST(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

# train_dataset = datasets.FashionMNIST(
#     root="dataset/", train=True, transform=transforms.ToTensor(), download=True
# )
# test_dataset = datasets.FashionMNIST(
#     root="dataset/", train=False, transform=transforms.ToTensor(), download=True
# )
# train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

In [11]:
batch_x, batch_y = next(iter(train_loader))
print(batch_x.shape, batch_y.shape)

torch.Size([32, 1, 28, 28]) torch.Size([32])


#### float 32

In [None]:
# 对齐 mnist
in_channels = 1
num_classes = 10

learning_rate = 3e-4 # karpathy's constant
batch_size = 128
num_epochs = 3

train_dataset = datasets.MNIST(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)


model = CNN(in_channels=in_channels, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# before training
# scaler = torch.cuda.amp.GradScaler()

def train():
    for epoch in tqdm(range(num_epochs)):
        for batch_idx, (batch_x, batch_y) in tqdm(enumerate(train_loader)):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            
            # forward
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            
            # backward
            optimizer.zero_grad()
            loss.backward()
#             scaler.scale(loss).backward()
            
            # gradient descent
            optimizer.step()
#             scaler.step(optimizer)
#             scaler.update()

def evalute(model, test_loader):
    total_correct = 0
    total_samples = 0
    model.eval()
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            
            logits = model(batch_x)
            _, preds = logits.max(1)
            total_correct += (preds == batch_y).sum()
            total_samples += batch_y.size(0)
    model.train()
    return total_correct/total_samples

train()

In [None]:
print(f"Accuracy on training set: {evalute(model, train_loader)*100:.2f}")
print(f"Accuracy on test set: {evalute(model, test_loader)*100:.2f}")

#### 混合精度训练

In [4]:
in_channels = 1
num_classes = 10

learning_rate = 3e-4 # karpathy's constant
batch_size = 256
num_epochs = 3

train_dataset = datasets.MNIST(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

model = CNN(in_channels=in_channels, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# before training
scaler = torch.cuda.amp.GradScaler()

def train():
    for epoch in tqdm(range(num_epochs)):
        for batch_idx, (batch_x, batch_y) in tqdm(enumerate(train_loader)):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            
            # forward
            with torch.cuda.amp.autocast():
                logits = model(batch_x)
                loss = criterion(logits, batch_y)
            
            # backward
            optimizer.zero_grad()
            # loss scaling
            scaler.scale(loss).backward()
            
            # gradient descent
#             optimizer.step()
            scaler.step(optimizer)
            scaler.update()

def evalute(model, test_loader):
    total_correct = 0
    total_samples = 0
    model.eval()
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            
            logits = model(batch_x)
            _, preds = logits.max(1)
            total_correct += (preds == batch_y).sum()
            total_samples += batch_y.size(0)
    model.train()
    return total_correct/total_samples

train()

  0%|          | 0/3 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:05,  5.97s/it][A
2it [00:10,  5.15s/it][A
3it [00:15,  4.88s/it][A
4it [00:19,  4.75s/it][A
5it [00:24,  4.68s/it][A
6it [00:28,  4.63s/it][A
7it [00:33,  4.61s/it][A
8it [00:37,  4.59s/it][A
9it [00:46,  5.22s/it][A
  0%|          | 0/3 [00:46<?, ?it/s]


KeyboardInterrupt: 

In [None]:
print(f"Accuracy on training set: {evalute(model, train_loader)*100:.2f}")
print(f"Accuracy on test set: {evalute(model, test_loader)*100:.2f}")