# Quick Tips

* [Mixed Precision Training (FP16)](https://youtu.be/ks3oZ7Va8HU) video
* [How to get a Progress Bar](https://youtu.be/RKHopFfbPao) video
* [Reproducible Results and Deterministic Behavior](https://youtu.be/1SZocGaCAr8) video
* [Calculate Mean and Standard Deviation of Data](https://youtu.be/y6IEcEBRZks) video
* [Weight Initialization](https://youtu.be/xWQ-p_o0Uik) video
* [Using a Learning Rate Scheduler](https://youtu.be/P31hB37g4Ak) video

The code below is adapted from the [Convolutional Neural Network example](https://colab.research.google.com/drive/18iSXhfQQwRdoLvHuxombXs4L7AC4C611).

[PyTorch initializations docs](https://pytorch.org/docs/stable/nn.init.html)

[PyTorch `ReduceLROnPlateau` learning-rate scheduler docs](https://pytorch.org/docs/stable/optim.html?highlight=scheduler#torch.optim.lr_scheduler.ReduceLROnPlateau)

In [13]:
import os

data_dir = '/content/dataset/MNIST/raw/'
# if os.path.exists(data_dir):
#     !rm -rf $data_dir

!mkdir -p $data_dir

!wget --directory-prefix=$data_dir https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/t10k-images-idx3-ubyte.gz
!wget --directory-prefix=$data_dir https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/t10k-labels-idx1-ubyte.gz
!wget --directory-prefix=$data_dir https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/train-images-idx3-ubyte.gz
!wget --directory-prefix=$data_dir https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/train-labels-idx1-ubyte.gz

--2021-03-19 13:16:49--  https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/t10k-images-idx3-ubyte.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/golbin/TensorFlow-MNIST/master/mnist/data/t10k-images-idx3-ubyte.gz [following]
--2021-03-19 13:16:49--  https://raw.githubusercontent.com/golbin/TensorFlow-MNIST/master/mnist/data/t10k-images-idx3-ubyte.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1648877 (1.6M) [application/octet-stream]
Saving to: ‘/content/dataset/MNIST/raw/t10k-images-idx3-ubyte.gz.2’


2021-03-19 13:16:49 (37.2 MB/s) - ‘/content/dataset/MNIST/raw/t10k-imag

In [14]:
import os
import torch
import random
import numpy as np
import torch.nn as nn
import multiprocessing
import torch.optim as optim
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from tqdm.notebook import tqdm
from torch.utils.data import DataLoader


# Reproducible results and deterministic behavior.
# Note: deterministic mode is slower, so use it for debugging rather than for real training runs.
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's ``random``,
    NumPy and PyTorch (CPU and CUDA), and switches cuDNN to deterministic,
    non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator, applied in a fixed order.
    for seed_fn in (random.seed,
                    np.random.seed,
                    torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

In [15]:
# Reproducibility check
#
# Seeding is left disabled here because deterministic mode slows everything down;
# uncomment the call below only while debugging.
# seed_everything(seed=42)

# Expected to print identical values on every run once the seed call above is enabled.
print(torch.rand(2, 2))

tensor([[0.0019, 0.7173],
        [0.5282, 0.7903]])


In [28]:
# Convolutional neural network (CNN) used throughout this notebook
class CNN(nn.Module):
    """Two conv/pool stages followed by a single linear classifier.

    Both convolutions are "same" convolutions (3x3 kernel, stride 1, padding 1),
    and each is followed by ReLU and a 2x2 max-pool that halves the spatial size.
    The linear layer expects 16*7*7 features, i.e. a 28x28 input image.

    Args:
        in_channels: Number of channels in the input images.
        num_classes: Number of output scores produced per image.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()
        # Shared settings of the two "same" convolutions.
        same_conv = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

        self.conv1 = nn.Conv2d(in_channels, 8, **same_conv)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(8, 16, **same_conv)
        self.fc1 = nn.Linear(16 * 7 * 7, num_classes)

        self.initialize_weights()

    def forward(self, x):
        """Return the raw class scores, one row per input image."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten everything except the batch dimension before the classifier.
        flat = x.reshape(x.shape[0], -1)
        return self.fc1(flat)

    def initialize_weights(self):
        """Re-initialize the weights of every registered sub-module by layer type.

        Biases of convolution and linear layers are not touched here.
        """
        # self.modules() walks everything PyTorch tracks on this module:
        # self.conv1, self.conv2, self.pool, self.fc1, etc.
        for layer in self.modules():
            if isinstance(layer, nn.Linear):
                # weights drawn from a normal with mean 0.0 and standard deviation 0.02
                nn.init.normal_(layer.weight.data, mean=0.0, std=0.02)

            elif isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)

            elif isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d)):
                nn.init.kaiming_uniform_(layer.weight)
                # Conv biases keep their default initialization. Note from the author:
                # when a conv feeds a BatchNorm layer, create the conv with bias=False.


# Hyperparameters
in_channel = 1
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 5


# Check if it runs correctly: push one random batch of 28x28 images through a
# throwaway CPU model built from the hyperparameters above.
model = CNN(in_channels=in_channel, num_classes=num_classes)
x = torch.randn(batch_size, in_channel, 28, 28)

# Run model on the input and print the shape (expected: batch_size x num_classes)
print(model(x).shape)


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Load data
train_dataset = datasets.MNIST(root='dataset/', train=True, transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = datasets.MNIST(root='dataset/', train=False, transform=transforms.ToTensor(), download=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the reported metrics repeatable.
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


# Initialize network (replaces the sanity-check model above with one on the target device)
model = CNN(in_channels=in_channel, num_classes=num_classes).to(device)


# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
# if loss has not decreased for {patience} epochs,
# then lower the learning rate by the factor of {factor} (divide by 10 by default)
# NOTE: with patience=5 the scheduler can only trigger when num_epochs is larger than 5.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5, verbose=True)


# Mixed Precision Training (FP16)
# Set the scaler before the training loop. It is only enabled when training on CUDA;
# on CPU a default-constructed GradScaler just warns and disables itself.
scaler = torch.cuda.amp.GradScaler(enabled=(device.type == 'cuda'))

torch.Size([64, 10])


In [29]:
# Check accuracy and loss on a data loader to see how good the model is
def check_accuracy(loader, model, msg=False):
    """Evaluate ``model`` on every batch yielded by ``loader``.

    Uses the notebook-level ``device`` and ``criterion``. The model is put into
    evaluation mode for the duration of the call and switched back to training
    mode afterwards, even if evaluation raises.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches. When ``msg`` is
            true it must also expose ``loader.dataset.train``.
        model: Network producing one row of class scores per input sample.
        msg: If true, print which split was evaluated and the accuracy summary.

    Returns:
        ``(loss, acc)``: ``loss`` is a 0-dim tensor holding the sample-weighted
        average of ``criterion`` over the whole loader (not just the last batch);
        ``acc`` is the accuracy as a float percentage.
        Assumes ``criterion`` returns a per-batch mean, as ``nn.CrossEntropyLoss()``
        does by default.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    num_correct = 0
    num_samples = 0
    total_loss = 0.0
    model.eval()  # set model to evaluation mode

    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=device)
                y = y.to(device=device)

                scores = model(x)  # shape = batch x num_classes
                _, predictions = scores.max(dim=1)  # get index of max value
                batch_count = predictions.size(0)

                num_correct += (predictions == y).sum()
                num_samples += batch_count
                # Weight each batch mean by its size so a short final batch
                # does not skew the average.
                total_loss += criterion(scores, y) * batch_count
    finally:
        model.train()

    if num_samples == 0:
        raise ValueError('check_accuracy() received a loader with no samples')

    acc = float(num_correct) / float(num_samples) * 100
    loss = total_loss / num_samples

    if msg:
        if loader.dataset.train:
            print('Checking accuracy on training data')
        else:
            print('Checking accuracy on test data')
        print(f'Got {num_correct} / {num_samples} with accuracy {acc:.2f}')

    return loss, acc

In [32]:
# Train network
# Author's timings: without scaler ≈ 42 sec, with scaler ≈ 35 sec.
# float16 training is faster than float32 and saves video memory.

model.train()

for epoch in range(num_epochs):
    batch_losses = []
    # enumerate() is kept on purpose: it creates the train-loader iterator here,
    # before the evaluation pass below runs.
    progress = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)

    # Label the progress bar with the test metrics measured before this epoch starts
    test_loss, test_acc = check_accuracy(test_loader, model)
    progress.set_description(f'Epoch [{epoch}/{num_epochs-1}]')
    progress.set_postfix(loss=f'{test_loss.item():.4f}', acc=test_acc)

    for step, (data, targets) in progress:
        # Move the batch to CUDA if available
        data = data.to(device)
        targets = targets.to(device)

        # Forward pass under autocast (mixed precision)
        with torch.cuda.amp.autocast():
            scores = model(data)  # shape: batch x num_classes
            loss = criterion(scores, targets)
        batch_losses.append(loss.item())

        # Backward pass: scale the loss, step the optimizer through the scaler,
        # then let the scaler update its scale factor
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # The scheduler watches the mean training loss of the epoch
    mean_loss = sum(batch_losses) / len(batch_losses)
    scheduler.step(mean_loss)
    print(f'Cost at epoch {epoch} is {mean_loss}')

HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 0 is 0.08254332036705318


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 1 is 0.06514953039457469


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 2 is 0.055276696863117564


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 3 is 0.04830626049327221


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 4 is 0.0437837772299476


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 5 is 0.038128814792376196


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 6 is 0.034593099064113245


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 7 is 0.031574167256396925


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 8 is 0.029058264678704423


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 9 is 0.027593498854877562


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 10 is 0.02433233348298262


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 11 is 0.022210699156268505


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 12 is 0.020940384802639114


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 13 is 0.01997516350521245


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 14 is 0.018160848264327845


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 15 is 0.01775803858750662


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 16 is 0.01546666350165471


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 17 is 0.014878662274591302


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 18 is 0.013472166748876661


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 19 is 0.012434035786381326


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 20 is 0.01275260127433308


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 21 is 0.011391984550726209


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 22 is 0.011367540252851312


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 23 is 0.00960079415499771


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 24 is 0.010299776951632124


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 25 is 0.008621977736077561


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 26 is 0.009318203157081158


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 27 is 0.008134802580627215


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 28 is 0.007290482131027404


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

Cost at epoch 29 is 0.006560610214166914


In [33]:
# Report final accuracy on both splits; the returned (loss, acc) pairs are discarded.
_ = check_accuracy(train_loader, model, msg=True)
_ = check_accuracy(test_loader, model, msg=True)

Checking accuracy on training data
Got 59831 / 60000 with accuracy 99.72
Checking accuracy on test data
Got 9843 / 10000 with accuracy 98.43


## Calculate Mean and Standard Deviation of Data

In [20]:
# Get mean and standard deviation
def get_mean_std(loader):
    """Compute the per-channel mean and standard deviation over a whole loader.

    Every batch is weighted by the number of values it contributes per channel,
    so a smaller final batch does not bias the result.

    Args:
        loader: Iterable yielding ``(data, _)`` pairs, where ``data`` is a
            floating-point tensor reduced over dims 0, 2 and 3 (dim 1 is the
            channel dimension).

    Returns:
        ``(mean, std)``: two tensors with one entry per channel.

    Raises:
        ValueError: If ``loader`` yields no data.
    """
    channels_sum, channels_squared_sum, num_values = 0, 0, 0

    for data, _ in loader:
        # Number of values per channel in this batch (everything except dim=1).
        values_per_channel = data.numel() // data.size(1)

        # don't reduce across channels (dim=1)
        channels_sum += torch.mean(data, dim=[0, 2, 3]) * values_per_channel
        channels_squared_sum += torch.mean(data**2, dim=[0, 2, 3]) * values_per_channel
        num_values += values_per_channel

    if num_values == 0:
        raise ValueError('get_mean_std() received a loader with no data')

    mean = channels_sum / num_values
    # std[x] = (E[x**2] - E[x]**2) ** 0.5
    variance = channels_squared_sum / num_values - mean ** 2
    # Rounding can push a near-zero variance slightly negative; clamp to avoid NaN.
    std = variance.clamp(min=0) ** 0.5

    return mean, std


# Per-channel statistics of the MNIST training set
mean, std = get_mean_std(train_loader)
print('MNIST dataset', f'mean: {mean}', f'std: {std}', sep='\n')

MNIST dataset
mean: tensor([0.1307])
std: tensor([0.3081])


In [21]:
# CIFAR10 training split, converted to tensors (downloaded into ./cifar10 if missing)
cifar10_dataset = datasets.CIFAR10(
    root='cifar10',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
# One worker process per CPU core feeds the batches
cifar10_loader = DataLoader(
    cifar10_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=multiprocessing.cpu_count(),
    pin_memory=True,
)

# Per-channel statistics of the CIFAR10 training set
mean, std = get_mean_std(cifar10_loader)
print('CIFAR10 dataset', f'mean: {mean}', f'std: {std}', sep='\n')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to cifar10/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))


Extracting cifar10/cifar-10-python.tar.gz to cifar10
CIFAR10 dataset
mean: tensor([0.4915, 0.4822, 0.4466])
std: tensor([0.2471, 0.2435, 0.2616])
