## Imports

In [None]:
import argparse
import os
import random
import shutil
import time
import warnings

import numpy as np

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

In [None]:
# Pick the compute device once for the whole notebook: the CUDA device when
# one is available, otherwise the CPU.  ('gpu' is not a valid torch device
# type, so the CPU has to be named explicitly as the fallback.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Training on device {device}.")
print(f"torch.cuda.is_available() = {torch.cuda.is_available()}")

Training on device cuda.
torch.cuda.is_available() = True


## Custom Argparser for Running Notebook

In [None]:
# Command-line options for the training run.  A positional 'data' argument
# (metavar DIR, 'path to dataset') is deliberately NOT registered here.
parser = argparse.ArgumentParser(description='PyTorch SirenNet Training')

# --- schedule ---------------------------------------------------------------
parser.add_argument(
    '--epochs', type=int, default=90, metavar='N',
    help='number of total epochs to run',
)
parser.add_argument(
    '--start-epoch', type=int, default=0, metavar='N',
    help='manual epoch number (useful on restarts)',
)
parser.add_argument(
    '-b', '--batch-size', type=int, default=256, metavar='N',
    help='mini-batch size (default: 256), this is the total '
         'batch size of all GPUs on the current node when '
         'using Data Parallel or Distributed Data Parallel',
)

# --- optimizer --------------------------------------------------------------
parser.add_argument(
    '--lr', '--learning-rate', dest='lr', type=float, default=0.1,
    metavar='LR', help='initial learning rate',
)
parser.add_argument(
    '--momentum', type=float, default=0.9, metavar='M',
    help='momentum',
)
parser.add_argument(
    '--wd', '--weight-decay', dest='weight_decay', type=float, default=1e-4,
    metavar='W', help='weight decay (default: 1e-4)',
)

# --- reporting / run mode ---------------------------------------------------
parser.add_argument(
    '-p', '--print-freq', type=int, default=10, metavar='N',
    help='print frequency (default: 10)',
)
parser.add_argument(
    '-e', '--evaluate', action='store_true', dest='evaluate',
    help='evaluate model on validation set',
)
parser.add_argument(
    '--seed', type=int, default=None,
    help='seed for initializing training. ',
)
parser.add_argument(
    '--gpu', type=int, default=None,
    help='GPU id to use.',
)

_StoreAction(option_strings=['--gpu'], dest='gpu', nargs=None, const=None, default=None, type=<class 'int'>, choices=None, help='GPU id to use.', metavar=None)

## Functions

In [None]:
def train(train_loader, model, criterion, optimizer, epoch, args = None):
    """Train ``model`` for one epoch over ``train_loader``.

    Each batch is moved to the module-level ``device``, pushed through the
    model and used for exactly one optimizer step.  Running loss, top-1 and
    top-5 accuracy are printed every ``print_freq`` batches.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to optimize; it is switched to training mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called
            once per batch.
        epoch: epoch index, only used in the progress prefix.
        args: optional namespace; only ``print_freq`` is read.  When ``args``
            is ``None`` (or has no such attribute) progress is printed every
            10 batches.

    Note:
        Top-5 accuracy is always computed, so the model must output at
        least five classes.
    """
    print_freq = getattr(args, 'print_freq', 10)

    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # Single host-to-device transfer.  The batch always ends up on the
        # module-level ``device``, so no separate per-GPU copy is made first.
        images_ = images.to(device=device, non_blocking=True)
        target_ = target.to(device=device, non_blocking=True)

        # compute output
        output = model(images_)
        loss = criterion(output, target_)

        # measure accuracy and record loss (weighted by the batch size)
        acc1, acc5 = accuracy(output, target_, topk=(1, 5))
        batch_size = images.size(0)
        losses.update(loss.item(), batch_size)
        top1.update(acc1[0], batch_size)
        top5.update(acc5[0], batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            progress.display(i)

In [None]:
def validate(val_loader, model, criterion, args = None):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Each batch is moved to the module-level ``device``; running loss, top-1
    and top-5 accuracy are printed every ``print_freq`` batches, followed by
    a one-line summary.

    Args:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: network to evaluate; it is switched to evaluation mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.
        args: optional namespace; only ``print_freq`` is read.  When ``args``
            is ``None`` (or has no such attribute) progress is printed every
            10 batches.

    Returns:
        The running top-1 accuracy average (in percent) over all batches.

    Note:
        Top-5 accuracy is always computed, so the model must output at
        least five classes.
    """
    print_freq = getattr(args, 'print_freq', 10)

    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            # Single host-to-device transfer.  The batch always ends up on
            # the module-level ``device``, so no per-GPU copy is made first.
            images_ = images.to(device=device, non_blocking=True)
            target_ = target.to(device=device, non_blocking=True)

            # compute output
            output = model(images_)
            loss = criterion(output, target_)

            # measure accuracy and record loss (weighted by the batch size)
            acc1, acc5 = accuracy(output, target_, topk=(1, 5))
            batch_size = images.size(0)
            losses.update(loss.item(), batch_size)
            top1.update(acc1[0], batch_size)
            top5.update(acc5[0], batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                progress.display(i)
        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg

In [None]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar',
                    best_filename='model_best.pth.tar'):
    """Serialize ``state`` to ``filename`` and optionally keep a "best" copy.

    Args:
        state: object to serialize with ``torch.save`` (typically a dict of
            state dicts and bookkeeping values).
        is_best: when true, the file just written is also copied to
            ``best_filename``.
        filename: destination path of the checkpoint.
        best_filename: destination path of the best-model copy.  The default
            keeps the previous behavior (``'model_best.pth.tar'`` in the
            current working directory).
    """
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, best_filename)

In [None]:
def adjust_learning_rate(optimizer, epoch, args):
    """Apply a step schedule: ``args.lr`` divided by 10 for every 30 completed epochs.

    The resulting rate is written into the ``'lr'`` entry of every parameter
    group of ``optimizer``.
    """
    completed_steps = epoch // 30
    decayed_lr = args.lr * (0.1 ** completed_steps)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr

In [None]:
def accuracy(output, target, topk=(1,)):
    """Return the top-k accuracies (in percent) for each k in ``topk``.

    ``output`` holds per-class scores with one row per sample and ``target``
    the true class index of each sample.  The result is a list with one
    single-element tensor per requested k, in the same order as ``topk``.
    """
    with torch.no_grad():
        largest_k = max(topk)
        n_samples = target.size(0)

        # Indices of the ``largest_k`` best-scoring classes, transposed so
        # that row r holds the rank-r prediction of every sample.
        ranked = output.topk(largest_k, dim=1, largest=True, sorted=True)[1].t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        percent_per_hit = 100.0 / n_samples
        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(percent_per_hit)
            for k in topk
        ]

In [None]:
def main_worker(train_loader, val_loader, model, criterion, optimizer, args):
    """Alternate training and validation from ``args.start_epoch`` up to ``args.epochs``.

    After every epoch the module-level ``best_acc1`` is raised to the best
    top-1 validation accuracy seen so far.  The trained ``model`` is returned.

    Currently disabled steps (kept out of the loop on purpose): the
    distributed sampler ``set_epoch`` call, the ``adjust_learning_rate``
    schedule, and the per-epoch ``save_checkpoint`` call.
    """
    global best_acc1

    for epoch in range(args.start_epoch, args.epochs):
        # One full pass over the training data ...
        train(train_loader, model, criterion, optimizer, epoch, args)

        # ... followed by one pass over the validation data.
        epoch_acc1 = validate(val_loader, model, criterion, args)

        # Track the best top-1 accuracy (no checkpoint is written).
        best_acc1 = max(epoch_acc1, best_acc1)

    return model

In [None]:
def load_train_val_sets(args, data_path = '/content/cifar-10-batches-py'):
    """Build the normalized CIFAR-10 training and validation data loaders.

    The training split is first loaded un-normalized to measure the
    per-channel mean and standard deviation; both splits are then loaded with
    ``ToTensor`` followed by ``Normalize`` using those training statistics.

    Args:
        args: namespace providing ``batch_size``.
        data_path: directory where CIFAR-10 is stored (downloaded there if
            missing).

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    # Pass 1: raw training images, used only to compute the statistics.
    tensor_cifar10 = datasets.CIFAR10(data_path, train=True, download=True,
                                      transform=transforms.ToTensor())

    # Stack all images along a new trailing axis, then flatten everything
    # except the leading (channel) axis so the statistics are per channel.
    imgs = torch.stack([img_t for img_t, _ in tensor_cifar10], dim=3)
    per_channel = imgs.view(3, -1)
    mean_by_channels = per_channel.mean(dim=1)
    std_by_channels = per_channel.std(dim=1)

    # Same preprocessing for both splits: the validation data is normalized
    # with the TRAINING statistics.
    normalize = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean_by_channels, std=std_by_channels),
    ])

    transformed_cifar10 = datasets.CIFAR10(
        data_path, train=True, download=False, transform=normalize)
    transformed_cifar10_val = datasets.CIFAR10(
        data_path, train=False, download=True, transform=normalize)

    train_loader = torch.utils.data.DataLoader(
        transformed_cifar10, batch_size=args.batch_size, shuffle=True
    )
    val_loader = torch.utils.data.DataLoader(
        transformed_cifar10_val, batch_size=args.batch_size, shuffle=False
    )
    return train_loader, val_loader

## Classes

In [None]:
class AverageMeter(object):
    """Track the latest value of a metric together with its weighted running mean.

    ``name`` labels the metric and ``fmt`` is the format spec (including the
    leading colon) applied to both the latest value and the mean in ``str()``.
    """

    def __init__(self, name, fmt=':f'):
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times in the mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Build e.g. '{name} {val:6.2f} ({avg:6.2f})' and fill it from the
        # instance attributes.
        template = f'{{name}} {{val{self.fmt}}} ({{avg{self.fmt}}})'
        return template.format(**vars(self))

In [None]:
class ProgressMeter(object):
    """Print one tab-separated progress line: prefix, ``[batch/total]`` and every meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the progress line for batch index ``batch``."""
        header = self.prefix + self.batch_fmtstr.format(batch)
        print('\t'.join([header, *map(str, self.meters)]))

    def _get_batch_fmtstr(self, num_batches):
        # Pad the batch index to the width of the total, e.g. '[{:3d}/100]'.
        width = len(str(num_batches // 1))
        slot = '{:' + str(width) + 'd}'
        return '[{}/{}]'.format(slot, slot.format(num_batches))

In [None]:
class ResBlock(nn.Module):
    """Residual block: ``x + relu(batch_norm(conv3x3(x)))`` with unchanged channel count.

    The convolution has no bias and is initialized with Kaiming-normal
    weights for ReLU; the batch-norm scale starts at 0.5 and its shift at 0.
    """

    def __init__(self, n_chans):
        super().__init__()
        self.conv = nn.Conv2d(n_chans, n_chans, kernel_size=3, padding=1, bias=False)
        self.batch_norm = nn.BatchNorm2d(num_features=n_chans)

        nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu')
        nn.init.constant_(self.batch_norm.weight, 0.5)
        nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        activated = torch.relu(self.batch_norm(self.conv(x)))
        # Skip connection: add the block input back onto the activation.
        return activated + x


class NetResDeep(nn.Module):
    """Small deep residual CNN: conv stem, ``n_blocks`` residual blocks, two linear layers.

    The first linear layer is sized for ``8 * 8 * n_chans1`` features, i.e.
    for 8x8 feature maps after the two 2x max-poolings in ``forward`` -- which
    corresponds to 3-channel 32x32 inputs such as CIFAR-10 images.

    Args:
        num_classes: number of output logits.
        n_chans1: channel count used by the stem and every residual block.
        n_blocks: number of residual blocks stacked after the stem.
    """

    def __init__(self, num_classes = 2, n_chans1 = 32, n_blocks = 10):
        super().__init__()
        self.n_chans1 = n_chans1

        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size = 3, padding = 1)
        # Build a NEW ResBlock per position.  Multiplying a one-element list
        # would repeat the same module object n_blocks times, so every
        # "block" would share one set of conv / batch-norm weights.
        self.resblocks = nn.Sequential(
            *(ResBlock(n_chans = n_chans1) for _ in range(n_blocks))
        )

        # Fully Connected layers on top NN Arch.
        self.fc1 = nn.Linear(8 * 8 * n_chans1, 32)
        self.fc2 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        out = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = self.resblocks(out)

        out = F.max_pool2d(out, 2)

        # Flatten each sample's feature maps for the linear layers.
        out = out.view(-1, 8 * 8 * self.n_chans1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out

## Load Dataset CIFAR-10

In [None]:
# Reproducibility: fix the NumPy and PyTorch seeds and make cuDNN pick
# deterministic algorithms (no auto-tuning benchmark runs).
np.random.seed(0)
torch.manual_seed(0)

cudnn.benchmark = False
cudnn.deterministic = True

In [None]:
# Directory passed to torchvision's CIFAR10 dataset as its storage location.
data_path = '/content/cifar-10-batches-py'

In [None]:
# CIFAR-10 label lookup tables: index -> name and name -> index.
class_objects = "airplane,automobile,bird,cat,deer,dog,frog,horse,ship,truck".split(",")
class_indeces = range(0, 10)

item_pairs = zip(class_indeces, class_objects)
item_pairs_reverse = zip(class_objects, class_indeces)

class_names = dict(item_pairs)
class_names_reverse = dict(item_pairs_reverse)

In [None]:
# Un-normalized CIFAR-10 training split (images converted to tensors only).
tensor_cifar10 = datasets.CIFAR10(
    root=data_path,
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

Files already downloaded and verified


In [None]:
# Stack every training image (labels dropped) along a new trailing axis.
imgs = torch.stack([sample[0] for sample in tensor_cifar10], dim=3)

In [None]:
# Per-channel statistics: keep the leading (channel) axis, flatten the rest.
mean_by_channels = torch.mean(imgs.view(3, -1), dim=1)
std_by_channels = torch.std(imgs.view(3, -1), dim=1)

In [None]:
# Training split, normalized with the training-set channel statistics.
transformed_cifar10 = datasets.CIFAR10(
    root=data_path,
    train=True,
    download=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean_by_channels, std=std_by_channels),
    ]),
)

In [None]:
# Validation split, normalized with the TRAINING-set channel statistics.
transformed_cifar10_val = datasets.CIFAR10(
    root=data_path,
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean_by_channels, std=std_by_channels),
    ]),
)

Files already downloaded and verified


## Run

In [None]:
def main():
    """Parse the options, build ``NetResDeep``, train it on CIFAR-10 and save its weights.

    Steps: seed the RNGs, select the GPU, build the 10-class model and move
    it to the module-level ``device``, create a plain SGD optimizer and a
    cross-entropy loss, load the data, run ``main_worker`` and finally write
    the model's ``state_dict`` to ``/content/ResNet_norm_batchnorm.pt``.
    """
    # parse_known_args() tolerates arguments this parser does not define
    # (e.g. whatever a notebook kernel puts on argv).
    args, _ = parser.parse_known_args()

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    else:
        torch.manual_seed(0)
        np.random.seed(0)
        random.seed(0)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        if args.gpu is None:
            # current_device() is an index, and index 0 is a valid GPU, so
            # it must not be used as a truth value.
            args.gpu = torch.cuda.current_device()
        else:
            # Make the requested GPU the current one so that the index-less
            # 'cuda' device used for the model and the batches resolves to it.
            torch.cuda.set_device(args.gpu)
    elif args.gpu is not None:
        warnings.warn('--gpu was given but CUDA is not available; '
                      'running on the CPU instead.')
        args.gpu = None

    # build a model
    model = NetResDeep(num_classes = 10)
    numel_list = [p.numel() for p in model.parameters() if p.requires_grad]
    print(model)
    print(sum(numel_list), numel_list)

    # Move the model once, before the optimizer is created.
    model = model.to(device = device)

    # NOTE(review): --momentum, --weight-decay and --evaluate are parsed but
    # not used here; training is plain SGD with only the learning rate set.
    learning_rate = args.lr
    optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

    criterion = nn.CrossEntropyLoss()

    # Load Data
    train_loader, val_loader = load_train_val_sets(args = args)

    # Train Model
    model = main_worker(train_loader, val_loader, model, criterion, optimizer, args)

    # Save model's parameters
    data_path = '/content/'
    torch.save(model.state_dict(), data_path + 'ResNet_norm_batchnorm.pt')

In [None]:
# Best top-1 validation accuracy seen so far; main_worker() raises it through
# a ``global`` statement, so it must exist before main() runs.
best_acc1 = 0

if __name__ == "__main__":
    main()

NetResDeep(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (resblocks): Sequential(
    (0): ResBlock(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batch_norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): ResBlock(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batch_norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): ResBlock(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batch_norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (3): ResBlock(
      (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batch_norm): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (4): ResBlock

## References

- PyTorch Reference Manual:
 - [torch.nn module](https://pytorch.org/docs/stable/nn.html)
 - [TensorBoard Support](https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html)

- Initialization Topic (Papers):
  - [Understanding the difficulty of training deep feedforward neural networks](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) by X. Glorot & Y. Bengio, which led to the weight-initialization scheme known as *Xavier initialization* (available in PyTorch as `torch.nn.init.xavier_uniform_` / `torch.nn.init.xavier_normal_`)
  - [Fixup Initialization: Residual Learning Without Normalization](https://arxiv.org/abs/1901.09321) by Hongyi Zhang, Yann N. Dauphin and Tengyu Ma, whose work makes it possible to *get rid of batch normalization layers* in a given NN architecture while still training it to meaningful and reliable performance.

- Activation Functions (Papers):
  - [Deep Learning using Rectified Linear Units (ReLU)](https://arxiv.org/pdf/1803.08375.pdf)

- Datasets:
  - [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html)

- Regularization techniques (Papers):
  - [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
  - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf)

- Archs Types (Papers):
  - [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)
  - [Densely Connected Convolutional Networks](https://arxiv.org/abs/1608.06993)
  - [Highway Networks](https://arxiv.org/pdf/1505.00387.pdf)
  - [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597)

- Some useful third-party tutorials:
 - [Imagenet example](https://github.com/pytorch/examples/blob/master/imagenet/main.py#L327)
 - [Writing a better code with pytorch and einops](https://arogozhnikov.github.io/einops/pytorch-examples.html)
 - [Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric - MEDIUM](https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8)

- Books
  - [List of books for improving Pytorch knowledge](https://bookauthority.org/books/best-pytorch-books)

- GitHub Projects:
  - [PyTorch Geometric](https://github.com/rusty1s/pytorch_geometric)
  - [Minetorch](https://github.com/minetorch/minetorch)
  - [Pierogi](https://github.com/nalepae/pierogi/)
  - [Visdom](https://github.com/facebookresearch/visdom#vismatplot)