# Preâmbulo

Imports, funções, downloads e instalação do PyTorch.

In [None]:
# Basic imports.
import os
import time
import numpy as np
import torch

from torch import nn
from torch import optim

from torch.utils.data import DataLoader
from torch.utils import data
from torch.backends import cudnn

from torchvision import models
from torchvision import datasets
from torchvision import transforms

from skimage import io

from sklearn import metrics

from matplotlib import pyplot as plt

%matplotlib inline

# Let cuDNN benchmark the available convolution algorithms and reuse the
# fastest one it finds for each input configuration.
cudnn.benchmark = True

In [None]:
# Hyperparameters and data-loading settings shared by the rest of the notebook.
args = dict(
    epoch_num=50,       # Number of epochs.
    n_classes=10,       # Number of classes.
    lr=1e-4,            # Learning rate.
    weight_decay=5e-4,  # L2 penalty.
    momentum=0.9,       # Momentum.
    num_workers=4,      # Number of workers on data loader.
    batch_size=200,     # Mini-batch size.
    w_size=224,         # Width size for image resizing.
    h_size=224,         # Height size for image resizing.
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
args['device'] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(args['device'])

In [None]:
# Root directory for the dataset (to be downloaded).
root = './'

# Augmentation pipeline used for this preview: resize to 100x100, take a
# random 75x75 crop, randomly jitter brightness and contrast, then convert
# the image to a tensor.
augmentations = [
    transforms.Resize((100, 100)),
    transforms.RandomCrop((75, 75)),
    transforms.ColorJitter(brightness=0.5, contrast=0.5),
    transforms.ToTensor(),
]
data_transform = transforms.Compose(augmentations)

# CIFAR10 splits; only the training split triggers the download.
train_set = datasets.CIFAR10(
    root, train=True, download=True, transform=data_transform)
test_set = datasets.CIFAR10(
    root, train=False, download=False, transform=data_transform)

# Draw the first five test images five times over. The transform is applied
# on every access, so each row shows a different random augmentation of the
# same images.
for row in range(5):

    fig, ax = plt.subplots(1, 5, figsize=(20, 4))

    for i, test_data in enumerate(test_set):

        if i >= 5:
            break

        test_img = test_data[0]

        # Tensors are channel-first; imshow expects channel-last.
        panel = ax[i]
        panel.imshow(test_img.numpy().transpose(1, 2, 0))
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_title('Image ' + str(i))

    plt.show()

In [None]:
# Root directory for the dataset (to be downloaded).
root = './'

# Deterministic preprocessing used for training and evaluation: resize to the
# configured height/width, convert to tensor and normalize each channel with
# the fixed mean/std below.
resize = transforms.Resize((args['h_size'], args['w_size']))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
data_transform = transforms.Compose([resize, transforms.ToTensor(), normalize])

# CIFAR10 splits; only the training split triggers the download.
train_set = datasets.CIFAR10(
    root, train=True, download=True, transform=data_transform)
test_set = datasets.CIFAR10(
    root, train=False, download=False, transform=data_transform)

# Mini-batch loaders: training batches are reshuffled, test batches keep the
# dataset order.
train_loader = DataLoader(
    train_set,
    batch_size=args['batch_size'],
    shuffle=True,
    num_workers=args['num_workers'],
)
test_loader = DataLoader(
    test_set,
    batch_size=args['batch_size'],
    shuffle=False,
    num_workers=args['num_workers'],
)

# Report how many samples each split holds.
print('Size of training set: ' + str(len(train_set)) + ' samples')
print('Size of test set: ' + str(len(test_set)) + ' samples')

# Redes Residuais (ResNets)

Entre 2012 e 2015 a comunidade de Visão Computacional percebeu que redes mais profundas conseguiam capturar características semânticas mais úteis dos dados para tarefas de reconhecimento de imagens (e.g. classificação, segmentação, detecção). Porém, redes mais profundas que as maiores arquiteturas da época -- como a [VGG](https://arxiv.org/abs/1409.1556) e a [Inception](https://arxiv.org/abs/1409.4842) -- sofriam de um problema chamado **Vanishing Gradient**.

![VGG](https://www.researchgate.net/profile/Clifford_Yang/publication/325137356/figure/fig2/AS:670371271413777@1536840374533/llustration-of-the-network-architecture-of-VGG-19-model-conv-means-convolution-FC-means_W640.jpg)

![Inception](https://miro.medium.com/max/700/1*uW81y16b-ptBDV8SIT1beQ.png)


O **Vanishing Gradient** se torna mais problemático em redes mais profundas porque o gradiente dos erros precisa backpropagar desde a última camada até o começo da rede. Dessa forma, as últimas camadas conseguem ser treinadas de forma eficiente, mas o gradiente dos erros vai desaparecendo à medida que backpropaga pela rede, praticamente impossibilitando o treinamento das primeiras camadas. Assim, foi constatado que uma rede com, por exemplo, 34 camadas acabava por conseguir resultados piores que uma rede com apenas 18 camadas.

![Rede Não Residual](https://www.dropbox.com/s/pq190al5b3qv194/normal_18_vs_34_layers.png?dl=1)

No final de 2015 foi proposta uma solução para o **Vanishing Gradient** na forma de **Blocos Residuais** que, juntos, formam **Redes Residuais** [**(ResNets)**](https://arxiv.org/abs/1512.03385). Esses blocos residuais recebem uma entrada $x$ e a alimentam para um bloco convolucional ($\mathcal{F}(x)$) composto por:

1.   Convolução 3x3;
2.   Batch Normalization;
3.   ReLU;
4.   Convolução 3x3;
5.   Batch Normalization.

A saída $\mathcal{F}(x)$ desse bloco, antes de ser passada por uma segunda ReLU, é combinada com a própria entrada $x$, que chega por uma conexão de atalho (*identity shortcut*). No caso das **ResNets**, essa combinação é uma simples soma elemento a elemento. Dessa forma, a saída de um **Bloco Residual**, antes da ReLU final, é dada por: $\mathcal{F}(x) + x$. O esquema de um **Bloco Residual** pode ser visto na figura abaixo.

![Bloco Residual](https://www.dropbox.com/s/ezydump33p95ohc/residual_block.png?dl=1)

Como pode ser visto na imagem a seguir, com o uso de blocos residuais, uma arquitetura com 34 camadas consegue resultados melhores que uma arquitetura com apenas 18 camadas. Esses resultados evidenciam que o uso da soma como **identity function** de fato permite que o backward treine mais efetivamente as primeiras camadas das **ResNets**. **ResNets** permitiram que CNNs chegassem até a casa das 100 camadas. A maior **ResNet** usada na prática possui 152 camadas, o que a deixa impraticável de imprimir numa figura como é mostrado abaixo na ResNet34.

![Rede Residual](https://www.dropbox.com/s/q4wcjwf8qj4xjrn/resnet_18_vs_34_layers.png?dl=1)

Como pode ser visto nas imagens abaixo, ResNets (e outras arquiteturas modernas como a [VGG](https://arxiv.org/abs/1409.1556) e as [DenseNets](https://arxiv.org/abs/1608.06993)) são compostas basicamente de convoluções com kernels de tamanho 3x3. Além disso, é notável na arquitetura residual (à direita) a presença dos "atalhos" para o gradiente na forma das funções identidade que ajudam no treinamento das primeiras camadas durante o backpropagation.

![VGG vs. Plain34 vs. ResNet34](https://www.dropbox.com/s/d2w3h7dlumgclx2/vgg_plain34_resnet34.png?dl=1)

# Atividade Prática

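1.   Implementar a classe *ResidualBlock()*. Devem ser implementados os dois blocos convolucionais, a convolução 1x1 usada para igualar o número de canais e o método *forward()*.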

In [None]:
# Implementation of residual block to be reused.
class ResidualBlock(nn.Module):
    """Basic residual block: two 3x3 conv stages plus a shortcut connection.

    The residual branch is conv3x3 -> BatchNorm -> ReLU -> conv3x3 ->
    BatchNorm. Its output is added to the block input and the sum goes
    through a final ReLU. When the number of channels changes, the input is
    first projected with a 1x1 convolution so both operands of the sum have
    ``out_planes`` channels.

    Args:
        in_planes: Number of channels of the input feature map.
        out_planes: Number of channels produced by the block.
    """

    def __init__(self, in_planes, out_planes):

        super(ResidualBlock, self).__init__()

        # First convolutional block: conv3x3 -> BN -> ReLU. Padding of 1
        # keeps the spatial size so the result can be summed with the input.
        # The conv bias is omitted because BatchNorm adds its own shift.
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(in_planes, out_planes,
                      kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_planes),
            nn.ReLU(),
        )

        # Second convolutional block: conv3x3 -> BN. The ReLU is applied
        # only after the shortcut has been added (see forward()).
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(out_planes, out_planes,
                      kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_planes),
        )

        self.relu = nn.ReLU()

        # If in_planes != out_planes, perform a 1x1 conv to match #channels.
        self.conv1x1 = None
        if in_planes != out_planes:
            self.conv1x1 = nn.Conv2d(in_planes, out_planes,
                                     kernel_size=1, bias=False)

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))`` for the input feature map."""

        identity = x

        # Residual branch F(x).
        out = self.conv_block1(x)
        out = self.conv_block2(out)

        # Project the shortcut when the channel count differs.
        if self.conv1x1 is not None:
            identity = self.conv1x1(identity)

        # Sum with the shortcut, then apply the final non-linearity.
        out = out + identity

        return self.relu(out)

# Shallow ResNet-style classifier (stem + two residual stages + linear head).
class ResNet(nn.Module):
    """Small residual network for 3-channel image classification.

    Layout: a 7x7 convolutional stem with max pooling, a 64-channel residual
    block with max pooling, a 128-channel residual block with max pooling,
    global average pooling and a linear classifier.

    Args:
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, num_classes=10):

        super(ResNet, self).__init__()

        # Stem: 7x7 conv (64 channels, stride 2, padding 3, no bias),
        # batch normalization, ReLU and a 3x3 max pool with stride 2.
        self.conv_block = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        # First residual stage: 64 -> 64 channels, then 3x3 max pool.
        self.res_block1 = nn.Sequential(
            ResidualBlock(64, 64),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        # Second residual stage: 64 -> 128 channels, then 3x3 max pool.
        self.res_block2 = nn.Sequential(
            ResidualBlock(64, 128),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        # Classifier: global average pooling to 1x1 followed by a linear
        # layer over the 128 pooled features.
        self.adaptive_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))

        self.classifier = nn.Linear(128, num_classes)

        self.initialize_weights()

    # Function for randomly initializing weights.
    def initialize_weights(self):
        """Initialize every conv, batch-norm and linear layer of the model."""

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return the class scores, one row per input image."""

        # Stem and the two residual stages.
        x = self.conv_block(x)
        x = self.res_block1(x)
        x = self.res_block2(x)

        # Collapse each feature map to a single value.
        x = self.adaptive_pool(x)

        # Flatten everything but the batch dimension for the linear layer.
        x = x.flatten(1)

        return self.classifier(x)

# Build the network for the configured number of classes, then move it to
# the selected device.
net = ResNet(num_classes=args['n_classes'])
net = net.to(args['device'])

# Show the layer-by-layer structure of the model.
print(net)

In [None]:
# Using predefined and pretrained model of ResNet18 on torchvision.
# net = models.resnet18(pretrained=True, progress=False).to(args['device'])
# net.fc = nn.Linear(in_features=512, out_features=10, bias=True).to(args['device'])

# Shrunk version of ResNet18 to be used in class.
# net = models.resnet18(pretrained=True, progress=False).to(args['device'])
# net.layer3 = nn.Sequential().to(args['device']) # Uncomment to shrink ResNet.
# net.layer4 = nn.Sequential().to(args['device']) # Uncomment to shrink ResNet.
# net.fc = nn.Linear(in_features=128, out_features=10, bias=True).to(args['device'])

# Definindo o otimizador

O PyTorch possui vários otimizadores prontos no subpacote [optim](https://pytorch.org/docs/stable/optim.html), desde o SGD básico até otimizadores mais complexos e com taxas de aprendizado por parâmetro como o Adagrad, RMSProp e Adam.

In [None]:
# Adam over every parameter of the network. The configured 'momentum' value
# is used as the first beta coefficient.
optimizer = optim.Adam(
    params=net.parameters(),
    lr=args['lr'],
    betas=(args['momentum'], 0.999),
    weight_decay=args['weight_decay'],
)

# Definindo a loss

O subpacote [nn](https://pytorch.org/docs/stable/nn.html) possui várias funções de perda para diferentes tarefas (e.g. Cross Entropy, Negative Log Likelihood, loss L1, MSE, Kullback-Leibler Divergence) implementadas por padrão.



In [None]:
# Cross-entropy loss for the classification task, placed on the same device
# as the rest of the computation.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(args['device'])

# Criando funções para Treino e Teste

Iterando sobre os datasets/dataloaders de treino e teste do CIFAR10. Abaixo são implementadas a função *train()* que itera sobre os batches do dataset de treino e atualiza o modelo e a função *test()* que apenas realiza o forward dos dados de teste no modelo e calcula a acurácia no dataset de teste para o modelo no estado atual.

In [None]:
def train(train_loader, net, criterion, optimizer, epoch, device=None):
    """Run one training epoch over ``train_loader`` and print a summary.

    Args:
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        net: Model to optimize; it is switched to training mode.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``net``.
        epoch: Epoch number, used only in the printed summary.
        device: Device the batches are moved to before the forward pass.
            When omitted, the notebook-level ``args['device']`` is used.

    Returns:
        None. The mean and standard deviation of the per-batch losses and
        the elapsed time are printed.
    """

    # Keep the notebook default while allowing callers to be explicit.
    if device is None:
        device = args['device']

    tic = time.time()

    # Setting network for training mode.
    net.train()

    # Per-batch loss values.
    train_loss = []

    # Iterating over batches.
    for inps, labs in train_loader:

        # Moving the batch to the target device.
        inps = inps.to(device)
        labs = labs.to(device)

        # Clears the gradients of optimizer.
        optimizer.zero_grad()

        # Forwarding.
        outs = net(inps)

        # Computing loss.
        loss = criterion(outs, labs)

        # Backpropagating and updating the parameters.
        loss.backward()
        optimizer.step()

        # Storing the loss as a plain Python float.
        train_loss.append(loss.item())

    toc = time.time()

    train_loss = np.asarray(train_loss)

    # Printing training epoch loss and metrics.
    print('--------------------------------------------------------------------')
    print('[epoch %d], [train loss %.4f +/- %.4f], [training time %.2f]' % (
        epoch, train_loss.mean(), train_loss.std(), (toc - tic)))
    print('--------------------------------------------------------------------')
    
def test(test_loader, net, criterion, epoch, device=None):
    """Evaluate ``net`` on ``test_loader`` and print loss and accuracy.

    Args:
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        net: Model to evaluate; it is switched to evaluation mode.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        epoch: Epoch number, used only in the printed summary.
        device: Device the batches are moved to before the forward pass.
            When omitted, the notebook-level ``args['device']`` is used.

    Returns:
        None. The mean and standard deviation of the per-batch losses, the
        accuracy and the elapsed time are printed.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """

    # Keep the notebook default while allowing callers to be explicit.
    if device is None:
        device = args['device']

    tic = time.time()

    # Setting network for evaluation mode (affects layers such as batch
    # normalization; gradient tracking is disabled separately below).
    net.eval()

    # Lists for losses, predictions and labels.
    test_loss = []
    prd_list = []
    lab_list = []

    # No parameter is updated here, so autograd bookkeeping is switched off.
    with torch.no_grad():

        # Iterating over the batches of the loader that was passed in.
        for inps, labs in test_loader:

            # Moving the batch to the target device.
            inps = inps.to(device)
            labs = labs.to(device)

            # Forwarding.
            outs = net(inps)

            # Computing loss.
            loss = criterion(outs, labs)

            # Predicted class = index of the highest score in each row.
            prds = outs.max(dim=1)[1].cpu().numpy()

            # Updating lists.
            test_loss.append(loss.item())
            prd_list.append(prds)
            lab_list.append(labs.cpu().numpy())

    toc = time.time()

    if not lab_list:
        raise ValueError('test_loader yielded no batches')

    # Computing accuracy. Batches are concatenated (not stacked) so a
    # smaller final batch is handled correctly.
    acc = metrics.accuracy_score(np.concatenate(lab_list),
                                 np.concatenate(prd_list))

    test_loss = np.asarray(test_loss)

    # Printing test epoch loss and metrics.
    print('--------------------------------------------------------------------')
    print('[epoch %d], [test loss %.4f +/- %.4f], [acc %.4f], [testing time %.2f]' % (
        epoch, test_loss.mean(), test_loss.std(), acc, (toc - tic)))
    print('--------------------------------------------------------------------')

# Iterando sobre epochs

In [None]:
# Main loop: epochs are numbered from 1 up to the configured total.
n_epochs = args['epoch_num']
for epoch in range(1, n_epochs + 1):

    # One full pass over the training data, updating the model.
    train(train_loader, net, criterion, optimizer, epoch)

    # Loss and accuracy of the current model on the test data.
    test(test_loader, net, criterion, epoch)