# Prática: AlexNet com recursos limitados

Se antes implementamos a [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) sem considerar a questão de quantidade de parâmetros, nesta prática focaremos nesse quesito.





In [1]:
!pip install mxnet-cu100

# imports basicos
import time, os, sys, numpy as np
import mxnet as mx
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn, utils as gutils, data as gdata

# Tenta encontrar GPU
def try_gpu():
    try:
        ctx = mx.gpu()
        _ = nd.zeros((1,), ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx

ctx = try_gpu()
ctx

## carregando dados

# código para carregar o dataset do CIFAR 10
# https://www.cs.toronto.edu/~kriz/cifar.html
def load_data_cifar10(batch_size, resize=None, root=os.path.join(
        '~', '.mxnet', 'datasets', 'cifar10')):
    """Download the MNIST dataset and then load into memory."""
    root = os.path.expanduser(root)
    transformer = []
    if resize:
        transformer += [gdata.vision.transforms.Resize(resize)]
    transformer += [gdata.vision.transforms.ToTensor()]
    transformer = gdata.vision.transforms.Compose(transformer)

    mnist_train = gdata.vision.CIFAR10(root=root, train=True)
    mnist_test = gdata.vision.CIFAR10(root=root, train=False)
    num_workers = 0 if sys.platform.startswith('win32') else 4

    train_iter = gdata.DataLoader(mnist_train.transform_first(transformer),
                                  batch_size, shuffle=True,
                                  num_workers=num_workers)
    test_iter = gdata.DataLoader(mnist_test.transform_first(transformer),
                                 batch_size, shuffle=False,
                                 num_workers=num_workers)
    return train_iter, test_iter
  
# funções básicas
def _get_batch(batch, ctx):
    """Return features and labels on ctx."""
    features, labels = batch
    if labels.dtype != features.dtype:
        labels = labels.astype(features.dtype)
    return (gutils.split_and_load(features, ctx),
            gutils.split_and_load(labels, ctx), features.shape[0])

# Função usada para calcular acurácia
def evaluate_accuracy(data_iter, net, loss, ctx=[mx.cpu()]):
    """Evaluate accuracy of a model on the given data set."""
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    acc_sum, n, l = nd.array([0]), 0, 0
    for batch in data_iter:
        features, labels, _ = _get_batch(batch, ctx)
        for X, y in zip(features, labels):
            # X, y = X.as_in_context(ctx), y.as_in_context(ctx)
            y = y.astype('float32')
            y_hat = net(X)
            l += loss(y_hat, y).sum()
            acc_sum += (y_hat.argmax(axis=1) == y).sum().copyto(mx.cpu())
            n += y.size
        acc_sum.wait_to_read()
    return acc_sum.asscalar() / n, l.asscalar() / n
  
# Função usada no treinamento e validação da rede
def train_validate(net, train_iter, test_iter, batch_size, trainer, loss, ctx,
                   num_epochs):
    print('training on', ctx)
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X, y = X.as_in_context(ctx), y.as_in_context(ctx)
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc, test_loss = evaluate_accuracy(test_iter, net, loss, ctx)
        print('epoch %d, train loss %.4f, train acc %.3f, test loss %.4f, '
              'test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_loss, 
                 test_acc, time.time() - start))



## AlexNet

Como vimos, a rede [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) foi uma das arquiteturas mais famosas dessa nova onda de rede neurais.

<p align="center">
  <img width=700 src="https://engmrk.com/wp-content/uploads/2018/10/AlexNet_Summary_Table.jpg">
</p>

Entretanto, ela possui muitos parâmetros.
Especificamente, essa arquitetura, para classificar 10 classes, tem um total de **58.312.736** de parâmetros como mostrado na tabela abaixo.

**Camada** | **Calc Parâmetros** | **Total Parâmetros**
--- | ---: | ---:
Convolução 1 | 11\*11\*3\*96 | 34.848
Convolução 2 | 5\*5\*96\*256 | 614.400
Convolução 3 | 3\*3\*256\*384 | 884.736
Convolução 4 | 3\*3\*384\*384 | 1.327.104
Convolução 5 | 3\*3\*384\*256 | 884.736
FC 6 | 9216*4096 | 37.748.736
FC 7 | 4096*4096 | 16.777.216
FC 8 | 4096*10 | 40.960
**Total** | | **58.312.736**

**Seu objetivo nessa prática é propor uma nova rede neural, baseada na [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf), que possuia MENOS parâmetros e alcance uma acurácia similar ou melhor que a rede original vista na aula passada.**

Procure usar [*batch normalization*](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.BatchNorm), camadas dilatadas e separáveis.
Neste caso, desconsidere os parâmetros da camada [*batch normalization*](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.BatchNorm).

### Arquitetura 1

Essa primeira versão da [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) usa [*batch normalization*](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.BatchNorm) para melhorar o resultado. Esse arquitetura é somente para efeitos de comparação já que, por enquanto, ainda temos a mesma quantidade de parâmetros da versão original.

In [0]:
class AlexNet(nn.HybridBlock):
    r"""AlexNet model from the `"One weird trick..." `_ paper.

    Parameters
    ----------
    classes : int, default 10
        Number of classes for the output layer.
    """
    def __init__(self, classes=10, **kwargs):
        super(AlexNet, self).__init__(**kwargs)
        with self.name_scope():
            self.features = nn.HybridSequential(prefix='')
            with self.features.name_scope():
                self.features.add(nn.Conv2D(96, kernel_size=11, strides=4, padding=0))                     # entrada: (b, 3, 227, 227) e saida: (b, 96, 55, 55)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 96, 55, 55) e saida: (b, 96, 27, 27)
                self.features.add(nn.Conv2D(256, kernel_size=5, padding=2))                                # entrada: (b, 96, 27, 27) e saida: (b, 256, 27, 27)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 256, 27, 27) e saida: (b, 256, 13, 13)
                self.features.add(nn.Conv2D(384, kernel_size=3, padding=1))                                # entrada: (b, 256, 13, 13) e saida: (b, 384, 13, 13)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Conv2D(384, kernel_size=3, padding=1))                                # entrada: (b, 384, 13, 13) e saida: (b, 384, 13, 13)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Conv2D(256, kernel_size=3, padding=1))                                # entrada: (b, 384, 13, 13) e saida: (b, 256, 13, 13)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 256, 13, 13) e saida: (b, 256, 6, 6)
                self.features.add(nn.Flatten())                                                            # entrada: (b, 256, 13, 13) e saida: (b, 256*6*6) = (b, 9216)
                self.features.add(nn.Dense(4096))                                                          # entrada: (b, 9216) e saida: (b, 4096)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Dropout(0.5))
                self.features.add(nn.Dense(4096))                                                           # entrada: (b, 4096) e saida: (b, 4096)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Dropout(0.5))

            self.output = nn.Dense(classes)  # entrada: (b, 4096) e saida: (b, 10)

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x

In [0]:
num_epochs, lr, batch_size, wd_lambda = 20, 0.001, 100, 0.0001
    
net = AlexNet()
net.initialize(init.Normal(sigma=0.01), ctx=ctx)

# função de custo (ou loss)
loss = gloss.SoftmaxCrossEntropyLoss()

# carregamento do dado: fashion mnist
train_iter, test_iter = load_data_cifar10(batch_size, resize=227)

# trainer do gluon
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd_lambda})

# treinamento e validação via MXNet
train_validate(net, train_iter, test_iter, batch_size, trainer, loss, 
               ctx, num_epochs)

training on gpu(0)
epoch 1, train loss 1.4996, train acc 0.489, test loss 1.4287, test acc 0.520, time 140.6 sec
epoch 2, train loss 1.0128, train acc 0.654, test loss 0.9690, test acc 0.666, time 112.8 sec
epoch 3, train loss 0.8140, train acc 0.720, test loss 0.9064, test acc 0.692, time 85.1 sec
epoch 4, train loss 0.7048, train acc 0.758, test loss 0.7015, test acc 0.761, time 84.9 sec
epoch 5, train loss 0.6302, train acc 0.784, test loss 0.7343, test acc 0.751, time 84.9 sec
epoch 6, train loss 0.5679, train acc 0.805, test loss 0.7254, test acc 0.758, time 84.9 sec
epoch 7, train loss 0.5213, train acc 0.820, test loss 0.7437, test acc 0.751, time 84.8 sec
epoch 8, train loss 0.4725, train acc 0.837, test loss 0.6186, test acc 0.796, time 84.9 sec
epoch 9, train loss 0.4364, train acc 0.849, test loss 0.5910, test acc 0.801, time 84.8 sec
epoch 10, train loss 0.4001, train acc 0.862, test loss 0.7092, test acc 0.772, time 84.6 sec
epoch 11, train loss 0.3738, train acc 0.871, te

### Arquitetura 2

Essa segunda versão da [AlexNet](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) usa [*batch normalization*](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.BatchNorm) e camadas de [convolução dilatada](https://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.nn.Conv2D.html).
Neste caso, como usamos filtros dilatados em duas camadas (com dilatação aumentante, ou seja, dilatação 2 seguida da dilatação 4), foram removidos duas camadas convolucionais, já que o *receptive field* se mantem similar dessa forma.

Nessa arquitetura, já temos menos parâmetros. Precisamente, temos 



In [0]:
class AlexNet(nn.HybridBlock):
    r"""AlexNet model from the `"One weird trick..." `_ paper.

    Parameters
    ----------
    classes : int, default 10
        Number of classes for the output layer.
    """
    def __init__(self, classes=10, **kwargs):
        super(AlexNet, self).__init__(**kwargs)
        with self.name_scope():
            self.features = nn.HybridSequential(prefix='')
            with self.features.name_scope():
                self.features.add(nn.Conv2D(96, kernel_size=11, strides=4, padding=0))                     # entrada: (b, 3, 227, 227) e saida: (b, 96, 55, 55)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 96, 55, 55) e saida: (b, 96, 27, 27)
                self.features.add(nn.Conv2D(256, kernel_size=5, padding=2, dilation=2))                    # entrada: (b, 96, 27, 27) e saida: (b, 256, 27, 27)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 256, 27, 27) e saida: (b, 256, 13, 13)
#                 self.features.add(nn.Conv2D(384, kernel_size=3, padding=1))                              # entrada: (b, 256, 13, 13) e saida: (b, 384, 13, 13)
#                 self.features.add(nn.BatchNorm())
#                 self.features.add(nn.Activation(activation='relu'))
#                 self.features.add(nn.Conv2D(384, kernel_size=3, padding=1))                              # entrada: (b, 384, 13, 13) e saida: (b, 384, 13, 13)
#                 self.features.add(nn.BatchNorm())
#                 self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Conv2D(256, kernel_size=3, padding=1, dilation=4))                    # entrada: (b, 256, 13, 13) e saida: (b, 256, 13, 13)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 256, 13, 13) e saida: (b, 256, 6, 6)
                self.features.add(nn.Flatten())                                                            # entrada: (b, 256, 6, 6) e saida: (b, 256*6*6) = (b, 9216)
                self.features.add(nn.Dense(4096))                                                          # entrada: (b, 9216) e saida: (b, 4096)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Dropout(0.5))
                self.features.add(nn.Dense(4096))                                                           # entrada: (b, 4096) e saida: (b, 4096)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Dropout(0.5))

            self.output = nn.Dense(classes)  # entrada: (b, 4096) e saida: (b, 10)

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x

In [0]:
num_epochs, lr, batch_size, wd_lambda = 20, 0.001, 100, 0.0001
    
net = AlexNet()
net.initialize(init.Normal(sigma=0.01), ctx=ctx)

# função de custo (ou loss)
loss = gloss.SoftmaxCrossEntropyLoss()

# carregamento do dado: fashion mnist
train_iter, test_iter = load_data_cifar10(batch_size, resize=227)

# trainer do gluon
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd_lambda})

# treinamento e validação via MXNet
train_validate(net, train_iter, test_iter, batch_size, trainer, loss, 
               ctx, num_epochs)

training on gpu(0)
epoch 1, train loss 1.4751, train acc 0.489, test loss 1.2504, test acc 0.561, time 78.7 sec
epoch 2, train loss 1.0481, train acc 0.637, test loss 0.9986, test acc 0.650, time 77.8 sec
epoch 3, train loss 0.8699, train acc 0.700, test loss 1.1546, test acc 0.626, time 77.7 sec
epoch 4, train loss 0.7454, train acc 0.745, test loss 0.8249, test acc 0.715, time 77.0 sec
epoch 5, train loss 0.6713, train acc 0.768, test loss 0.8899, test acc 0.711, time 76.3 sec
epoch 6, train loss 0.5727, train acc 0.800, test loss 0.7583, test acc 0.741, time 76.1 sec
epoch 7, train loss 0.5092, train acc 0.821, test loss 1.1919, test acc 0.657, time 76.6 sec
epoch 8, train loss 0.4513, train acc 0.842, test loss 1.0681, test acc 0.667, time 75.9 sec
epoch 9, train loss 0.3976, train acc 0.861, test loss 0.7757, test acc 0.753, time 75.8 sec
epoch 10, train loss 0.3453, train acc 0.880, test loss 0.8760, test acc 0.739, time 75.7 sec
epoch 11, train loss 0.3094, train acc 0.892, test

Para efeito de **comparação**, recriamos a rede sem as duas camadas convolucionais (removidas na arquitetura anterior) e sem camadas dilatadas. Dessa forma, podemos observar o ganho das convoluções dilatadas.

In [0]:
class AlexNet(nn.HybridBlock):
    r"""AlexNet model from the `"One weird trick..." `_ paper.

    Parameters
    ----------
    classes : int, default 10
        Number of classes for the output layer.
    """
    def __init__(self, classes=10, **kwargs):
        super(AlexNet, self).__init__(**kwargs)
        with self.name_scope():
            self.features = nn.HybridSequential(prefix='')
            with self.features.name_scope():
                self.features.add(nn.Conv2D(96, kernel_size=11, strides=4, padding=0))                     # entrada: (b, 3, 227, 227) e saida: (b, 96, 55, 55)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 96, 55, 55) e saida: (b, 96, 27, 27)
                self.features.add(nn.Conv2D(256, kernel_size=5, padding=2))                                # entrada: (b, 96, 27, 27) e saida: (b, 256, 27, 27)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 256, 27, 27) e saida: (b, 256, 13, 13)
#                 self.features.add(nn.Conv2D(384, kernel_size=3, padding=1))                              # entrada: (b, 256, 13, 13) e saida: (b, 384, 13, 13)
#                 self.features.add(nn.BatchNorm())
#                 self.features.add(nn.Activation(activation='relu'))
#                 self.features.add(nn.Conv2D(384, kernel_size=3, padding=1))                              # entrada: (b, 384, 13, 13) e saida: (b, 384, 13, 13)
#                 self.features.add(nn.BatchNorm())
#                 self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Conv2D(256, kernel_size=3, padding=1))                                # entrada: (b, 256, 13, 13) e saida: (b, 256, 13, 13)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.MaxPool2D(pool_size=3, strides=2))                                    # entrada: (b, 256, 13, 13) e saida: (b, 256, 6, 6)
                self.features.add(nn.Flatten())                                                            # entrada: (b, 256, 6, 6) e saida: (b, 256*6*6) = (b, 9216)
                self.features.add(nn.Dense(4096))                                                          # entrada: (b, 9216) e saida: (b, 4096)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Dropout(0.5))
                self.features.add(nn.Dense(4096))                                                           # entrada: (b, 4096) e saida: (b, 4096)
                self.features.add(nn.BatchNorm())
                self.features.add(nn.Activation(activation='relu'))
                self.features.add(nn.Dropout(0.5))

            self.output = nn.Dense(classes)  # entrada: (b, 4096) e saida: (b, 10)

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x

In [4]:
num_epochs, lr, batch_size, wd_lambda = 20, 0.001, 100, 0.0001
    
net = AlexNet()
net.initialize(init.Normal(sigma=0.01), ctx=ctx)

# função de custo (ou loss)
loss = gloss.SoftmaxCrossEntropyLoss()

# carregamento do dado: fashion mnist
train_iter, test_iter = load_data_cifar10(batch_size, resize=227)

# trainer do gluon
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd_lambda})

# treinamento e validação via MXNet
train_validate(net, train_iter, test_iter, batch_size, trainer, loss, 
               ctx, num_epochs)

training on gpu(0)
epoch 1, train loss 1.4153, train acc 0.516, test loss 1.3438, test acc 0.540, time 189.2 sec
epoch 2, train loss 0.9543, train acc 0.675, test loss 0.8598, test acc 0.710, time 119.1 sec
epoch 3, train loss 0.8012, train acc 0.725, test loss 0.7977, test acc 0.725, time 159.1 sec
epoch 4, train loss 0.7022, train acc 0.758, test loss 0.7474, test acc 0.743, time 157.6 sec
epoch 5, train loss 0.6231, train acc 0.786, test loss 0.9733, test acc 0.686, time 138.7 sec
epoch 6, train loss 0.5766, train acc 0.803, test loss 0.6880, test acc 0.763, time 70.4 sec
epoch 7, train loss 0.5156, train acc 0.822, test loss 0.6579, test acc 0.780, time 70.4 sec
epoch 8, train loss 0.4814, train acc 0.834, test loss 0.6601, test acc 0.771, time 70.8 sec
epoch 9, train loss 0.4379, train acc 0.848, test loss 0.7038, test acc 0.764, time 71.0 sec
epoch 10, train loss 0.4064, train acc 0.858, test loss 0.6456, test acc 0.785, time 71.1 sec
epoch 11, train loss 0.3711, train acc 0.870,