In [1]:
# Import libraries

import torch
from torch import nn
from skimage import data

## Pooling

Documentation: https://pytorch.org/docs/stable/nn.html#torch.nn.MaxPool2d

```python
torch.nn.MaxPool2d(kernel_size, stride=None, padding=0)
```

**```kernel_size```** <br>
*Field of View* size. It can be a tuple or a single number. Ex: ```kernel_size = 3``` will set FoV of $3 \times 3$

**```stride```** <br>
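Step size with which the pooling window slides over the input. When left as ```None``` (the default), it is set equal to ```kernel_size```, so consecutive windows do not overlap.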

**```padding```** <br>
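Implicit padding added on both sides of each spatial dimension before pooling. For max pooling the padded positions are filled with $-\infty$ (not zeros), so a padded position can never be selected as the maximum.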

The pooling layer expects an input with **at least** 3 dimensions ($C \times H \times W$), but in general the network will also provide the batch dimension ($B \times C \times H \times W$).

In [6]:
# Build a single-channel 3x3 input laid out as (C, H, W) = (1, 3, 3).
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor
# constructor; the resulting values and dtype (float32) are unchanged.

tns = torch.tensor([[[1, 2, 3],
                     [4, 5, 6],
                     [7, 8, 9]]], dtype=torch.float32)

# 2D max pooling with a 2x2 window and stride 1: the window advances one
# position at a time, so the 3x3 input produces a 2x2 output holding the
# maximum of each window.

pool = nn.MaxPool2d(kernel_size=2, stride=1)
saida = pool(tns)

# Show the input and the pooled output together with their sizes
print(tns.size())
print(tns)
print(saida.size())
print(saida)

torch.Size([1, 3, 3])
tensor([[[1., 2., 3.],
         [4., 5., 6.],
         [7., 8., 9.]]])
torch.Size([1, 2, 2])
tensor([[[5., 6.],
         [8., 9.]]])


When processing data with multiple channels, the pooling layer processes each input channel separately instead of processing all channels as in a convolutional layer. This means that **the number of output channels for the pooling layer is the same as the number of input channels**.

Let's process the image of the astronaut below.

In [8]:
# Convolutional layer mapping 3 input channels (RGB) to 16 feature maps.
# A 3x3 kernel with padding=1 leaves the spatial size unchanged.

conv = nn.Conv2d(in_channels=3, out_channels=16,
                 kernel_size=3, padding=1)

# Sample image shipped with scikit-image
rgb = data.astronaut()

# Convert the image array to a float32 tensor with an explicit dtype
# (torch.tensor replaces the legacy torch.Tensor constructor, whose
# conversion to float32 is implicit), then move the channel axis first,
# (H, W, C) -> (C, H, W), and add a leading batch dimension of size 1.

rgb_tns = torch.tensor(rgb, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)

# Apply the convolutional layer and report the shape of the activation map

mapa_de_ativacao = conv(rgb_tns)
print('Feature Map:', mapa_de_ativacao.shape)

Feature Map: torch.Size([1, 16, 512, 512])


In [9]:
# 2x2 max pooling applied to the activation map. No stride is given, so the
# window moves by its own size and each spatial dimension is halved.

pool = nn.MaxPool2d(2)
saida = pool(mapa_de_ativacao)
print(saida.shape)

torch.Size([1, 16, 256, 256])


## Batch Normalization

Documentation: https://pytorch.org/docs/stable/nn.html#torch.nn.BatchNorm2d

```python
torch.nn.BatchNorm2d(num_features)
```

**```num_features```**<br>
Number of channels $C$ of the input. The scale $\boldsymbol{\gamma}$ and the shift $\boldsymbol{\beta}$ are learned individually for each input channel, so for intermediate layer activations this value corresponds to the **number of feature maps**.

In [10]:
# Convolutional block; the layers are applied in this order:
#   1. Conv2d      - 3 input channels -> 32 feature maps, 3x3 kernel, padding 1
#   2. BatchNorm2d - normalizes each of the 32 feature maps
#   3. ReLU        - activation
#   4. MaxPool2d   - 10x10 window (stride defaults to the kernel size)

blococonv = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
    nn.BatchNorm2d(num_features=32),
    nn.ReLU(),
    nn.MaxPool2d(10),
)
print(blococonv)

# Build a minibatch by stacking 12 copies of the astronaut tensor along the
# batch dimension

minibatch = torch.cat([rgb_tns] * 12, dim=0)

# Show the size of the input batch, run it through the block, then show the
# size of the output (512 // 10 = 51 along each spatial dimension)
print(minibatch.shape)
saida = blococonv(minibatch)
print(saida.shape)

Sequential(
  (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): MaxPool2d(kernel_size=10, stride=10, padding=0, dilation=1, ceil_mode=False)
)
torch.Size([12, 3, 512, 512])
torch.Size([12, 32, 51, 51])
