# PyTorch Neural Networks

Import torch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as func
import torch.utils.data as Data

We'll also need NumPy and Matplotlib:

In [None]:
import numpy as np
import matplotlib.pyplot as plt

We'll also use the MNIST dataset. The most popular datasets are available in the `torchvision` library; here is an example of how to load one:

In [None]:
import torchvision

# MNIST training split: stored under the current directory (downloaded when
# missing), with every image converted to a tensor by ToTensor().
train = torchvision.datasets.MNIST(
    root='./',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

In [None]:
# Train Data
train.data.size()

In [None]:
# Train Labels
train.targets.size()

In [None]:
# MNIST test split, loaded with the same storage directory and transform
# as the training split.
test = torchvision.datasets.MNIST(
    root='./',
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

In [None]:
# Test Data
test.data.size()

In [None]:
# Test Labels
test.targets.size()

## 1. Network topology

### 1.1. Layers

Linear dense layers (as well as most common layers) are available in `torch.nn`

In [None]:
# Three dense layers whose sizes chain together: 784 -> 500 -> 200 -> 2
first_linear = nn.Linear(in_features=784, out_features=500)
second_linear = nn.Linear(in_features=500, out_features=200)
third_linear = nn.Linear(in_features=200, out_features=2)

Convolutional Layers:

In [None]:
# Conv2d(in_channels, out_channels, kernel_size): the third positional
# argument is the kernel size; the stride is left at its default.
first_conv = nn.Conv2d(in_channels=1, out_channels=5, kernel_size=(5, 5))
second_conv = nn.Conv2d(in_channels=5, out_channels=10, kernel_size=(3, 3))

Recurrent Layers:

In [None]:
# Single-layer LSTM mapping 10 input features to a 50-dimensional hidden state
first_rnn = nn.LSTM(input_size=10, hidden_size=50, num_layers=1)

### 1.2. Normalization and Regularization Layers
The most common normalization and regularization strategies are also implemented by default in `torch.nn`:

In [None]:
nn.BatchNorm1d(100)

In [None]:
nn.Dropout(p=0.15)

### 1.3. Activation functions
A class version of each activation function is available under `torch.nn`, and the function versions are under `torch.nn.functional`. Here are the most commonly used ones:

#### Rectified Linear Unit (ReLU)

In [None]:
nn.ReLU()

In [None]:
func.relu

#### Sigmoid

In [None]:
nn.Sigmoid()

In [None]:
func.sigmoid

#### Hyperbolic Tangent

In [None]:
nn.Tanh()

In [None]:
func.tanh

#### Leaky ReLU

In [None]:
nn.LeakyReLU()

In [None]:
func.leaky_relu

#### Softmax

In [None]:
nn.Softmax()

In [None]:
func.softmax

## 2. Network modules
### 2.1. Sequential module
Sequential modules are built by passing a sequence of layers and activation functions to the `nn.Sequential` constructor. The layers and functions are applied in the order given, so the output size of each layer must match the input size of the next one.

Let's build a simple autoencoder with 4 layers and 2 latent dimensions:

#### Encoder

In [None]:
# Encoder stack: 784 -> 128 -> 64 -> 12 -> 2, with a ReLU after every layer
# except the last, which emits the 2-dimensional latent code.
encoder = nn.Sequential(
    nn.Linear(28 * 28, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 12),
    nn.ReLU(),
    nn.Linear(12, 2),
)

In [None]:
print(encoder)

#### Decoder

In [None]:
# Decoder stack mirroring the encoder: 2 -> 12 -> 64 -> 128 -> 784.
# The final Sigmoid keeps every reconstructed pixel in [0, 1].
decoder = nn.Sequential(
    nn.Linear(2, 12),
    nn.ReLU(),
    nn.Linear(12, 64),
    nn.ReLU(),
    nn.Linear(64, 128),
    nn.ReLU(),
    nn.Linear(128, 28 * 28),
    nn.Sigmoid(),
)

In [None]:
print(decoder)

#### Autoencoder

In [None]:
# Chain the two Sequential modules: the encoder's output feeds the decoder
autoencoder = nn.Sequential(encoder, decoder)

In [None]:
print(autoencoder)

### 2.2. Module Class
More complex modules and non-sequential topologies must be implemented with custom classes inheriting `torch.nn.Module`. The classes must define the method `forward()` which describes how to generate the output given the input data.

Let's build the same autoencoder using `Module` classes:
#### Encoder

In [None]:
class Encoder(nn.Module):
    """Fully connected encoder: 784 -> 128 -> 64 -> 12 -> ``lat_dim``.

    Args:
        lat_dim: number of output features of the last layer, i.e. the
            size of the latent code.
    """

    def __init__(self, lat_dim):
        super().__init__()

        # Attribute names double as parameter names, so they are kept as-is.
        self.layer_1 = nn.Linear(28 * 28, 128)
        self.layer_2 = nn.Linear(128, 64)
        self.layer_3 = nn.Linear(64, 12)
        self.layer_4 = nn.Linear(12, lat_dim)

    def forward(self, data_in):
        """Return the latent code for ``data_in``.

        ``data_in`` must have 28*28 = 784 features in its last dimension.
        The three hidden layers are followed by a ReLU; the last layer is
        left linear.
        """
        hidden = data_in
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            hidden = func.relu(layer(hidden))
        return self.layer_4(hidden)

#### Decoder

In [None]:
class Decoder(nn.Module):
    """Fully connected decoder: ``lat_dim`` -> 12 -> 64 -> 128 -> 784.

    Args:
        lat_dim: number of input features of the first layer, i.e. the
            size of the latent code.
    """

    def __init__(self, lat_dim):
        super().__init__()

        # Attribute names double as parameter names, so they are kept as-is.
        self.layer_1 = nn.Linear(lat_dim, 12)
        self.layer_2 = nn.Linear(12, 64)
        self.layer_3 = nn.Linear(64, 128)
        self.layer_4 = nn.Linear(128, 28 * 28)

    def forward(self, z):
        """Return the reconstruction for the latent code ``z``.

        The three hidden layers are followed by a ReLU; the output layer is
        squashed by a sigmoid, so every output value lies in [0, 1].
        """
        hidden = z
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            hidden = func.relu(layer(hidden))
        return torch.sigmoid(self.layer_4(hidden))

#### Autoencoder

In [None]:
class AutoEncoder(nn.Module):
    """Autoencoder built from an ``Encoder`` and a ``Decoder`` that share
    the same latent size.

    Args:
        lat_dim: size of the latent code passed between the two halves.
    """

    def __init__(self, lat_dim):
        super().__init__()

        self.encoder = Encoder(lat_dim)
        self.decoder = Decoder(lat_dim)

    def forward(self, data_in):
        """Encode ``data_in`` and return the decoder's reconstruction."""
        return self.decoder(self.encoder(data_in))

In [None]:
# Class-based autoencoder with a 2-dimensional latent space
autoencoder = AutoEncoder(lat_dim=2)

In [None]:
print(list(autoencoder.parameters()))

## 3. Training

### 3.1. Optimizer
To train a network we need to instantiate a _parameter optimizer_ and specify the parameters to be optimized. The parameter optimizer provides a `step()` function which will update the parameters using the gradients stored in the parameters.

In [None]:
# Adam over every parameter of the autoencoder, with one shared learning rate
optimizer = torch.optim.Adam(params=autoencoder.parameters(), lr=1e-3)

#### Different Learning Rates
Different learning rates can be specified for a specific set of parameters. For instance, if we want to train the decoder faster, we could define the optimizer as follows:

In [None]:
# One parameter group per half of the model, each with its own learning rate.
# The groups are taken from `autoencoder` (the AutoEncoder instance that is
# trained below); the standalone `encoder`/`decoder` Sequential modules from
# section 2.1 are separate networks whose parameters are not part of it.
uneven_optimizer = torch.optim.Adam([
    {'params': autoencoder.encoder.parameters(), 'lr': 1e-3},
    {'params': autoencoder.decoder.parameters(), 'lr': 5e-3},
])

In [None]:
uneven_optimizer

We could also use a different learning rate for each layer by passing one dictionary per layer, with `first_layer.parameters()` as the `'params'` entry of the first one, and so on.

#### Multiple Optimizers
Note that many optimizers may be instantiated simultaneously, since they don't interfere with each other as long as each gradient step is performed only with one optimizer. That means we could train the first N steps with `uneven_optimizer.step()` and then continue with even learning rates using `optimizer.step()`.

### 3.2. Loss function
The loss function is the metric to minimize during training. It is usually defined as a function of the input and the output data. Loss functions are defined in `torch.nn.functional`, but we can always define them manually by applying arithmetic operations to the data.

The goal of an autoencoder is to produce outputs that resemble its inputs, so we will use the binary cross-entropy between input and output as the loss:

In [None]:
func.binary_cross_entropy

### 3.3. Mini-batch training
Once an optimizer and a Loss function have been chosen, we can proceed to train our network with mini-batches.

We will compute the gradients of the parameters on batches of **100 training samples** for **10 epochs**.

>Note that the gradients of the parameters are computed and stored in the parameter tensors themselves when we call `backward()`. `optimizer.step()` is called immediately afterwards to update the parameters, and the stored gradients are reset with `optimizer.zero_grad()` before the next backward pass. If `zero_grad()` were not called between steps, the gradients would keep accumulating!

In [None]:
%matplotlib notebook

n_test_img = 4
epochs     = 10
batch_size = 100

# Reshape data
train_samples = train.data.view(-1, 28*28).type(torch.float32)/255.0
test_samples  = test.data.view(-1,28*28).type(torch.float32)/255.0

# Lists to store training losses
train_loss = []
test_loss  = []

# Plot test input images
test_imgs = test.data[0:n_test_img,:].type(torch.float32).view(-1,28*28)/255.0
f, a = plt.subplots(2, n_test_img, figsize=(8, 3))
for i in range(n_test_img):
    a[0][i].imshow(255-np.reshape(test_imgs.data.numpy()[i], (28,28)), cmap='gray')
    a[0][i].set_xticks(())
    a[0][i].set_yticks(())
    
loss_text = f.text(0, 0, "epoch: 0, loss: 0")

# Data iterator
train_batches = Data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True)

for e in np.arange(epochs):
    batch_loss = 0
    for batch_no, (batch, batch_labels) in enumerate(train_batches):
        # Input and target data (flatten)
        b_in = batch.view(-1, 28*28)
        target = batch.view(-1, 28*28)
        # Forward pass of the data through the network
        out  = autoencoder(b_in)
        # Compute the Loss
        loss = func.binary_cross_entropy(out, target)
        batch_loss += loss
        # Reset the gradients
        optimizer.zero_grad()
        loss.backward()
        # Update the gradients
        optimizer.step()

        # Test images
        if batch_no % 50 == 0:
            test_out = autoencoder(test_imgs)
            for i in range(n_test_img):
                a[1][i].imshow(1.0-np.reshape(test_out.data.numpy()[i], (28,28)), cmap='gray')
                a[1][i].set_xticks(())
                a[1][i].set_yticks(())
            loss_text.set_text("epoch: {}, loss: {:.3f}".format(e+1, loss))
            f.canvas.draw()
            
    # Compute batch loss
    train_loss.append(batch_loss/batch_no)
    test_loss.append(func.binary_cross_entropy(autoencoder(test_samples), test_samples))


### Loss plots

In [None]:
plt.figure()
# float() accepts plain numbers as well as 0-dim tensors, including tensors
# that still require grad, which cannot be converted to NumPy directly
plt.plot([float(value) for value in train_loss])
plt.plot([float(value) for value in test_loss])
plt.xlabel("epoch")
plt.ylabel("binary cross-entropy")
plt.legend(["train", "test"])

## 4. Results
### 4.1. Loss on test dataset
We can easily compute the Loss by evaluating the loss function on the output data:

#### Train Loss

In [None]:
print("Train loss: {}".format(train_loss[-1]))

#### Test Loss

In [None]:
print("Test loss: {}".format(test_loss[-1]))

### 4.2. Latent space
The projections of the test images in the latent space after training look like this:

In [None]:
# Encode the whole test set and detach the result from the autograd graph
# before handing it to NumPy
test_lat = autoencoder.encoder(test_samples).detach().numpy()

# One point per test image, coloured by its digit label
plt.figure(figsize=(10, 7))
plt.scatter(
    x=test_lat[:, 0],
    y=test_lat[:, 1],
    c=test.targets.numpy(),
    s=1.7,
)