# Visualization of a Simple Stochastic Gradient Descent

Here we visualize the minimization of the loss with the SGD algorithm in its variants vanilla GD, batch SGD and minibatch SGD. For this we consider a linear model.

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

## Dataset

Let's create a [PyTorch dataset](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html). In PyTorch, the `Dataset` object is an abstract class from `torch.utils.data` that we subclass to create our own custom dataset. It defines how to access individual data samples and how many samples there are.

In [None]:
class LinearDataset(Dataset):
    """Synthetic samples drawn from a noisy linear function.

    The training data is generated on the fly from the linear function

              y = w * x + b

    where `w` is the slope and `b` is the offset. `x` is drawn
    uniformly between -0.5 and 0.5, and uniform random noise between
    -0.1 and 0.1 is added to the function value `y`.

    Samples are not stored: every access draws new values from the
    dataset's own random number generator, so the returned sample
    depends on the state of that generator and not on the requested
    index. Re-seed `rnd_gen_data` to reproduce a sequence of samples.

    Args:
        slope: The slope `w` of the linear function.
        offset: The offset `b` of the linear function.
        nsamples: The number of samples reported by `len()`.
    """

    def __init__(self, slope, offset, nsamples):
        super().__init__()
        self.w = slope
        self.b = offset
        self.nsamples = nsamples

        # A dedicated random number generator for the dataset
        self.rnd_gen_data = torch.Generator().manual_seed(1)

    def __getitem__(self, idx):
        """Draw one noisy sample `(x, y)` as two 1-element float tensors.

        Raises:
            IndexError: If `idx` does not address one of the `nsamples`
                samples (negative indices count from the end).
        """
        # Without this check, plain iteration over the dataset
        # (`for sample in dataset`) would never terminate, because
        # Python stops such loops only when `IndexError` is raised.
        if not -self.nsamples <= idx < self.nsamples:
            raise IndexError(
                f'index {idx} is out of range for a dataset '
                f'with {self.nsamples} samples'
            )

        x = torch.empty(1, dtype=torch.float32).uniform_(
            -0.5, 0.5, generator=self.rnd_gen_data
        )
        y = self.w * x + self.b
        noise = torch.empty(1, dtype=torch.float32).uniform_(
            -0.1, 0.1, generator=self.rnd_gen_data
        )
        return (x, y + noise)

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.nsamples

We generate a random vector $x \in [-0.5, 0.5]$ and evaluate the linear function $y = 2x$ on it (we set the offset $b$ to be 0).
We add noise to $y$ and that gives us $y \in [-1.1, 1.1]$.

In [None]:
# Noisy samples of the line y = 2x (slope 2, zero offset)
train_set = LinearDataset(slope=2.0, offset=0.0, nsamples=1500)

# Batches of 128 samples, requested in order (no shuffling); the last,
# incomplete batch is dropped so that all batches have the same size
train_loader = DataLoader(train_set, batch_size=128, shuffle=False, drop_last=True)

In [None]:
# Inspect the shapes of the batches produced by the loader
for batch_x, batch_y in train_loader:
    print(batch_x.shape, batch_y.shape)

In [None]:
# Restart the dataset's random generator before collecting the points.
# The training loop re-seeds it with 1 at every epoch, so this makes the
# plotted points the same ones the model is trained on, regardless of
# how many times the loader has been iterated before this cell.
train_set.rnd_gen_data.manual_seed(1)

# Gather all batches of one pass over the loader into two flat tensors
x_batches, y_batches = zip(*train_loader)
x_plot = torch.cat(x_batches)
y_plot = torch.cat(y_batches)

# Scatter plot of the training data
plt.plot(x_plot, y_plot, '.')
plt.xlabel('x')
plt.ylabel('y')
plt.grid()
plt.show()

## Defining the model, a loss function and an optimizer

In [None]:
# Use the first GPU when CUDA is available and fall back to the CPU
# otherwise, so that the notebook also runs on machines without a GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Model: a single linear unit computing y = wx + b (1 input feature,
# 1 output feature), placed on the selected device
model = torch.nn.Linear(in_features=1, out_features=1, device=device)

# Loss function: mean squared error
loss_fn = torch.nn.MSELoss()

# Optimizer: stochastic gradient descent over the model's parameters
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

## Training

In [None]:
def log(model, loss):
    """Snapshot the model's parameters and the loss for plotting.

    Returns the tuple `(weight, bias, loss)`, each converted to a
    plain Python number with `.item()`.
    """
    weight = model.weight.item()
    bias = model.bias.item()
    return weight, bias, loss.item()

In [None]:
num_epochs = 25
history = []

# Loop over epochs
for epoch in range(num_epochs):
    # Re-seed the dataset's generator at the start of every epoch so
    # that each epoch draws the same set of random samples
    train_set.rnd_gen_data.manual_seed(1)

    # Loop over batches
    for inputs, targets in train_loader:
        # Reset the automatic differentiation record
        optimizer.zero_grad()

        # Forward pass: evaluate the model on the batch, then the loss
        # function on the obtained predictions and the targets
        predictions = model(inputs.to(device))
        loss = loss_fn(predictions, targets.to(device))

        # [not part of the training] keep values for plotting later
        history.append(log(model, loss))

        # Backpropagation -> gradients
        loss.backward()

        # Update the weights with the gradients
        optimizer.step()

## Plots

In [None]:
# Columns of the recorded history: slope (weight), offset (bias), loss
history_arr = np.array(history)
slope_hist = history_arr[:, 0]
offset_hist = history_arr[:, 1]
loss_hist = history_arr[:, 2]

# The model's line at the first and at the last recorded step
initial_fit = slope_hist[0] * x_plot + offset_hist[0]
trained_fit = slope_hist[-1] * x_plot + offset_hist[-1]

# Wide figure for the two side-by-side panels
matplotlib.rcParams['figure.figsize'] = (10, 4)

# Left panel: loss at every training step
plt.subplot(1, 2, 1)
plt.plot(loss_hist, 'r-')
plt.xlabel('Training steps')
plt.ylabel('Loss')
plt.grid()

# Right panel: the data together with the initial and the trained model
plt.subplot(1, 2, 2)
plt.plot(x_plot, y_plot, '.', alpha=.1)
plt.plot(x_plot, initial_fit, 'r-', label='model (initial step)', lw=3)
plt.plot(x_plot, trained_fit, 'b-', label='model (trained)')
plt.xlabel('x')
plt.ylabel('y')
plt.grid()
plt.legend()

plt.tight_layout()
plt.show()

# Back to the figure size used for single plots
matplotlib.rcParams['figure.figsize'] = (6, 4)

In [None]:
def loss_function_field(m, n, xref, yref):
    '''Utility function for plotting the loss.

    Returns the mean squared error of the line `y = m * x + n`
    with respect to the reference data `(xref, yref)`.
    '''
    residual = yref - m * xref - n
    return np.mean(residual * residual)

# Grid of candidate slopes (w) and offsets (b) on which the loss is evaluated
_m = np.arange(-0.5, 4.51, 0.1)
_n = np.arange(-1., 1.01, 0.1)
M, N = np.meshgrid(_m, _n)

# Convert the reference data once instead of at every grid point
xref = x_plot.numpy()
yref = y_plot.numpy()

# Loss of the linear model at every (slope, offset) pair of the grid
Z = np.array(
    [[loss_function_field(slope, offset, xref, yref)
      for slope, offset in zip(slope_row, offset_row)]
     for slope_row, offset_row in zip(M, N)],
    dtype=float,
)

# Wide figure for the loss landscape
matplotlib.rcParams['figure.figsize'] = (16.0, 5.2)

# Dotted contour lines (only the first 6 levels are labeled) on top of
# a lightly shaded filled contour of the same field
cp = plt.contour(M, N, Z, 16, vmin=Z.min(), vmax=Z.max(), alpha=0.99, colors='k', linestyles=':', linewidths=0.8)
plt.contourf(M, N, Z, 60, vmin=Z.min(), vmax=Z.max(), alpha=0.2, cmap='Blues')
plt.clabel(cp, cp.levels[:6])
plt.colorbar()

# Red cross: the slope and offset used to generate the data
plt.plot([train_set.w], [train_set.b], 'rx', ms=10)

# Black path: the (slope, offset) pairs recorded during training
plt.plot(slope_hist, offset_hist, '.-', lw=2, c='k')

plt.xlim([_m.min(), _m.max()])
plt.ylim([_n.min(), _n.max()])
plt.xlabel('w')
plt.ylabel('b')
plt.show()

# Back to the figure size used for single plots
matplotlib.rcParams['figure.figsize'] = (6, 4)

### Exercises

1. Try different batch sizes and see the effect on the path of the SGD:
 * `batch_size = 1500` - The whole dataset
 * `batch_size = 128`  - Something between 1 and the size of the whole dataset
 * `batch_size = 1`    - This is perhaps too slow. Maybe try a smaller batch size, such as 16

2. Why is the training with `batch_size = 1` so slow compared to `batch_size = 128`?
3. Try different learning rates and see what the path of the SGD looks like.

### Technical check

By this point, you should be comfortable with the following:
 * Explaining what a batch is and why batching is important in training neural networks.
 * Explaining what the learning rate is and its effect on the training.
 * Implementing a custom Dataset and wrapping it in a DataLoader using PyTorch.
 * Transferring data to GPU memory using `.to(device)` for faster computation.
 * Instantiating a model, defining a loss function, and setting up an optimizer in PyTorch.
 * Writing a full training loop that iterates over batches, computes loss, and updates model parameters.