# Deep Learning & Applied AI

We recommend going through the notebook using Google Colaboratory.

# Tutorial 5: Autograd and Modules

In this tutorial, we will cover:

- Autograd, back-propagation
- Modules, `torch.nn`

Based on original material by Dr. Luca Moschella, Dr. Antonio Norelli and Dr. Marco Fumero.

Course:

- Website and notebooks will be available at https://github.com/erodola/DLAI-s2-2024/

## Import dependencies (run the following cells)

In [None]:
# @title import dependencies

from typing import Mapping, Union, Optional

import numpy as np
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import plotly.graph_objects as go

from torchvision import datasets, transforms
from tqdm.notebook import tqdm

In [None]:
# @title reproducibility stuff

import random

# Seed the Python, NumPy and PyTorch random number generators.
random.seed(0)
np.random.seed(42)
torch.manual_seed(42)

# Seed the CUDA generator; kept after torch.manual_seed so that this is the
# CUDA seed left in effect.
torch.cuda.manual_seed(0)

# cuDNN settings for reproducible runs.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True  # Note that this Deterministic mode can have a performance impact

In [None]:
# @title utility functions

from typing import Callable, Union, Sequence
import math

def peaks(meshgrid: torch.Tensor) -> torch.Tensor:
  """
  Evaluate the "Peaks" function (scaled by 0.25), which has multiple local minima.

  :params meshgrid: tensor of shape [..., 2], the (x, y) coordinates
  :returns: tensor with the function value at each (x, y) coordinate
  """
  coords = torch.as_tensor(meshgrid, dtype=torch.float)
  x = coords[..., 0]
  y = coords[..., 1]

  # The three exponential bumps that are combined into the surface
  first_bump = 3 * (1 - x) ** 2 * torch.exp(-x ** 2 - (y + 1) ** 2)
  second_bump = 10 * (x / 5 - x ** 3 - y ** 5) * torch.exp(-x ** 2 - y ** 2)
  third_bump = 1 / 3 * torch.exp(-(x + 1) ** 2 - y ** 2)

  return 0.25 * (first_bump - second_bump - third_bump)


def rastrigin(meshgrid: torch.Tensor, shift: int = 0) -> torch.Tensor:
  """
  "Rastrigin" function with `A = 3`
  https://en.wikipedia.org/wiki/Rastrigin_function

  :params meshgrid: tensor of shape [..., 2], the (x, y) coordinates
  :params shift: value subtracted from each coordinate inside the quadratic
                 terms (the cosine terms use the unshifted coordinates)
  :returns: tensor with the function value at each (x, y) coordinate
  """
  coords = torch.as_tensor(meshgrid, dtype=torch.float)
  x = coords[..., 0]
  y = coords[..., 1]

  amplitude = 3
  two_pi = 2 * torch.tensor(math.pi, dtype=torch.float, device=x.device)

  x_term = (x - shift) ** 2 - amplitude * torch.cos(two_pi * x)
  y_term = (y - shift) ** 2 - amplitude * torch.cos(two_pi * y)

  return amplitude * 2 + (x_term + y_term)


def rosenbrock(meshgrid: torch.Tensor) -> torch.Tensor:
  """
  "Rosenbrock" function with `a = 1` and `b = 100`
  https://en.wikipedia.org/wiki/Rosenbrock_function

  It has a global minimum at $(x , y) = (a, a^2) = (1, 1)$

  :params meshgrid: tensor of shape [..., 2], the (x, y) coordinates
  :returns: tensor with the function value at each (x, y) coordinate
  """
  coords = torch.as_tensor(meshgrid, dtype=torch.float)
  x, y = coords[..., 0], coords[..., 1]

  a, b = 1, 100
  offset_term = (a - x) ** 2
  valley_term = b * (y - x ** 2) ** 2
  return offset_term + valley_term


def simple_fn(meshgrid: torch.Tensor) -> torch.Tensor:
  """
  Evaluate $-1 / (1 + x^2 + y^2)$ at every coordinate.

  :params meshgrid: tensor of shape [..., 2], the (x, y) coordinates
  :returns: tensor with the function value at each (x, y) coordinate
  """
  coords = torch.as_tensor(meshgrid, dtype=torch.float)
  x, y = coords[..., 0], coords[..., 1]

  denominator = 1 + x ** 2 + y ** 2
  return -1 / denominator

def simple_fn2(meshgrid: torch.Tensor) -> torch.Tensor:
  """
  Evaluate $(1 + x^2 + y^2)^{1/2}$ at every coordinate.

  :params meshgrid: tensor of shape [..., 2], the (x, y) coordinates
  :returns: tensor with the function value at each (x, y) coordinate
  """
  coords = torch.as_tensor(meshgrid, dtype=torch.float)
  x, y = coords[..., 0], coords[..., 1]

  radicand = 1 + x ** 2 + y ** 2
  return radicand ** (1/2)

In [None]:
# @title utility plot functions
import plotly.express as px

def plot_landscape(
    fn: Callable[..., torch.Tensor],
    resolution: int = 100,
    lim: int = 3,
    height: int = 900,
    landscape_opacity: float = 1.0,
    title: Optional[str] = None,
    autoshow: bool = False,
    **kwargs
) -> go.Figure:
    r"""Plot the landscape defined by the function `fn`.

    Creates a domain grid $(x, y) \in R^2$ with $x \in [-lim, lim]$ and
    $y \in [-lim, lim]$. The number of points in this grid is resolution**2.

    :param fn: function called as `fn(meshgrid, **kwargs)`, where `meshgrid` is a
        tensor of shape [resolution, resolution, 2] holding the (x, y) coordinates
    :param resolution: number of samples along each axis
    :param lim: half-width of the square domain
    :param height: height of the figure, forwarded to the figure layout
    :param landscape_opacity: opacity of the plotted surface
    :param title: the title of the plot, if None defaults to the fn name
    :param autoshow: if True, calls fig.show() before returning the figure
    :param kwargs: extra keyword arguments forwarded to `fn`

    :returns: the figure that contains the plot
    """
    xx = torch.linspace(-lim, lim, resolution)
    yy = torch.linspace(-lim, lim, resolution)

    # Expand the two axes into a dense grid: xx varies along rows, yy along columns
    yy = yy.repeat(yy.shape[0], 1)
    xx = xx.unsqueeze(-1).repeat(1, xx.shape[0])
    meshgrid = torch.stack((xx, yy), dim=-1)
    zz = fn(meshgrid, **kwargs)

    xx = xx.cpu().detach()
    yy = yy.cpu().detach()
    zz = zz.cpu().detach()

    fig = go.Figure(data=[go.Surface(z=zz, x=xx, y=yy, opacity=landscape_opacity,
                                     cmid=0,
                                     colorscale='Viridis')])
    fig.update_traces(
        contours_z=dict(
            show=True, usecolormap=True, highlightcolor="lightgray", project_z=True
        )
    )

    # Fall back to the function name when no title is given; callables without
    # a `__name__` (e.g. functools.partial objects) simply get no title.
    if title is None:
        title = getattr(fn, "__name__", None)

    layout = dict(height=height)
    if title is not None:
        layout["title_text"] = title
    fig.update_layout(**layout)

    if autoshow:
        fig.show()
    return fig


def plot_points_over_landscape(
    fn: Callable[..., torch.Tensor],
    points: Optional[Union[torch.Tensor, Sequence[Sequence[float]]]] = None,
    resolution: int = 100,
    lim: int = 3,
    landscape_opacity: float = 1.0,
    height: int = 900,
    title: Optional[str] = None,
    autoshow: bool = False,
) -> go.Figure:
    """ Plot points over the landscape defined by the function `fn`

    :param fn: an universal function $R^2 -> R$
    :param points: tensor (or nested sequence) of shape [..., 3] with the
        (x, y, z) coordinates of the points; if None, only the landscape is drawn
    :param resolution: number of samples along each axis of the landscape grid
    :param lim: half-width of the square domain of the landscape
    :param landscape_opacity: opacity of the landscape surface
    :param height: height of the figure
    :param title: the title of the plots, if None defaults to  the fn name
    :param autoshow: if True, calls fig.show() before returning the figure

    :returns: the figure that contains the plot
    """
    # Resolve the documented default here so the title is set regardless of
    # what the landscape helper does with it.
    if title is None:
        title = getattr(fn, "__name__", None)

    fig = plot_landscape(
        fn,
        resolution=resolution,
        lim=lim,
        height=height,
        landscape_opacity=landscape_opacity,
        title=title
    )
    if title is not None:
        fig.update_layout(title_text=title)

    if points is not None:
        points = torch.as_tensor(points).detach().cpu()

        # Flatten any leading dimensions so that all the points go into a
        # single trace (one trace per point would only duplicate the markers).
        x_points = points[..., 0].reshape(-1).tolist()
        y_points = points[..., 1].reshape(-1).tolist()
        z_points = points[..., 2].reshape(-1).tolist()

        fig.add_trace(
            go.Scatter3d(
                visible=True,
                showlegend=False,
                mode="markers",
                marker=dict(size=6, color="darkred", symbol="circle"),
                x=x_points,
                y=y_points,
                z=z_points,
            )
        )

    if autoshow:
        fig.show()

    return fig

# Intro


# Autograd: automatic differentiation



Let's begin.

---

We have already seen many PyTorch features, from linear algebra to the useful ``Dataset`` and ``Optimizer`` classes.

And yet, with little effort we could have used the Numpy or Scikit-learn libraries instead of Pytorch to accomplish the feats of the last notebooks.

Today we will go through the main _raison d'être_ of Pytorch; the ``autograd`` package.

### Differentiable programming (optimization)


>Differentiable programming is a programming paradigm in which the programs can be differentiated throughout, usually via **automatic differentiation**. This allows for gradient based optimization of parameters in the program, often via gradient descent. Differentiable programming has found use in a wide variety of areas, particularly scientific computing and artificial intelligence. (*Wikipedia [entry](https://en.wikipedia.org/wiki/Differentiable_programming) for Differentiable programming*)

The ``autograd`` package of PyTorch is here to provide this *automatic differentiation* for all operations on Tensors.

It follows a *define-by-run* philosophy, which means that the computational graph used for backpropagation is defined as your code is executed. This allows, for instance, executing iterative optimization processes where every single iteration may be different (!).

This is in contrast to the (admittedly more intuitive) idea of running automatic differentiation over a *static computational graph* (which is what TensorFlow 1.0 did).

Both of these approaches are based on *reverse-mode automatic differentiation*.

As seen in theory class, it scales to **high-dimensional data** and **very complex computational graphs**, unlike forward-mode automatic differentiation.

In the context of Neural Networks we refer to reverse-mode automatic differentiation as *backpropagation*.

## Basics

Let's start by defining a tensor $x$ that may appear in some computation like $f(x) = x^2 + x^3$.

Suppose we want to calculate its derivative at the point $x=42$:
$$\frac{\partial f}{\partial x}\Bigr\rvert_{x=42}$$

PyTorch does it through the *reverse mode automatic differentiation*, composed of a forward and backward pass, as seen in lecture.

Think of the forward pass this way: it performs some computation, and **the execution of each instruction contributes to the construction of the computational graph**.

In [None]:
x = torch.tensor(42., requires_grad=True)  # we'll compute the gradient w.r.t. this variable!
x2 = x ** 2
x3 = x ** 3
f = x2 + x3

Now the backward pass, where we'll appreciate the *automatic* part of the differentiation: you just need to call the `backward()` method on your output $f$.

In [None]:
f.backward()

Now you have $\frac{\partial f}{\partial x}\Bigr\rvert_{x=42}$ in the `grad` of $x$

In [None]:
x.grad

In [None]:
2 * 42 + 3 * 42**2  # Yep, it's correct.

Now you know the basics. And this is enough for training very standard models on PyTorch.

Nevertheless, the design principles behind the PyTorch `autograd` package are not always as straightforward. For instance, what do you think will happen executing `backward()` a second time?





In [None]:
# f.backward()  # try!


To fully understand the world of the `Autograd` package we must go deeply down the rabbit hole.

You do not need to grasp everything we are about to cover on a first pass. What follows includes explanations of advanced concepts and some PyTorch internals, which are usually not needed but can be useful (e.g. when debugging or writing complex implementations).

Feel free to refer back to this notebook when needed!

### `Autograd` aggressive buffer freeing

#### The second backward
So, what was the problem with the second backward?

When we computed the first `backward()`, the intermediate variables needed for the computation of $f$, as well as its gradient, were freed to save memory. So PyTorch does not have the necessary information to do backward from $f$ a second time.

`Autograd` has an aggressive buffer freeing policy to be very memory efficient!



If you want to prevent this, you can use `.backward(retain_graph=True)`.

Let's redo from scratch the previous computation:

In [None]:
x = torch.tensor(42., requires_grad=True)
x2 = x ** 2
x3 = x ** 3
f = x2 + x3
f.backward(retain_graph=True)
f.backward()

So we did backward two times. Let's check again the gradient of $x$:

In [None]:
x.grad

It's doubled!

The reason is that `Autograd` keeps accumulating into the `grad` attribute. This means that multiple `backward()` calls **will sum up previously computed gradients** if they are not explicitly zeroed out.

#### Intermediate gradients are not kept by default
Intermediate gradients are other victims of PyTorch's aggressive buffer freeing policy.

We do not have access to the gradient with respect to $x_2$, even if we actually computed it to calculate the one with respect to $x$.

In [None]:
x2.requires_grad  # we *require* the grad w.r.t. x2, in order to compute the one w.r.t. x...

In [None]:
x2.grad is None  # ...but we had asked Pytorch to only compute the gradient w.r.t. x, so the one wrt x2 is not maintained in memory!

Did you read the user warning up there? That should already give you an intuition of what a _leaf_ tensor is 🍃!

### Sick of being tracked? 🍪

You can call `detach()` to **remove a tensor from the computational graph**. This means that the tensor will _not_ be used for computing the gradient and will not take part in the chain rule.

We saw one example in the previous notebook, where we were implementing gradient descent by ourselves and we didn't want to compute gradients of the descent steps! Another classical example is when you run a trained model just for inference, which means you already know you won't call `backward()` at all.

In [None]:
x = torch.tensor(42., requires_grad=True)
x2 = x ** 2
x2sig = x2
print(x2sig.requires_grad)
x2nog = x2.detach()
print(x2nog.requires_grad)

Of course, if a tensor is `detach()`ed, a gradient won't be computed for it and thus `requires_grad` will be `False`.

As a "blanket solution", you can also wrap the code block in a context `with torch.no_grad()`. This is equivalent to calling `detach()` everywhere:

In [None]:
x = torch.tensor(42., requires_grad=True)
x2 = x ** 2
print(x2.requires_grad)
with torch.no_grad():
    x2nog = x ** 2
    x3nog = (x2 + 7) ** 3
    print(x2nog.requires_grad)
    print(x3nog.requires_grad)

`.no_grad()` is particularly useful for inference, when you are certain that you won't ever call `.backward()`.

Clearly, you won't be able to backpropagate through a detached tensor because it was removed from the graph:

In [None]:
try:
  x2nog.sum().backward()
except Exception as e:
  print(e)

In [None]:
# backward() still works for the tensor that we didn't detach:
x2sig.sum().backward()

### Tensors 🎲



``torch.Tensor`` is the central class of the `autograd` package.


In order to understand in detail how autograd works, it is necessary to dissect some of the most relevant attributes of the Tensors:

---

- **`data`**:

It is the data stored in the tensor. Usually you do not need to access directly this attribute.

In [None]:
t = torch.rand(4, 4)
t.data


---

- **`requires_grad`**:

  - If `True`, the gradient with respect to this tensor will be computed.
  - If `True` and the tensor is a leaf (more on this later!), the gradient will also be saved in the `.grad` attribute.
  - If `False`, the gradient with respect to this tensor will _not_ be computed.

In [None]:
x = torch.tensor(42., requires_grad=True)
x2 = x ** 2
x3 = x ** 3
f = x2 + x3

# Print explicitly: a bare expression is only displayed by the notebook when it
# is the last statement of the cell, and here `f.backward()` follows it.
print(x.requires_grad, x2.requires_grad, x3.requires_grad, f.requires_grad)

f.backward()

Note how all the tensors involved in the computation above have `requires_grad=True`. This means that a gradient will be computed for all of them; these intermediate gradients are all needed by the chain rule, when we will compute `f.backward()`!

You can't force any of the intermediate tensors to _not_ have their gradient computed, because this would break the computation of the entire gradient from `f` back to `x`.

In [None]:
x = torch.tensor(42., requires_grad=True)
x2 = x ** 2
x3 = x ** 3
try:
  x3.requires_grad = False
except RuntimeError as e:
  print(f"Error: {e}")
f = x2 + x3
f.backward()

---

- **`grad`**:

This attribute is `None` by default; it actually becomes a Tensor when `backward()` is called. The attribute will then contain the computed gradient, and future calls to `backward()` will accumulate (add) gradients into it. Only the leaf nodes of the computational graph with `requires_grad=True` will have the `grad` attribute populated.




---

- **`grad_fn`**:

The backward function that `autograd` will use to compute the gradient. For example, if we sum two tensors during the forward pass, then the `grad_fn` attribute of the result will indicate that it was created as a result of an addition operation.

In [None]:
t3 = x + x2
t3.grad_fn

When we call `backward()` on a tensor, PyTorch will traverse the computational graph from the tensor backward to its inputs, using these `grad_fn` functions to calculate gradients along the way.


---

- **`is_leaf`**: a boolean.

🍃 **Only *leaf* tensors with `requires_grad=True` will have their `grad` populated during a call to `backward()`**. To get `grad` populated for non-leaf tensors, you can use `retain_grad()`.
Keep in mind that:
  - All tensors that have `requires_grad=False` will be leaf tensors by default.
  - For tensors that have `requires_grad=True`, they will be leaf tensors if their `grad_fn` is `None`. This means that they are not the result of an operation of tracked tensors, but rather they were created directly by the user.

**NOTE:** Make sure you are on a GPU runtime before running the following.

In [None]:
a = torch.rand(10, requires_grad=True)
a.is_leaf, a.requires_grad

In [None]:
a = torch.rand(10, requires_grad=True) + 2
a.is_leaf, a.requires_grad  # was created by the addition operation

In [None]:
a = torch.rand(10, requires_grad=True, device="cuda")
a.is_leaf, a.requires_grad  # requires grad, directly created by the user

In [None]:
a = torch.rand(10).cuda()
a.is_leaf, a.requires_grad  # requires_grad=False, thus it is a leaf by default

In [None]:
a = torch.rand(10, requires_grad=True).cuda()
a.is_leaf, a.requires_grad  # Was created by the operation that casts a cpu tensor into a cuda tensor.
                            # Since we are moving a cpu tensor that requires gradients, this is creating a new version of the tensor in GPU.
                            # Therefore 'a' is not a leaf, but the cpu tensor was.

In [None]:
a = torch.rand(10).cuda().requires_grad_()  # Here we move a cpu tensor that does not require gradients, so it stays a leaf, and then modify it.
a.is_leaf, a.requires_grad  # requires gradients and has `grad_fn=None`

In [None]:
a = torch.rand(10, requires_grad=True, device="cuda")
b = a + 2                          # non leaf, since requires grad and it is produced by an operation
print(b.is_leaf, b.requires_grad)
c = b.detach()                     # leaf, it has been detached and now has requires_grad=False
print(c.is_leaf, c.requires_grad)

---

- **`backward()`**:

Computes the gradient of current tensor w.r.t. computational graph leaves.

> 🧠 **MEMO**: Remember, the graph is created on the fly during the forward pass, as operations are performed on tensors. When you call `backward()` on the final tensor (usually the rank-0 tensor representing the loss value), Pytorch traverses the computational graph back to the leaf tensors (usually the network parameters), calculating the gradient with respect to them and storing it in their `.grad` attribute.

> ### Leaves recap
>
> Let's recap the answer to the following question:
>
> *What are the nodes that will have the `.grad` attribute populated?*
>
> Here's a computational graph:
>
> ![](https://raw.githubusercontent.com/erodola/DLAI-s2-2021/main/labs/05/pics/leaves.svg)
>
> 1. Take the subgraph of nodes with `requires_grad=True` *(green and blue nodes)*
> 2. Take the leaves of this subgraph *(green nodes)*
>
> The nodes selected with this procedure *(green nodes)* will have their `.grad` attribute populated.

### Gradients

Let's look at one last example.

Create a tensor and set ``requires_grad=True`` to track operations:

In [None]:
x = torch.ones(2, 2, requires_grad=True)
x

Do some operation:


In [None]:
y = x + 2
y

``y`` was created as a result of a tracked operation, so it has a ``grad_fn``:



In [None]:
y.grad_fn

Do more operations on `y`:

In [None]:
z = y * y * 3
out = z.mean()

print(z, out)

In [None]:
out.backward()

With this operation we computed $\frac{\partial \, \text{out}}{\partial \, x}$ as well as all the intermediate partial derivatives, but the only one we can actually read is $\frac{\partial \, \text{out}}{\partial \, x}$:


In [None]:
x.grad

Let's double-check why `x.grad` is a `2x2` tensor full of $4.5$.

The output is defined as:

$$ \mathrm{out} = \frac{1}{4} \sum_i 3(x_i + 2)^2 \: \text{ with } x_i = 1 \, \forall i$$

We have the partial derivatives:

$$
\frac{\partial \mathrm{out}}{\partial x_i}
= \frac{3 \times 2}{4} (x_i + 2)
= \frac{3}{2} (x_i + 2)
$$

*(Note: the terms of the sum that involve $x_j$ with $j \neq i$ do not depend on $x_i$, so their derivative with respect to $x_i$ is zero)*


Thus, since $x_i=1$ for all $i$ in the input, we obtain $\frac{\partial \mathrm{out}}{\partial x_i} = \frac{9}{2} = 4.5$.

##### **EXERCISE**
> Understanding if a tensor is a leaf or not is surprisingly tricky, but it is very important to be able to distinguish leaf tensors: **only leaf tensors with `requires_grad=True` will have the `grad` attribute populated**. The leaves will be the parameters of our neural networks.
>
> Consider the two following scenarios and try to understand if `a.grad` and/or `b.grad` will be populated.
>
> **Scenario 1**
>
> ```python
> a = torch.randn(2, 2, requires_grad=True)
> b = a ** 2                                
> b.requires_grad_(True)                    
> b.sum().backward()                        
> ```
> - [ ] `a.grad` is populated (it is not `None`)
> - [ ] `b.grad` is populated (it is not `None`)
>
>
> **Scenario 2**
>
> ```python
> a = torch.randn(2, 2, requires_grad=False)
> b = a ** 2                                
> b.requires_grad_(True)                    
> b.sum().backward()                        
> ```
> - [ ] `a.grad` is populated (it is not `None`)
> - [ ] `b.grad` is populated (it is not `None`)

In [None]:
# @title Solution 👀

if False:  # Change to True to enable the prints
  # 1)
  a = torch.randn(2, 2, requires_grad=True)  # leaf tensor that requires grad

  b = a ** 2                                 # non-leaf tensor: requires grad and is produced by an op
  b.requires_grad_(True)                     # no effect: it already requires grad!

  print(f'a.is_leaf: {a.is_leaf} \t a.requires_grad: {a.requires_grad}  \t a.grad_fn: {a.grad_fn}')
  print(f'b.is_leaf: {b.is_leaf} \t b.requires_grad: {b.requires_grad}  \t b.grad_fn: {b.grad_fn}')

  b.sum().backward()                         # just a sample backprop

  print("\nGradients:")
  print(f'a.grad: {a.grad}')                 # a is a leaf that requires grad, thus it will have .grad
  print(f'b.grad: {b.grad}')                 # b is not a leaf, thus it will not have .grad

  print('\n\n---\n\n')

  # 2)
  a = torch.randn(2, 2, requires_grad=False) # leaf tensor that does not require grad

  b = a ** 2                                 # leaf tensor, because it does not require grad
  b.requires_grad_(True)                     # now it requires grad and has grad_fn=None! It is a leaf

  print(f'a.is_leaf: {a.is_leaf} \t a.requires_grad: {a.requires_grad}  \t a.grad_fn: {a.grad_fn}')
  print(f'b.is_leaf: {b.is_leaf} \t b.requires_grad: {b.requires_grad}  \t b.grad_fn: {b.grad_fn}')

  b.sum().backward()                         # just a sample backprop

  print("\nGradients:")
  print(f'a.grad: {a.grad}')                 # a is a leaf but does not require grad, thus it will not have .grad
  print(f'b.grad: {b.grad}')                 # b is a leaf and requires grad, thus it will have .grad

  print('\n\n---\n\n')

##### **EXERCISE**
>
> Consider the following expression:
>
> $$ z = \frac{\sqrt{x^2 +1} - \sqrt{y - 1}}{\sqrt{x^2 + y^2}} + \sqrt{y - 1} $$
>
> Compute the gradients $\frac{\partial z}{\partial x}$, $\frac{\partial z}{\partial y}$, $\frac{\partial z}{\partial \sqrt{x^2 +1}}$ and $\frac{\partial z}{\partial \sqrt{y-1}}$ at $x=2$, $y=10$

In [None]:
# Expected results, respectively:
# x.grad: 0.08914636820554733
# y.grad: 0.15752650797367096
# x3.grad: 0.0980580672621727
# y2.grad: 0.9019419550895691

# ✏️ your solution here


### Autograd Mechanics 🧑‍🔧



#### Custom `Function` 📖

Remember this example?


In [None]:
t = torch.rand(4, 4, requires_grad=True)
t2 = torch.rand(4, 4)

t3 = t + t2
t3.grad_fn

That `AddBackward0` is an object of the `Function` class. It indicates that `t3` was created by a sum operation, but not only! Together with the `Tensor` class, `Function` makes up the graph that encodes a complete history of computation.

All mathematical operations in PyTorch are implemented as objects of the `torch.autograd.Function` class.

📜 **Story time**

Once upon a time, we needed to backpropagate through the operation `lambda = eig(X)`, which computes the eigenvalues of a matrix `X`. But the `eig()` operation was not a `Function`! 😱

So we implemented our own `Function` and defeated the evil derivative.

**Good ending!** Our heroes make their way directly into the sun 🌅.


Our heroes had to implement these two methods:

- `forward()`: the code that performs the operation. It can take as many arguments as you want. All Python objects are accepted as input. _Any input of the `Tensor` type should be explicitly `detach()`ed inside the `forward()` call, so that whatever happens inside the function will not affect the computational graph_; recall that we are going to manually implement the gradient anyway! You can return either a single `Tensor` or a tuple of `Tensor`. Refer to the docs of `Function` to find descriptions of useful methods that can be called only from `forward()`.

- `backward()`: gradient formula. The size of its input matches the size of `forward()`'s output. It should return as many `Tensor` s as there were inputs in `forward()`, with each of them containing the gradient w.r.t. its corresponding input. If your inputs didn't require a gradient (`needs_input_grad`, in the `ctx` argument, is a tuple of booleans indicating whether each input needs gradient computation), or were non-Tensor objects, you can return `None`. Also, if you have optional arguments to `forward()` you can return more gradients than there were inputs, as long as they're all `None`.

Confused? Let's see an example.


We are going to implement our own ReLU from scratch.

$$f(x) = \max \{0, x \} $$

The _forward_ pass is easy to implement: just write the operation above, and return the result. We'll also need the value of $x$ for computing the derivative $\frac{\partial f}{\partial x}$, so `forward()` must save $x$ for later use.

The _backward_ pass is a bit more tricky. Reverse-mode autodiff requires us to compute the _derivative of the **loss** with respect to $x$_:

$$ {\color{blue}{\frac{\partial\ell}{\partial x}}} = {\color{green}{\frac{\partial \ell}{\partial f}}} {\color{red}{\frac{\partial f}{\partial x}}} $$

In particular, `backward()` will receive ${\color{green}{\frac{\partial \ell}{\partial f}}}$ as input, and must produce ${\color{blue}{\frac{\partial\ell}{\partial x}}}$ in the output. All we must do is compute the portion:

$${\color{red}{\frac{\partial f}{ \partial x}}} =  \begin{cases} 1 & \text{if } x > 0\\ 0 & \text{if } x \le 0 \end{cases}$$

and simply output the product ${\color{green}{\frac{\partial \ell}{\partial f}}} {\color{red}{\frac{\partial f}{\partial x}}}$. Note how, as promised, we are also using $x$ for this calculation.



In [None]:
class MyReLU(torch.autograd.Function):
    """
    A hand-written ReLU: custom autograd Functions are created by subclassing
    torch.autograd.Function and providing static forward and backward methods
    that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, x):
        """
        Compute max(0, x) element-wise.

        `ctx` is the context object shared with `backward`; the input is
        stashed in it through `ctx.save_for_backward`, because the derivative
        of the ReLU depends on the sign of the input.
        """
        ctx.save_for_backward(x)

        # Whatever happens here may even live outside of PyTorch (like playing a Mario🥸 level
        # and recording the final score); we keep it simple and clip a NumPy copy of the input.
        # detach() keeps these operations out of the autograd graph.
        clipped = x.detach().cpu().numpy().clip(min=0)

        return torch.tensor(clipped, dtype=x.dtype, device=x.device)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Receive the gradient of the loss with respect to the output and return
        the gradient of the loss with respect to the input.
        """
        (saved_x,) = ctx.saved_tensors  # a tuple with a single element

        # Local derivative of the ReLU: 1 where the input was positive, 0 elsewhere
        local_grad = (saved_x > 0).to(grad_output.dtype)

        # Chain rule: local derivative times the incoming gradient
        return local_grad * grad_output

myrelu = MyReLU.apply  # not really needed, but useful to have an alias for future use

Let's test this out:

In [None]:
x = torch.rand(50, requires_grad=True)

In [None]:
out = myrelu(x - 0.5)
print(out)  # grad_fn=<MyReLUBackward>
out.sum().backward()
x.grad

In [None]:
x.grad.zero_()  # reset the accumulated gradient in place; usually you should not use this method

# -> Let's check our implementation against torch.relu
out = torch.relu(x - 0.5)
print(out)  # grad_fn is now the built-in ReLU backward node (e.g. <ReluBackward0>), not <MyReLUBackward>
out.sum().backward()
x.grad      # Negative numbers get zeroed, and their grad is zero

> **EXERCISE**
>
> Implement your own "ReCU", defined as:
>
> $$ f(x) = \max \{0, x^3\} $$
>
> Write the `forward()` and `backward()` functions, and test them out.

In [None]:
# your solution here ✏️


In [None]:
# @title 👀 Solution

class MyReCU(torch.autograd.Function):
    """Custom autograd Function implementing "ReCU": f(x) = max(0, x^3)."""

    @staticmethod
    def forward(ctx, x):
        """Compute max(0, x^3) element-wise and save the input for the backward pass."""
        ctx.save_for_backward(x)

        # detach() keeps the NumPy computation out of the autograd graph
        cubed = x.detach().cpu().numpy() ** 3
        clipped = cubed.clip(min=0)

        return torch.tensor(clipped, dtype=x.dtype, device=x.device)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the incoming gradient times 3x^2 where x > 0, and zero elsewhere."""
        (saved_x,) = ctx.saved_tensors
        positive_mask = (saved_x > 0).float()
        # grad_output is never modified in place, so no clone is needed
        return grad_output * 3 * (saved_x ** 2) * positive_mask

myrecu = MyReCU.apply

# testing

x = torch.rand(50, requires_grad=True)

out = myrecu(10 * x - 5)
print(out)

out.sum().backward()
x.grad

#### Excluding subgraphs from backward

The `requires_grad` flag allows for fine-grained exclusion of subgraphs from gradient computation and can increase efficiency. As a reminder, if any input tensor of an operation has `requires_grad=True`, the output tensor automatically gets `requires_grad=True` as well.

In [None]:
x = torch.randn(5, 5)  # requires_grad=False by default
y = torch.randn(5, 5)  # requires_grad=False by default
z = torch.randn((5, 5), requires_grad=True)

a = x + y
b = a + z

a.requires_grad, b.requires_grad

Explicitly setting certain tensors to `requires_grad=False` is useful when you want to **❄️ freeze a subset of parameters of your model** so they are not updated during training. This would be done, for instance, to **finetune** the last layer of a pretrained CNN: simply set `requires_grad=False` for all the parameter tensors except the ones in the last layer.

Let's do it:

In [None]:
import torchvision

# Load a ResNet-18 with pretrained weights (no need to understand this right now).
# The `weights=` argument replaces the deprecated `pretrained=True` flag.
model = torchvision.models.resnet18(
    weights=torchvision.models.ResNet18_Weights.DEFAULT
)

In [None]:
# compute some random prediction from this pretrained network
random_prediction = model(torch.rand(2, 3, 224, 224))

# dummy loss, just to get some gradients
f = random_prediction.sum()

# compute the gradients
f.backward()

The model's parameters, together with the gradient of `f` with respect to them, are stored in...

In [None]:
model.parameters()

For example, we can look for all the parameters having a nonzero gradient (based on our dummy loss function):

In [None]:
# A parameter that took no part in a backward pass has `grad=None`,
# so check for that before inspecting the gradient values
grads = [p.grad for p in model.parameters() if p.grad is not None and p.grad.bool().any()]
len(grads)

Let's now freeze the pretrained model except for the last layer:

In [None]:
# Clear the previous gradients to avoid undue accumulation later
model.zero_grad()

# Freeze the pretrained model
for param in model.parameters():
    param.requires_grad = False  # you can do this, because they are all leaves!

# Replace the last fully-connected layer
# These parameters have requires_grad=True by default
model.fc = nn.Linear(512, 100)

# Configure an optimizer for the last layer only.
# NOTE: we don't actually optimize, this is just to show you how we would setup the training.
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)

# Let's compute some gradients
random_prediction = model(torch.rand(2, 3, 224, 224))
f = random_prediction.sum()
f.backward()

# Return all grads different than all zeros
grads = list(x.grad for x in model.parameters() if x.grad is not None and x.grad.bool().any())
len(grads)

#### How autograd encodes the history 📖

Internally, autograd holds a computational graph of `Function` objects, which can be `apply()`-ed to evaluate the result while traversing the graph. In the forward pass, autograd simultaneously evaluates the functions and populates the graph with objects that will compute the partial derivatives (the `grad_fn` attribute of each `torch.Tensor` is an entry point into this graph). When the forward pass is completed, we traverse this graph in the backward pass to actually _evaluate_ the gradients.

An important thing to note is that **the graph is recreated from scratch at every iteration**, allowing to use arbitrary Python control flow statements, that can change the overall shape and size of the graph at every iteration. You don’t have to encode all possible paths before you launch the training - what you run is what you differentiate.

In [None]:
x = torch.arange(5, dtype=torch.float, requires_grad=True)
y = x - x
v = y ** 2
z = v + 2
z.grad_fn

In [None]:
z.grad_fn.next_functions  # The computational graph is encoded in the grad_fn attributes

In [None]:
z.grad_fn.next_functions[0][0].next_functions  # If we want, we can manually traverse it

In [None]:
z.grad_fn.next_functions[0][0].next_functions[0][0].next_functions  # Leaf tensors have private functions to accumulate grads

In [None]:
# After the leaf tensors, we don't compute anything!
# Even if there are other leaf tensors in the computational graph behind those (that do not require a grad)
z.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0].next_functions


#### In-place operations with autograd

From our discussion so far, you might suppose that in-place operations on Pytorch tensors can potentially **overwrite values required to compute gradients**. This is true: with an in-place operation, we may break the backpropagation mechanism.

Here's an example:

```python
x = torch.rand(5, requires_grad=True)
y = x * 2
y.add_(torch.sqrt(y * x))
```

What happens to the internal attributes of `y` as we keep overwriting it?

Each in-place operation actually rewrites the computational graph. This can be tricky, especially if there are many `Tensors` that reference the same storage (e.g. created by indexing or transposing), and in-place functions will actually raise an error if the storage of modified inputs is referenced by any other `Tensor`. In contrast, **out-of-place versions simply allocate new objects and keep references** to the old graph.

##### In-place correctness checks 📖

Every tensor keeps a _version counter_, incremented each time the tensor is marked as "dirty" by an in-place operation. When a `Function` uses `save_for_backward()` to save references of any tensors for its backward pass, a version counter of their containing `Tensor` is saved as well. Once you access `self.saved_tensors`, the version is checked. If it is greater than the saved value, an error is raised. This ensures that if you’re using in-place functions and not seeing any errors, you can be sure that the computed gradients are correct.

In [None]:
x = torch.rand(10, requires_grad=True)
o = x * 10
o.retain_grad()
o2 = o + 10
o2.retain_grad()
y = torch.rand(10)

In [None]:
o._version  # the version counter is initialized to zero

In [None]:
o.add_(-1)  # dirty modify, increase the version counter
o._version

In [None]:
z = x + y  # it does not modify x in place
x._version

In [None]:
x = x + x  # x is a new tensor
x._version

In [None]:
# 😈 Let's break autodiff with in-place operations

try:
  x = torch.ones(5, requires_grad=True)
  x2 = (x + 1).sqrt()
  z = (x2 - 10)
  x2[0] = -1
  z.sum().backward()
except Exception as e:
  print(e)

References:

- [PyTorch docs](https://pytorch.org/docs/stable/index.html)
- [Autograd tutorial](https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html)
- [Autograd mechanics](https://pytorch.org/docs/stable/notes/autograd.html)
- [Extending PyTorch](https://pytorch.org/docs/stable/notes/extending.html)
- Nice [blogpost](https://blog.paperspace.com/pytorch-101-understanding-graphs-and-automatic-differentiation/)
- Nice [blogpost](https://towardsdatascience.com/pytorch-autograd-understanding-the-heart-of-pytorchs-magic-2686cd94ec95) number  two


## JAX: functional differentiation 📖

The approach used by PyTorch or Tensorflow is essentially the same, apart from small differences in how they handle the computational graph. However, _there are alternatives_.

We will not introduce [**Jax**](https://jax.readthedocs.io/en/latest/) in these notebooks, but it deserves an honorable mention because it employs a fundamentally different technique to perform autodiff:

![](https://sjmielke.com/images/blog/jax-purify/banner.png)

# The `torch.nn` package



PyTorch provides the elegantly designed modules and classes
[`torch.nn`](https://pytorch.org/docs/stable/nn.html),
[`torch.optim`](https://pytorch.org/docs/stable/optim.html),
[`Dataset`](https://pytorch.org/docs/stable/data.html?highlight=dataset#torch.utils.data.Dataset),
and [`DataLoader`](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader)
to help you create and train neural networks.
You have already seen how to use `torch.optim`, `Dataset` and `DataLoader`. In this section we will review all these classes together with the new `torch.nn` package to understand how they work together to simplify our life.

To develop this understanding, we will first train a basic neural net on the MNIST dataset _without_ using any of these modules: we will just use the most basic PyTorch tensor functionality.

---

Our final goal is to reach an elegant, general structure suitable for most problems and models with minor tweaks:

```python
# load data
# instantiate model
# instantiate optimizer

# for each epoch:
  # train the model on the training set
  # evaluate the model on one or more evaluation sets
  # log metrics (e.g. loss, accuracy)
```


### Classifying *all* handwritten digits


Last time we used MNIST, we considered the binary classification of ones and sevens. This time we'll consider all classes, leading to a _multinomial_ logistic regression problem. This is also known as _softmax regression_.

#### One-hot encodings

We will represent the class information using the *one-hot* representation (aka indicator vectors):

$$0 = (1,0,0,0,0,0,0,0,0,0) $$
$$1 = (0,1,0,0,0,0,0,0,0,0) $$
$$...$$
$$9 = (0,0,0,0,0,0,0,0,0,1) $$

The output of our model will thus be a 10-dimensional vector, which we can interpret as a probability distribution. For example:

$$(0,0,0.98,0.01,0,0,0,0,0.01,0) $$

In this example, the model predicts that the given input is a 2 with 98\% probability, a 3 with 1% probability, and an 8 with 1%.

#### Softmax

In standard logistic regression, the predicted value $p=\sigma(\mathbf{w}^\top \mathbf{x}+b)$ was easy
to interpret as a probability, thanks to the squashing between $[0,1]$ performed by the sigmoid $\sigma()$.

What is the analogue of the sigmoid for more than one dimension? It's called _softmax_:

$$\text{softmax}(\mathbf{x}) = \left( \frac{\exp(x_0)}{\sum_{j}\exp(x_j)}, \frac{\exp(x_1)}{\sum_{j}\exp(x_j)}, \dots , \frac{\exp(x_9)}{\sum_{j}\exp(x_j)} \right)$$

By construction, $\text{softmax}(\mathbf{x})$ sums up to one.

If we use $\text{softmax}$ instead of $\sigma$, we have a multinomial logistic model yielding a distribution $\mathbf{p}$ instead of a single value $p$. However, as we'll explain below, we can output $\log(\mathbf{p})$ instead of $\mathbf{p}$ to make things simpler, and we can also generalize the cross-entropy loss to the multinomial case.

Let's start by gaining an intuition of how softmax behaves; we'll do this by considering $\exp(\alpha x)$ with different $\alpha$:

In [None]:
x = torch.rand(40)

In [None]:
# @title Softmax: crank up the alpha!  { run: "auto" }

import plotly.graph_objects as go

alpha = 47  #@param {type:"slider", min:1, max:50, step:1}

sx = torch.exp(alpha*x)
sx /= sx.sum()

fig = go.Figure()
# fig.add_trace(go.Bar(y=x, name='x', marker_color='blue'))
fig.add_trace(go.Bar(y=sx, name='sx', marker_color='red'))
fig.update_layout(barmode='group', title='Softmax of a random vector', width=800, height=300)
fig.show()

Do you get an idea of why it's called soft**max**? One useful way to think about the softmax is as a smooth approximation of the indicator function.

Now let's prepare for our first neural model. For this example, *we will avoid the use of the ready-made MNIST dataset in the `torchvision` package* but we will manually build it, to understand how to adapt the concepts on other datasets not available in `torchvision`.

This MNIST dataset is in numpy array format.



In [None]:
!wget https://s3.amazonaws.com/img-datasets/mnist.npz

def load_data_impl(path='mnist.npz'):
    """Load the MNIST dataset from a NumPy `.npz` archive.

    :param path: location of the archive; it must contain the arrays
        `x_train`, `y_train`, `x_test` and `y_test`
    :return: `((x_train, y_train), (x_test, y_test))`, where the images are
        flattened to rows of length 784 and cast to float32, and the labels
        are returned as stored in the archive
    """
    # file retrieved by:
    #   wget https://s3.amazonaws.com/img-datasets/mnist.npz -O code/dlgo/nn/mnist.npz
    # code based on:
    #   site-packages/keras/datasets/mnist.py

    # The context manager closes the archive even if a key is missing
    # or a reshape fails
    with np.load(path) as f:
        x_train, y_train = f['x_train'].reshape(-1, 784), f['y_train']
        x_test, y_test = f['x_test'].reshape(-1, 784), f['y_test']
    return (x_train.astype(np.float32), y_train), (x_test.astype(np.float32), y_test)

In [None]:
(x_train, y_train), (x_valid, y_valid) = load_data_impl()

> **EXERCISE**: We almost always want to normalize datasets, i.e. we _subtract_ their mean, and _scale_ by their standard deviation. Compute the mean and standard deviation of the MNIST dataset and then normalize it.

In [None]:
# Normalization with pre-computed values

x_train = (x_train / 255 - 0.13) / 0.3  # data normalization
x_valid = (x_valid / 255 - 0.13) / 0.3

Each image is `28 x 28`, and is being stored as a flattened row of length
`784 (=28x28)`. Let's take a look at one; we need to reshape it to 2d
first.



In [None]:
import plotly.express as px
import numpy as np

print(x_train.shape)
px.imshow(x_train[0].reshape((28, 28)), color_continuous_scale='gray')

PyTorch uses ``torch.tensor``, rather than numpy arrays, so we need to
convert our data:

In [None]:
import torch

x_train, y_train, x_valid, y_valid = map(
  torch.tensor, (x_train, y_train, x_valid, y_valid)
)
n, c = x_train.shape
y_train = y_train.long()  # pytorch wants int64 as indices
y_valid = y_valid.long()
print(x_train, y_train)
print(x_train.shape)
print(y_train.min(), y_train.max())

### Neural net from scratch (no torch.nn)

Our first neural model is built using nothing but PyTorch tensor operations.

For the weights, we set `requires_grad` **after** the initialization, since we don't want the initialization function to be included in the gradient computation. (Remember that a trailing `_` in PyTorch means that the operation is performed in-place.)

We are initializing the weights with a simplified version of
[Xavier initialization](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf), i.e. by multiplying with $\frac{1}{\sqrt{n}}$


In [None]:
import math

weights = torch.randn(784, 10) / math.sqrt(784)  # Xavier init.
weights.requires_grad_()                         # Start to track the weights
bias = torch.zeros(10, requires_grad=True)       # Initialize the bias with zeros

#### Model

How did we choose the shape of the `weights` tensor? Let's recall the good old binary logistic regression model:

$$ p = \sigma(\mathbf{w}^\top \mathbf{x}_i + b) $$

where $\mathbf{x}_i$ is a flattened image and $\mathbf{w}$ is a vector of weights parametrizing our model. This means that we have one weight per pixel, in total $28\cdot 28=784$ weights.

In the multinomial case, we now have:

$$ \mathbf{p} = \text{softmax} (\mathbf{W}^\top \mathbf{x}_i + \mathbf{b}) \,.$$

Since $\mathbf{p}$ represents the predicted probabilities over 10 classes, matrix $\mathbf{W}$ must be $784 \times 10$.

In fact, a more numerically stable approach is to ask our model to output the log-probabilities instead of the probabilities, namely:

\begin{align}
\log(\mathbf{p}) &= \log(\text{softmax} (\mathbf{W}^\top \mathbf{x}_i + \mathbf{b}) )\\
&=(\mathbf{W}^\top \mathbf{x}_i + \mathbf{b})- \log \Big( \sum_{j=0}^9 \exp\big( (\mathbf{W}^\top \mathbf{x}_i + \mathbf{b})_j \big) \Big)
\end{align}

Seems complex, but it's easy! Here's our implementation of the model:

In [None]:
def log_softmax(x):
  """Numerically stable log-softmax along the last dimension.

  :param x: tensor of unnormalized scores (logits), classes on the last axis
  :return: tensor of the same shape holding the log-probabilities
  """
  # Log-sum-exp trick: subtracting the row-wise maximum does not change the
  # result, but keeps exp() from overflowing to inf when the logits are large.
  # The shift is detached because its contribution to the gradient cancels out.
  shifted = x - x.max(dim=-1, keepdim=True).values.detach()
  return shifted - shifted.exp().sum(-1, keepdim=True).log()

def model(xb):
  """Return the log-probabilities predicted for the mini-batch `xb`.

  Uses the notebook-level `weights` and `bias` tensors.
  """
  logits = xb @ weights + bias
  return log_softmax(logits)

The `model()` function will take a mini-batch of data as input. Let's test it:

In [None]:
bs = 64  # batch size (number of images)

xb = x_train[0:bs]  # a mini-batch from x
preds = model(xb)  # log predictions
preds[0], preds.shape
print(torch.exp(preds[0]), preds.shape)

#### Loss and accuracy

Of course, we don't expect these predictions to be any better than random, since we are still using our model with random weights. You know what we must do: **define a loss**, and update the model weights iteratively to minimize it.

> **EXERCISE**: The loss.
>
> Implement the **negative log-likelihood** (NLL) loss, given a batch of log probabilities (`input`) and the corresponding ground-truth labels (`target`).
>
> Mathematically, for an image $\mathbf{x}_i$ with true class label $c_i$, this loss is defined as:
>
> $$ \ell(\mathbf{x}_i) = -\log (p_{c_i}) $$
>
> where $p_{c_i}$ is the model's predicted probability that $\mathbf{x}_i$ belongs to class $c_i$.
>
> The loss makes sense: higher probabilities for the correct class yield lower losses.
>
> For a batch of $N$ images, simply average their NLL loss to make it independent from the batch size:
>
> $$ \ell = - \frac{1}{N} \sum_{i=1}^{N} \log (p_{c_i}) $$

In [None]:
def nll(input, target):
    """Negative log-likelihood loss (exercise stub, to be completed).

    :param input: batch of log-probabilities
    :param target: the corresponding ground-truth labels
    :return: the loss, to be computed by your code
    """
    # your code here ✏️  -- can you do it without a for loop?
    return loss

In [None]:
# @title Solution 👀

def nll(input, target):
    """Negative log-likelihood, averaged over the batch.

    :param input: batch of log-probabilities, one row per sample
    :param target: ground-truth class index of each sample
    :return: scalar tensor with the mean of -log p(true class)
    """
    # Pick, for every row, the log-probability stored at the true-class column
    class_column = target.unsqueeze(1)
    true_class_log_probs = input.gather(1, class_column)
    return -true_class_log_probs.mean()

loss_func = nll

Let's check the loss of our random model, so we can see if we improve
after a backprop pass later.


In [None]:
yb = y_train[0:bs]
print(loss_func(preds, yb))

Let's also implement a function to calculate the **accuracy** of our model. For each prediction, if the largest probability is at the same index as the ground-truth target, then the prediction was correct:

In [None]:
def accuracy(out, yb):
  """Fraction of samples whose highest-scoring class matches the label.

  :param out: model outputs, one row of class scores per sample
  :param yb: ground-truth class indices
  :return: scalar float tensor in [0, 1]
  """
  predicted_classes = out.argmax(dim=1)
  hits = predicted_classes == yb
  return hits.float().mean()

Let's check the accuracy of our random model, so we can see if our
accuracy improves as our loss improves:

In [None]:
print(accuracy(preds, yb))

> **EXERCISE**: What is the maximum possible accuracy?

#### Training

We can now run a training loop.  For each iteration, we will:

- select a mini-batch of data (of size ``bs``)
- use the model to make predictions
- calculate the loss
- ``loss.backward()`` computes the gradient of the loss w.r.t. `weights` and `bias`.
- use the gradients to update weights and bias

When we update the weights and bias, we must remember to use the `torch.no_grad()` context manager, so that the update steps themselves are not tracked for the next calculation of the gradient.

Also, we must set the gradients to zero before the next iteration, otherwise they will be accumulated across all the iterations.

🐞 **Debugging**: Unfortunately, Colab only provides the built-in debugger available in `IPython.core.debugger`. Uncomment ``set_trace()`` below to try it out. [Here](https://nblock.org/2011/11/15/pdb-cheatsheet/) you can find a cheatsheet of the pdb commands.

Alternatively, you can fire up your favorite IDE; the debugger integrated in [PyCharm](https://www.jetbrains.com/pycharm/) is convenient, as the pro license is free for students.

In [None]:
from IPython.core.debugger import set_trace

lr = 0.05  # learning rate
epochs = 5

for epoch in range(epochs):
  for i in range((n - 1) // bs + 1):
    start_i = i * bs
    # set_trace()

    end_i = start_i + bs
    xb = x_train[start_i:end_i]
    yb = y_train[start_i:end_i]
    pred = model(xb)
    loss = loss_func(pred, yb)
    loss.backward()

    with torch.no_grad():
      weights -= weights.grad * lr
      bias -= bias.grad * lr
      weights.grad.zero_()
      bias.grad.zero_()

  print(f"epoch: {epoch}, training loss: {loss:.2f}")

That's it: we've created and trained a minimal neural network with just one layer entirely from scratch! 🎉



### Refactor: use `torch.nn.functional`


We will now refactor our code, so that it does the same thing as before, only
we'll start taking advantage of PyTorch's ``nn`` classes to make it more concise
and flexible.

First step: let's use `torch.nn.functional`'s loss and activation functions. Currently we are using the log-softmax activation, and the negative log-likelihood loss. PyTorch does all in one: `F.cross_entropy`.


In [None]:
import torch.nn.functional as F

loss_func = F.cross_entropy

def model(xb):
  """Return the raw class scores (logits) for the mini-batch `xb`.

  Uses the notebook-level `weights` and `bias` tensors.
  """
  logits = xb @ weights + bias  # we don't explicitly apply log-softmax anymore
  return logits

We can still use the same training loop code from before, where the `model` and `loss_func` have been redefined.

### Refactor: use `nn.Module`

Next up, we'll use ``nn.Module`` and ``nn.Parameter``, for a clearer and more
concise training loop. We subclass ``nn.Module`` to create a class that
holds our weights, bias, and method for the forward step.  ``nn.Module`` has a
number of attributes and methods (such as ``.parameters()`` and ``.zero_grad()``)
which we will be using.

In [None]:
from torch import nn

class Mnist_Logistic(nn.Module):
  """Multinomial logistic regression on flattened images, with hand-made parameters."""

  def __init__(self):
    super().__init__()
    # Wrapping the tensors in nn.Parameter registers them with the module,
    # so they are returned by .parameters()
    initial_weights = torch.randn(784, 10) / math.sqrt(784)
    initial_bias = torch.zeros(10)
    self.weights = nn.Parameter(initial_weights)
    self.bias = nn.Parameter(initial_bias)

  def forward(self, xb):
    """Map a batch of flattened images to one score (logit) per class."""
    logits = xb @ self.weights + self.bias
    return logits

Since we're now using an object instead of just using a function (our old `model()`), we first have to instantiate our model:



In [None]:
model = Mnist_Logistic()

Now we can calculate the loss in the same way as before. Note that
``nn.Module`` objects are used as if they are functions (i.e they are
*callable*), but behind the scenes Pytorch will call our ``forward``
method automatically.

> The `__call__` method of the Modules, internally calls the `forward` method and *does other stuff* (e.g. registers some hooks, you can check the implementation [here](https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module)). Thus, you should always call the forward with `model(inputs)` and never directly `model.forward(inputs)`.

In [None]:
print(loss_func(model(xb), yb))

Previously in our training loop we had to update `weights` and `bias` explicitly and manually zero out the grads, like this:

```python
  with torch.no_grad():
      weights -= weights.grad * lr
      bias -= bias.grad * lr
      weights.grad.zero_()
      bias.grad.zero_()
```

Now we can take advantage of `model.parameters()` and `model.zero_grad()` (which are both defined by PyTorch for ``nn.Module``) to make these steps more concise and less prone to error:

```python
  with torch.no_grad():
      for p in model.parameters(): p -= p.grad * lr
      model.zero_grad()
```

We'll wrap our little training loop in a ``fit`` function so we can run it
again later.



In [None]:
def fit():
  """Train the notebook-level `model` with plain mini-batch gradient descent.

  Reads the globals `epochs`, `n`, `bs`, `lr`, `x_train`, `y_train`,
  `model` and `loss_func`.
  """
  for _ in range(epochs):
    # One start index per mini-batch; the last batch may be smaller than `bs`
    for start_i in range(0, n, bs):
      batch = slice(start_i, start_i + bs)
      xb, yb = x_train[batch], y_train[batch]
      loss = loss_func(model(xb), yb)

      loss.backward()
      # The update itself must not be recorded by autograd
      with torch.no_grad():
        for p in model.parameters():
          p -= p.grad * lr
        # Reset the gradients, otherwise they accumulate across iterations
        model.zero_grad()

fit()

Let's double-check that our loss has gone down:

In [None]:
print(loss_func(model(xb), yb))

### Refactor: use `nn.Linear`



We continue to refactor our code.  Instead of manually defining and
initializing ``self.weights`` and ``self.bias``, and calculating ``xb  @
self.weights + self.bias``, we will instead use the Pytorch class
[`nn.Linear`](https://pytorch.org/docs/stable/nn.html#linear-layers) for a
linear layer, which does all that for us.

Pytorch has many predefined layers that can greatly simplify our code, and often make it faster too.

In [None]:
class Mnist_Logistic(nn.Module):
  """Multinomial logistic regression built on a single `nn.Linear` layer."""

  def __init__(self):
    super().__init__()
    # nn.Linear creates and initializes the weight matrix and the bias for us
    self.lin = nn.Linear(in_features=784, out_features=10)

  def forward(self, xb):
    """Map a batch of flattened images to one score (logit) per class."""
    logits = self.lin(xb)
    return logits

We instantiate our model and calculate the loss in the same way as before:

In [None]:
model = Mnist_Logistic()
print(loss_func(model(xb), yb))

We are still able to use our same ``fit`` method as before:

In [None]:
fit()

print(loss_func(model(xb), yb))

### Refactor: use `torch.optim`

We already explored the `torch.optim` package in the *Logistic Regression and Optimization* notebook: obviously we can use optimizers to train our models!

We can use the ``step`` method from our optimizer to take an optimization step, instead
of manually updating each parameter.

This will let us replace our previous custom optimization step:

```python
with torch.no_grad():
  for p in model.parameters(): p -= p.grad * lr
  model.zero_grad()
```

and instead use just:
```python
opt.step()
opt.zero_grad()
```
where `opt` can be any fancy optimizer.

In [None]:
from torch import optim

We'll define a little function to create our model and optimizer so we
can reuse it in the future:

In [None]:
def get_model():
  """Create a fresh `Mnist_Logistic` model together with its SGD optimizer.

  The learning rate is read from the notebook-level `lr`.

  :return: the tuple `(model, optimizer)`
  """
  new_model = Mnist_Logistic()
  optimizer = optim.SGD(new_model.parameters(), lr=lr)
  return new_model, optimizer

model, opt = get_model()
print(loss_func(model(xb), yb))

for epoch in range(epochs):
  for i in range((n - 1) // bs + 1):
    start_i = i * bs
    end_i = start_i + bs
    xb = x_train[start_i:end_i]
    yb = y_train[start_i:end_i]
    pred = model(xb)
    loss = loss_func(pred, yb)

    loss.backward()
    opt.step()
    opt.zero_grad()

print(loss_func(model(xb), yb))

Note that we are **not replacing the training loop**. The optimizer only helps us for a single step; we still have to run the training loop ourselves.

### Refactor: use `Dataset`


We already explored the PyTorch abstract Dataset class in the *Linear models and Pytorch Datasets* notebook. PyTorch's [`TensorDataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.TensorDataset)
is a `Dataset` that wraps tensors. This also gives us a way to easily iterate, index, and slice the dataset with a more compact code.

In [None]:
from torch.utils.data import TensorDataset

Both ``x_train`` and ``y_train`` can be combined in a single ``TensorDataset``,
which will be easier to iterate over and slice:



In [None]:
train_ds = TensorDataset(x_train, y_train)

Previously, we had to iterate through minibatches of x and y values separately:

```python
    xb = x_train[i*bs : i*bs+bs]
    yb = y_train[i*bs : i*bs+bs]
```

Now, we can do these two steps together:

```python
    xb, yb = train_ds[i*bs : i*bs+bs]
```



In [None]:
model, opt = get_model()

for epoch in range(epochs):
  for i in range((n - 1) // bs + 1):
    xb, yb = train_ds[i * bs: i * bs + bs]
    pred = model(xb)
    loss = loss_func(pred, yb)

    loss.backward()
    opt.step()
    opt.zero_grad()

print(loss_func(model(xb), yb))

### Refactor: use `DataLoader`

We already explored the PyTorch abstract `DataLoader` class in the *Linear models and Pytorch Datasets* notebook.  

Remember that among other things Pytorch's ``DataLoader`` can provide us with minibatches automatically.


In [None]:
from torch.utils.data import DataLoader

train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)

Previously, our loop iterated over batches `xb, yb` like this:

```python
for i in range((n-1)//bs + 1):
  xb,yb = train_ds[i*bs : i*bs+bs]
  pred = model(xb)
```

Now, our loop is much cleaner, as `xb, yb` are loaded automatically from the data loader:

```python
for xb, yb in train_dl:
  pred = model(xb)
```


In [None]:
model, opt = get_model()

for epoch in range(epochs):
  for xb, yb in train_dl:
    pred = model(xb)
    loss = loss_func(pred, yb)

    loss.backward()
    opt.step()
    opt.zero_grad()

print(loss_func(model(xb), yb))

Thanks to Pytorch's ``nn.Module``, ``nn.Parameter``, ``Dataset``, and ``DataLoader``,
our training loop is now dramatically smaller and easier to understand. Let's
now try to add the basic features necessary to create effective models in practice.


### Add: validation set

Although we have neglected it so far for simplicity, you should **always**  have
a [`validation set`](https://www.fast.ai/2017/11/13/validation-sets/) to identify overfitting.

**A note about shuffling:** Shuffling the training data is important to prevent correlation between batches and overfitting. On the other hand, the validation loss will be identical whether we shuffle the validation set or not. Since shuffling takes extra time and makes qualitative comparisons more difficult, _it makes no sense to shuffle the validation data_.

Still, we'll build mini-batches for the validation set as well, for efficiency reasons (e.g. avoid a memory bottleneck of loading the entire validation set at once). We'll use a batch size for the validation set that is twice as large as
that for the training set, because it doesn't need to
store any gradients.




In [None]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid)
valid_dl = DataLoader(valid_ds, batch_size=bs * 2)

We will calculate and print the validation loss at the end of each epoch.

In the code below, we also call `model.train()` before training, and `model.eval()` before inference; this will be required by layers such as ``nn.BatchNorm2d``
and ``nn.Dropout`` to ensure appropriate behavior, and it's good practice to do this always to be safe.

In [None]:
model, opt = get_model()

for epoch in range(epochs):

  model.train()
  for xb, yb in train_dl:
    pred = model(xb)
    loss = loss_func(pred, yb)

    loss.backward()
    opt.step()
    opt.zero_grad()

  model.eval()
  with torch.no_grad():
    valid_loss = sum(loss_func(model(xb), yb) for xb, yb in valid_dl) / len(valid_dl)

  print(epoch, valid_loss)

Is the loss always going down? Try a few times!

### Add: fit() and get_data()

We'll now do a little refactoring of our own. Since we go through a similar
process twice of calculating the loss for both the training set and the
validation set, let's make that into its own function, ``step``, which
computes the loss for one batch and optionally performs one optimization step.

We pass an optimizer in for the training set, and use it to perform
backprop.  For the validation set, we don't pass an optimizer, so the
method doesn't perform backprop.

In [None]:
def step(model, loss_func, xb, yb, opt=None):
  """Compute the loss on one batch and, if an optimizer is given, update the model.

  :param model: callable mapping `xb` to predictions
  :param loss_func: callable mapping `(predictions, yb)` to a scalar loss
  :param xb: batch of inputs
  :param yb: batch of targets
  :param opt: optimizer; when None no backward pass or update is performed
  :return: the tuple `(loss as a Python number, number of samples in the batch)`
  """
  predictions = model(xb)
  loss = loss_func(predictions, yb)

  training = opt is not None
  if training:
    loss.backward()
    opt.step()
    opt.zero_grad()

  batch_size = len(xb)
  return loss.item(), batch_size

``fit`` runs the necessary operations to train our model and compute the
training and validation losses for each epoch:

In [None]:
import numpy as np

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
  """Train `model` and print the validation loss after every epoch.

  :param epochs: number of passes over `train_dl`
  :param model: the model, switched between train and eval mode here
  :param loss_func: loss function forwarded to `step()`
  :param opt: optimizer forwarded to `step()` for the training batches
  :param train_dl: iterable of `(xb, yb)` training batches
  :param valid_dl: iterable of `(xb, yb)` validation batches
  """
  for epoch in range(epochs):
    # Training pass: one optimization step per mini-batch
    model.train()
    for xb, yb in train_dl:
      step(model, loss_func, xb, yb, opt)

    # Validation pass: no optimizer is passed, so nothing gets updated
    model.eval()
    with torch.no_grad():
      batch_stats = [step(model, loss_func, xb, yb) for xb, yb in valid_dl]

    # Here's how zip() works:
    # - step() returns (loss.item(), len(xb)) for each batch.
    # - the input to zip() is [(loss1, size1), (loss2, size2), ...].
    # - the output is two tuples: (loss1, loss2, ...) and (size1, size2, ...).
    losses, batch_sizes = zip(*batch_stats)

    # Weight every batch loss by its size, so a smaller last batch
    # does not skew the average
    weighted_losses = np.multiply(losses, batch_sizes)
    val_loss = np.sum(weighted_losses) / np.sum(batch_sizes)

    print(epoch, val_loss)

``get_data`` returns dataloaders for the training and validation sets.

In [None]:
def get_data(train_ds, valid_ds, bs):
  """Build the data loaders for the training and validation sets.

  :param train_ds: training dataset
  :param valid_ds: validation dataset
  :param bs: training batch size; validation uses twice as much
  :return: the tuple `(train_dl, valid_dl)`
  """
  # Only the training data is shuffled
  train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
  valid_loader = DataLoader(valid_ds, batch_size=bs * 2)
  return train_loader, valid_loader

Now, our whole process of obtaining the data loaders and fitting the
model can be run in 3 lines of code:

In [None]:
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
model, opt = get_model()
fit(epochs, model, loss_func, opt, train_dl, valid_dl)

You can use these basic 3 lines of code to train a wide variety of models.
Let's see if we can use them to train a convolutional neural network (CNN, you will learn about them in the next lessons)!


### Our first CNN

We are now going to build our neural network with three convolutional layers.
Because none of the functions in the previous section assume anything about
the model form, we'll be able to use them to train a CNN without any modification.

We will use Pytorch's predefined [`Conv2d`](https://pytorch.org/docs/stable/nn.html#torch.nn.Conv2d) class
as our convolutional layer. We define a CNN with 3 convolutional layers.
Each convolution is followed by a ReLU.  At the end, we perform an
average pooling.



In [None]:
class Mnist_CNN(nn.Module):
  """Small CNN: three strided 3x3 convolutions with ReLU, then average pooling."""

  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1)
    self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1)
    self.conv3 = nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1)

  def forward(self, xb):
    # conv layers expect input with shape (batch size, channels, height, width)
    xb = xb.view(-1, 1, 28, 28)

    # every convolution is followed by a ReLU
    for conv in (self.conv1, self.conv2, self.conv3):
      xb = F.relu(conv(xb))

    xb = F.avg_pool2d(xb, 4)

    # reshape to 2d tensor:
    # first dimension is batch size (calculated automatically),
    # second dimension is whatever the channel dimension xb.size(1)
    # has become after the convolutions and pooling.
    n_channels = xb.size(1)
    return xb.view(-1, n_channels)

lr = 0.1

In [None]:
model = Mnist_CNN()
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

fit(epochs, model, loss_func, opt, train_dl, valid_dl)

### Use: nn.Sequential

``torch.nn`` has another handy class we can use to simplify our code: [`Sequential`](https://pytorch.org/docs/stable/nn.html#torch.nn.Sequential).

A ``Sequential`` object runs each of the modules contained within it, in a
sequential manner. This is a simpler way of writing a neural network.

Also, this is especially useful when we want to easily define a
**custom layer** that wraps around a given function. For instance, PyTorch doesn't have a `view` layer. We are going to create one for our network.

The class we define below, called `Lambda`, will create a layer that we can then use to construct a network with `Sequential`.

In [None]:
class Lambda(nn.Module):
  """
  Layer that wraps an arbitrary callable.

  Makes it possible to use a plain function (e.g. a reshape) as one step of
  an ``nn.Sequential`` pipeline.

  :param func: callable applied to the input on every forward pass
  """

  def __init__(self, func):
    super().__init__()
    self.func = func

  def forward(self, x):
    # Delegate entirely to the wrapped callable.
    result = self.func(x)
    return result


def preprocess(x):
  """Reshape a batch of flattened images to (batch size, 1, 28, 28), as the conv layers expect."""
  image_shape = (1, 28, 28)  # (channels, height, width)
  return x.view(-1, *image_shape)

Instead of using the module `Mnist_CNN`, we can now build our network by using ``Sequential``:

In [None]:
# Same architecture as `Mnist_CNN`, expressed as a flat list of layers.
model = nn.Sequential(
  Lambda(preprocess),  # prepare the images for the conv layers: reshape to (batch size, 1, 28, 28)
  nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),   # spatial size 28x28 -> 14x14
  nn.ReLU(),
  nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1),  # 14x14 -> 7x7
  nn.ReLU(),
  nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1),  # 7x7 -> 4x4, one channel per class
  nn.ReLU(),
  nn.AvgPool2d(4),                                        # average over the whole 4x4 grid -> 1x1
  Lambda(lambda x: x.view(x.size(0), -1)),                # flatten to (batch size, 10)
)

# Optimizer and training loop are unchanged with respect to the previous cells.
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

fit(epochs, model, loss_func, opt, train_dl, valid_dl)

### Use: DataLoader wrappers

Our CNN is fairly concise, but it only works with MNIST, because:
 - It assumes the input is a 28\*28 long vector (look at the reshaping done with `Lambda(preprocess)`)
 - It assumes that the final CNN grid size is 4\*4 (since that's the average
pooling kernel size we used)

Let's get rid of these two assumptions, so our model works with any 2d
single channel image. First, we can remove the initial Lambda layer,
moving the data preprocessing directly into the data loader:



In [None]:
def preprocess(x, y):
  """Reshape the images of a batch to (batch size, 1, 28, 28); the labels are passed through untouched."""
  images = x.view(-1, 1, 28, 28)
  return images, y


class WrappedDataLoader:
  """
  Wrapper around a data loader that applies a function to every batch.

  :param dl: the underlying iterable of batches (it must also support ``len``)
  :param func: callable receiving the unpacked batch and returning the processed batch
  """

  def __init__(self, dl, func):
    self.dl = dl
    self.func = func

  def __len__(self):
    # Wrapping does not change the number of batches.
    return len(self.dl)

  def __iter__(self):
    # `yield` turns this method into a generator: batches are processed
    # lazily, one at a time, only when they are requested.
    for batch in self.dl:
      processed = self.func(*batch)  # *batch unpacks the batch into positional arguments
      yield processed

# Re-create the data loaders and wrap them, so that every batch is reshaped by
# `preprocess` before it reaches the model.
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)

Next, we can replace ``nn.AvgPool2d`` with ``nn.AdaptiveAvgPool2d``, which
allows us to define the size of the *output* tensor we want, rather than
the *input* tensor we have. As a result, our model will work with inputs of any size.

In [None]:
# No initial `Lambda(preprocess)` layer here: the model expects batches that
# already have shape (batch size, 1, height, width).
model = nn.Sequential(
  nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),
  nn.ReLU(),
  nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1),
  nn.ReLU(),
  nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1),
  nn.ReLU(),
  nn.AdaptiveAvgPool2d(1),                  # output is 1x1 per channel, whatever the input grid size
  Lambda(lambda x: x.view(x.size(0), -1)),  # flatten to (batch size, 10)
)

opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

Let's try it out:

In [None]:
fit(epochs, model, loss_func, opt, train_dl, valid_dl)

### Use: your GPU

If you're lucky enough to have access to a CUDA-capable GPU, you can
use it to speed up your code. First check that your GPU is working in
PyTorch:



In [None]:
print(torch.cuda.is_available())

And then create a device object for it:



In [None]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Let's update `preprocess` to move batches to the GPU:

In [None]:
def preprocess(x, y):
  """Reshape the images to (batch size, 1, 28, 28), then move both images and labels to `dev`."""
  images = x.view(-1, 1, 28, 28)
  return images.to(dev), y.to(dev)

# Rebuild and re-wrap the loaders so that they pick up the new `preprocess`,
# which also moves every batch to `dev`.
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)

Finally, we can move our model to the GPU.



In [None]:
# Move the model's parameters to `dev` first, then build the optimizer on them
# (the torch.optim docs recommend constructing optimizers after the move).
model.to(dev)
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

It runs faster now!



In [None]:
fit(epochs, model, loss_func, opt, train_dl, valid_dl)

### Closing thoughts


We now have a general data pipeline and training loop that you can use for
training many types of models using PyTorch.

Of course, there are many things you'll want to add, such as data augmentation, hyperparameter tuning, monitoring training, transfer learning, and so forth.

Let's summarize what we've seen:

 - **torch.nn**

   + ``Module``: creates a callable that behaves like a function, but can also
     contain a state. It knows what ``Parameter``s it
     contains and can zero all their gradients, loop through them for weight updates, etc.
   + ``Parameter``: a wrapper for a tensor that tells a ``Module`` that it has weights
     that need updating during backprop. Only tensors with the `requires_grad` attribute set are updated.
   + ``functional``: a module (usually imported into the ``F`` namespace by convention) containing activation functions, loss functions, etc, as well as non-stateful
     versions of layers such as convolutional and linear layers.
 - ``torch.optim``: Contains optimizers such as ``SGD``, which update the weights
   of each ``Parameter`` when ``step()`` is called, using the gradients computed during the backward pass.
 - ``Dataset``: An abstract interface of objects with a ``__len__`` and a ``__getitem__``,
   including classes provided with Pytorch such as ``TensorDataset``.
 - ``DataLoader``: Takes any ``Dataset`` and creates an iterator that returns batches of data.

---

Tutorial on `torch.nn` adapted from this [tutorial](https://pytorch.org/tutorials/beginner/nn_tutorial.html).


## Playground: MLPs expressivity


What type of functions does an MLP learn?

To help our intuition let's try to visualize the *functions that a deep MLP learns*, while changing some hyperparameters.

In order to do that, we will use this simple parametric model, where the dimension of the intermediate tensors (the **hidden dimension**), the **number of layers** and the **type of activation** are parametric.

⚠️ **Compatibility warning:** For this last plot we are downgrading to plotly 5.11.0.

In [None]:
!pip install plotly==5.11.0

In [None]:
import torch
from torch import nn
import plotly.express as px
import numpy as np
import torch.nn.functional as F
from typing import Mapping, Union, Optional, Callable
import numpy as np
import argparse
import torch.optim as optim
import plotly.graph_objects as go
from torchvision import datasets, transforms
from tqdm.notebook import tqdm

In [None]:
class MLP2D(nn.Module):
  """
  Parametric MLP that maps every 2D point (x, y) to a single scalar value.

  :param num_layers: number of hidden (linear, activation) blocks
  :param hidden_dim: number of features of the intermediate tensors
  :param activation: non-linearity applied after each hidden linear layer
  """

  def __init__(self,
               num_layers: int,
               hidden_dim: int,
               activation: Callable[[torch.Tensor], torch.Tensor]
               ) -> None:
    super().__init__()

    # Lifts the 2 input coordinates to `hidden_dim` features.
    self.first_layer = nn.Linear(in_features=2, out_features=hidden_dim)

    # A ModuleList registers the nested modules, so their parameters are
    # returned by model.parameters(). Parameters of modules stored in a
    # normal python list would not be.
    self.layers = nn.ModuleList(
        [nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
         for _ in range(num_layers)]
    )
    self.activation = activation

    # Projects the `hidden_dim` features down to one output value.
    self.last_layer = nn.Linear(in_features=hidden_dim, out_features=1)

  def forward(self, meshgrid: torch.Tensor) -> torch.Tensor:
    """
    Applies the network to each (x, y) point independently

    :param meshgrid: tensor of dimensions [..., 2], where ... means any number of dims
    :returns: tensor of dimensions [...], one predicted value per point
    """
    # Note: no activation is applied between `first_layer` and the first hidden linear layer.
    hidden = self.first_layer(meshgrid)
    for linear in self.layers:  # `num_layers` (linear, activation) blocks
      hidden = self.activation(linear(hidden))
    prediction = self.last_layer(hidden)  # [..., hidden_dim] -> [..., 1]

    # Drop the trailing singleton dimension.
    return prediction.squeeze(-1)


Let's try this model to understand how it works:

In [None]:
# A small network: 3 hidden (linear, ReLU) blocks with 10 features each.
model = MLP2D(num_layers=3,
              hidden_dim=10,
              activation=torch.nn.functional.relu)

In [None]:
# Explore the parameters, i.e. the trainable tensors that require a grad in this model.
# `parameters`/`named_parameters` are methods: without the call, the cell would
# only display the bound method object instead of the parameters themselves.
{name: tuple(param.shape) for name, param in model.named_parameters()}

Let's try to execute this model on some input:

In [None]:
# A single point, input shape [2] -> the output is a 0-dimensional (scalar) tensor.
model(torch.as_tensor([1., 2.]))

In [None]:
# A batch containing one point, input shape [1, 2] -> output shape [1].
model(torch.as_tensor([[1., 2.]]))

In [None]:
# A batch of two (identical) points, input shape [2, 2] -> output shape [2].
model(torch.as_tensor([[1., 2.], [1., 2.]]))

In [None]:
# Any number of leading dimensions is allowed: input shape [2, 3, 2, 2] -> output shape [2, 3, 2].
model(torch.rand(2, 3, 2, 2))

Now we are going to sample points from some simple function, to create our dataset. Our model will try to reconstruct the function from these points.

We can use a `Dataset` to keep the code clean and organize the sampled points.
Keep in mind, for each point the model will receive the $(x, y)$ coordinates and try to predict the $z = f(x, y)$ coordinate.

In [None]:
from typing import Dict

class PointsDataset(torch.utils.data.Dataset):
  """
  Dataset of sampled points: the inputs `x` paired with their target values `y_true`.

  :param x: the model inputs, indexed along the first dimension
  :param y_true: the ground-truth values, one per entry of `x`
  """

  def __init__(self, x: torch.Tensor, y_true: torch.Tensor) -> None:
    super().__init__()
    self.x, self.y_true = x, y_true

  def __len__(self) -> int:
    # One sample per entry along the first dimension of the targets.
    num_samples = self.y_true.shape[0]
    return num_samples

  def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
    coordinates = self.x[idx, ...]  # the "x" are the (x, y) coordinates of the idx-th point
    target = self.y_true[idx]       # the "y" is the (z) coordinate of the idx-th point
    return dict(x=coordinates, y=target)

In [None]:
# @title Dataset creation { run: "auto" }

# Functions selectable as ground truth; each one is called below on a tensor
# of shape [..., 2] holding the (x, y) coordinates.
fn_names = {
    'peaks': peaks,
    'rastrigin': rastrigin,
    'rosenbrock': rosenbrock,
    'simple_fn': simple_fn,
    'simple_fn2': simple_fn2
}

base_fn = 'peaks' #@param ["simple_fn", "simple_fn2", "peaks",  "rastrigin", "rosenbrock"]
n_rand = 80 #@param {type:"slider", min:0, max:200, step:1}
plot_lim = 3  #@param {type:"slider", min:0, max:50, step:1}
plot_height = 700 #@param {type:"slider", min:0, max:1500, step:50}

# `lim` bounds both the sampling region and the plotted region.
lim = plot_lim
fn = fn_names[base_fn]

# Sample `n_rand` coordinates uniformly in [-lim, lim) along each axis.
x_random = torch.rand(n_rand) * lim * 2 - lim
y_random = torch.rand(n_rand) * lim * 2 - lim

# Inputs of shape [n_rand, 2] and the corresponding ground-truth values z = fn(x, y).
xy_data = torch.stack((x_random, y_random), dim=-1)
xy_groundtruth = fn(xy_data)


# Mini-batches of 16 points, reshuffled at every epoch.
points_dl =  torch.utils.data.DataLoader(
    PointsDataset(x=xy_data, y_true=xy_groundtruth),
    batch_size=16,
    shuffle=True
)

# (x, y, z) triplets, shape [n_rand, 3], used to draw the samples on the plot.
points = torch.cat((xy_data, xy_groundtruth[..., None]), dim=-1)


plot_points_over_landscape(fn, points, lim=lim, height=plot_height)

In [None]:
# @title Learned function from sampled points { run: "auto" }

# Activation functions selectable from the form below.
activation_names = {
    'relu': torch.relu,
    'leaky_relu': torch.nn.functional.leaky_relu,
    'elu': torch.nn.functional.elu,
    'sigmoid': torch.sigmoid,
    'tanh': torch.tanh,
}

num_layers = 2 #@param {type:"slider", min:0, max:20, step:1}

activation_fn = 'relu' #@param ["relu", "leaky_relu", "elu", "sigmoid",  "tanh"]
activation_fn = activation_names[activation_fn]

hidden_dim = 16 #@param {type:"slider", min:0, max:512, step:1}

num_epochs = 610 #@param {type:"slider", min:0, max:1000, step:10}
learning_rate = 0.001 #@param {type:"number"}

plot_lim = 4  #@param {type:"slider", min:0, max:50, step:1}
plot_height = 900 #@param {type:"slider", min:0, max:1500, step:50}

def energy(y_pred, y_true):
  """Mean squared error between the predicted and the ground-truth values."""
  return torch.nn.functional.mse_loss(y_pred, y_true)


device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the target device *before* constructing the optimizer,
# as recommended by the torch.optim documentation.
model = MLP2D(num_layers=num_layers, activation=activation_fn, hidden_dim=hidden_dim)
model = model.to(device)
opt = torch.optim.Adam(model.parameters(), lr=learning_rate)

for _ in tqdm(range(num_epochs), desc="epoch"):
  for batch in points_dl:
    x = batch['x'].to(device)  # (x, y) coordinates of the sampled points
    y = batch['y'].to(device)  # ground-truth z values
    y_pred = model(x)

    loss = energy(y_pred, y)

    loss.backward()
    opt.step()
    opt.zero_grad()  # reset the gradients, so they do not accumulate across iterations


# Use this cell's own `plot_lim` for the plotted region: the learned function
# can then also be inspected outside of the region the points were sampled from.
plot_points_over_landscape(model.cpu(), points.cpu(), lim=plot_lim, height=plot_height).show()

!!!

If you have problems visualizing the previous animation in Google Colab, please explore it in this streamlit app:

-  https://lucmos-demo-nn-expressivity-ui-plhjjk.streamlit.app/