# Simple example
https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html

In [1]:
### 
import torch

# Two leaf tensors; flagging them with requires_grad makes autograd record
# every operation that uses them.
a = torch.tensor([2.0, 3.0]).requires_grad_(True)
b = torch.tensor([6.0, 4.0]).requires_grad_(True)

\begin{align*}
Q = 3a^3 - b^2
\end{align*}

In [2]:
# Element-wise Q = 3a^3 - b^2, built from the gradient-tracking tensors a and b.
Q = 3 * a.pow(3) - b.pow(2)

\begin{align*}
\frac{\partial Q}{\partial a} &= 9a^2\\
\frac{\partial Q}{\partial b} &= -2b
\end{align*}

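When we call `.backward()` on `Q`, autograd calculates these gradients and stores them in the respective tensors’ `.grad` attribute. However, `.backward()` expects a scalar by default, so we first reduce `Q` to a scalar with `.sum()` and then call backward. Because each element of `Q` depends only on the matching elements of `a` and `b`, the gradient of the sum is exactly the element-wise derivatives above.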

In [3]:
# backward() without arguments needs a scalar, so reduce Q by summation first.
torch.sum(Q).backward()

In [4]:
# Closed-form derivative 9a^2 next to the value autograd stored on a.
print('gradient of Q wrt a: ')
print('analytical: ', 9 * a.pow(2))
print('autograd: ', a.grad)

# Same comparison for b, whose closed-form derivative is -2b.
print('gradient of Q wrt b: ')
print('analytical: ', b.mul(-2))
print('autograd: ', b.grad)

gradient of Q wrt a: 
analytical:  tensor([36., 81.], grad_fn=<MulBackward0>)
autograd:  tensor([36., 81.])
gradient of Q wrt b: 
analytical:  tensor([-12.,  -8.], grad_fn=<MulBackward0>)
autograd:  tensor([-12.,  -8.])


# Slightly more complicated example

https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html

![](https://pytorch.org/tutorials/_images/comp-graph.png)

In [5]:
x = torch.ones(5)   # input tensor
y = torch.zeros(3)  # expected output - 3 dimensional
print(x, y, sep="\n")

# trainable parameters: weight matrix and bias vector
# NOTE: this rebinds `b`, replacing the tensor `b` from the first example.
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# linear predictors (logits)
z = x @ w + b

# binary cross-entropy evaluated directly on the unnormalized pre-activations
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

tensor([1., 1., 1., 1., 1.])
tensor([0., 0., 0.])


To optimize the parameters in the network, we compute the derivatives of our loss function with respect to parameters. Specifically, we are concerned with:
\begin{align*}
&\frac{\partial L}{\partial w} \textrm{ and } \frac{\partial L}{\partial b}
\end{align*}

at some `x` and `y`. To compute those derivatives, we again call `loss.backward()`; the results can then be retrieved via `w.grad` and `b.grad`, respectively:

In [6]:
# Backpropagate from the scalar loss, then show the gradients stored on w and b.
loss.backward()
print(w.grad, b.grad, sep="\n")

tensor([[0.0505, 0.1686, 0.3321],
        [0.0505, 0.1686, 0.3321],
        [0.0505, 0.1686, 0.3321],
        [0.0505, 0.1686, 0.3321],
        [0.0505, 0.1686, 0.3321]])
tensor([0.0505, 0.1686, 0.3321])


# Autograd in training
https://pytorch.org/tutorials/beginner/introyt/autogradyt_tutorial.html

In [7]:
BATCH_SIZE = 32
DIM_IN = 784
HIDDEN_SIZE = 1000
DIM_OUT = 10


class Model(torch.nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        dim_in: number of input features (defaults to ``DIM_IN``).
        hidden_size: width of the hidden layer (defaults to ``HIDDEN_SIZE``).
        dim_out: number of output features (defaults to ``DIM_OUT``).
    """

    def __init__(self, dim_in=DIM_IN, hidden_size=HIDDEN_SIZE, dim_out=DIM_OUT):
        super().__init__()

        self.layer1 = torch.nn.Linear(dim_in, hidden_size)
        self.relu = torch.nn.ReLU()
        self.layer2 = torch.nn.Linear(hidden_size, dim_out)

    def forward(self, x):
        """Apply layer1, ReLU and layer2 in sequence; no activation follows layer2."""
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x


# instantiate the model with the default layer sizes
model = Model()

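Layers such as `torch.nn.Linear` store their weights and biases as `torch.nn.Parameter` objects, which track gradients by default, so within a subclass of `torch.nn.Module` we need not specify `requires_grad = True` ourselves.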

In [8]:
# Inspect the requires_grad flag on the second layer's weight tensor.
print(model.layer2.weight.requires_grad)

True


In [9]:
# random stand-ins for one batch of inputs and the matching targets
# (torch.randn leaves requires_grad off unless asked otherwise)
some_input = torch.randn(BATCH_SIZE, DIM_IN)
ideal_output = torch.randn(BATCH_SIZE, DIM_OUT)

# forward pass through the model
prediction = model(some_input)

# SGD optimizer that will update every parameter of the model
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# compute the loss: mean squared error between targets and predictions
loss = torch.mean((ideal_output - prediction) ** 2)

In [10]:
# one training step
# backpropagate from the scalar loss computed in the previous cell
loss.backward()
# let the SGD optimizer update the parameters it was constructed with
optimizer.step()

In [11]:
from torch.nn import MSELoss

# start over with a freshly initialized model and a new optimizer
model = Model()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_func = MSELoss()

# repeat the training step 100 times on the same batch
for _ in range(100):
    prediction = model(some_input)
    loss = loss_func(prediction, ideal_output)  # print loss.item() here to monitor training

    loss.backward()
    optimizer.step()

    # backward() adds into each parameter's .grad instead of overwriting it,
    # so the gradients must be cleared before the next iteration;
    # set_to_none=False zeroes the existing .grad tensors rather than dropping them
    optimizer.zero_grad(set_to_none=False)

In [12]:
# Evaluation needs no gradients, so the forward pass runs with autograd disabled.
some_input_test = torch.randn(BATCH_SIZE, DIM_IN)
with torch.no_grad():
    prediction = model(some_input_test)
    val_loss = loss_func(prediction, ideal_output)
# NOTE(review): ideal_output is the target batch generated alongside some_input,
# not some_input_test -- confirm that reusing it here is intended.
print(val_loss)

tensor(1.0155)
