In [1]:
# from https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html

In [3]:
import torch
from torchvision.models import resnet18, ResNet18_Weights

### torch.autograd
`torch.autograd` is PyTorch's automatic differentiation engine; it computes the gradients used to train neural networks.

In [4]:
# Set up the example: a ResNet-18 with its default pretrained weights, a random
# data tensor shaped like a batch of one 3x64x64 image, and a random label
# tensor of shape (1, 1000).
model = resnet18(weights=ResNet18_Weights.DEFAULT)
data = torch.rand((1, 3, 64, 64))
labels = torch.rand((1, 1000))

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\chris/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [11]:
# Forward pass: run the input through the model to get a prediction.
# No gradients are computed here. Autograd only records the operations
# performed, so that the later loss.backward() call can compute the gradient
# of the loss with respect to each model parameter.
prediction = model(data)

In [12]:
# Reduce the element-wise difference between prediction and labels to a single
# scalar loss, then backpropagate that loss through the model.
#
# The gradients are computed during backward() (not during the forward pass);
# autograd stores them in each model parameter's .grad attribute.
loss = torch.sum(prediction - labels)  # individual differences, reduction
loss.backward()  # backprop

In [14]:
# Create an SGD optimizer and register every parameter of the model with it.
optim = torch.optim.SGD(
    params=model.parameters(),
    lr=0.01,
    momentum=0.9,
)

In [15]:
# .step() runs one gradient-descent update: the optimizer adjusts each
# registered parameter using the gradient stored in its .grad attribute.
optim.step()

# Exclusion from the DAG
Autograd records the computations that are performed in a directed acyclic graph (DAG). Traversing the graph in the forward direction (prediction) runs each operation and stores that operation's gradient function in the graph; no gradients are computed yet. Traversing the graph in the reverse direction (backprop, started by calling backward() on the loss) computes the gradients with the chain rule and accumulates them in the .grad attribute of each parameter in the graph.

The DAG is dynamic: it is re-created from scratch, with autograd starting to populate a new graph after each backward() call. This allows the structure of the graph to change during training, so flow control statements are allowed in the training algorithm.

By default, torch tracks all tensors that have the requires_grad flag set to True. Setting the flag to False on a tensor that does not require gradients excludes it from the gradient computation DAG.


In [17]:
# An operation's output tensor requires gradients as soon as at least one of
# its input tensors has requires_grad=True.
x = torch.rand(5, 5)
y = torch.rand(5, 5)
z = torch.rand(5, 5, requires_grad=True)

a = torch.add(x, y)  # neither input tracks gradients
b = torch.add(x, z)  # z tracks gradients, so the result does too

print(f"Does 'a' require gradients? : {a.requires_grad}")
print(f"Does 'b' require gradients? : {b.requires_grad}")


Does 'a' require gradients? : False
Does 'b' require gradients? : True


In [18]:
# Parameters that do not compute gradients are called "frozen parameters".
# Freezing part of a model is useful when you know in advance that those
# gradients are not needed: it saves computation and keeps training focused
# on the intended parameters.
#
# Fine-tuning a pretrained network is another common case: most of the model
# is frozen and only the layers that make predictions on the new labels are
# modified.
#
# Here we load a pretrained resnet18 and freeze all of its parameters.

from torch import nn
from torch import optim

model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Freeze every parameter in the network.
for param in model.parameters():
    param.requires_grad_(False)


In [19]:
# To fine-tune just the last layer, replace it with a new linear layer (new
# layers are unfrozen by default). The input width is read from the layer being
# replaced rather than hard-coded, so it always matches the backbone
# (512 for resnet18). The new layer has 10 outputs.
model.fc = nn.Linear(model.fc.in_features, 10)

In [21]:
# Optimize the model. Every parameter is registered with the optimizer, but
# only the new last layer is updated, because the frozen parameters never
# receive gradients.
#
# torch.optim is referenced by its full path because the bare name `optim` is
# bound to an SGD instance by one cell of this notebook and to the torch.optim
# module by another, so its meaning depends on which of those cells ran last.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

In [22]:
# the only parameters that are computing gradients are the last stage