# Gradient calculation with PyTorch

Although PyTorch already comes with a series of common loss functions, it is still important to be able to use PyTorch to calculate gradients at will for future model implementations.

In [1]:
import torch
from torch import nn

PyTorch uses **Autograd** to evaluate gradients at specified input values. See the quadratic function example below:

In [2]:
# Quadratic function example

# Autograd only tracks tensors created with requires_grad=True, and a tensor
# can only require gradients if its dtype is floating point (not integer).
x = torch.tensor(2, dtype=torch.float, requires_grad=True)

# f(x) = 3 + 2x + x^2  =>  f'(x) = 2 + 2x
y = 3 + 2*x + x**2

print(f"x = {x}")
print(f"f(x) = {y}")
print(f"y type: {type(y)}")
# No backward pass has run yet, so x.grad is still None at this point.
print(f"f\'(x={x}) = {x.grad}")

# Back-propagate from y; this fills x.grad with df/dx evaluated at x.
y.backward()

print(f"f\'(x={x}) = {x.grad}")


x = 2.0
f(x) = 11.0
y type: <class 'torch.Tensor'>
f'(x=2.0) = None
f'(x=2.0) = 6.0


In [3]:
# Quadratic function example - non-scalar version

# Same quadratic as before, but x is a 1-element vector, so the gradient
# stored in x.grad is a 1-element tensor rather than a scalar.
# requires_grad=True is mandatory, and the dtype must be floating point.
x = torch.tensor([2], dtype=torch.float, requires_grad=True)

# f(x) = 3 + 2x + x^2  =>  f'(x) = 2 + 2x
y = 3 + 2*x[0] + x[0]**2

print(f"x = {x[0]}")
print(f"f(x) = {y}")
# x.grad stays None until backward() has been called.
print(f"f\'(x={x[0]}) = {x.grad}")

# Back-propagate from the scalar y to populate x.grad.
y.backward()

print(f"f\'(x={x[0]}) = {x.grad}")


x = 2.0
f(x) = 11.0
f'(x=2.0) = None
f'(x=2.0) = tensor([6.])


Computing gradients for multivariate functions:

In [4]:
# Functions with multiple inputs:
# each variable is its own scalar leaf tensor, so each gets its own .grad.

x0 = torch.tensor(2, dtype=torch.float16, requires_grad=True)
x1 = torch.tensor(3, dtype=torch.float16, requires_grad=True)

# f(x0, x1) = x0^2 + x1^3 + x0 * x1
#   df/dx0 = 2*x0 + x1
#   df/dx1 = 3*x1^2 + x0
y = x0 ** 2 + x1 ** 3 + x0 * x1

print(f"x0={x0}")
print(f"x1={x1}")
print(f"f({x0}, {x1})={y}")
# Both gradients are still None because backward() has not run yet.
print(f"Gradient with respect to x0: {x0.grad}")
print(f"Gradient with respect to x1: {x1.grad}")

# A single backward pass fills in the partial derivative of every leaf.
y.backward()

print(f"Gradient with respect to x0 (df/dx0(2,3)): {x0.grad}")
print(f"Gradient with respect to x1 (df/dx1(2,3)): {x1.grad}")

print("---vector version---")

# Both variables are packed into one leaf tensor, so the two partial
# derivatives come back together as a single gradient vector in x.grad.
x = torch.tensor([2, 3], dtype=torch.float16, requires_grad=True)

# f(x) = x0^2 + x1^3 + x0 * x1
y = x[0] ** 2 + x[1] ** 3 + x[0] * x[1]

print(f"x={x}")
print(f"f(x)={y}")
# Still None: no backward pass has been run on this graph yet.
print(f"f\'(x)={x.grad}")

y.backward()

print(f"Gradient vector: f\'(x)={x.grad}")



x0=2.0
x1=3.0
f(2.0, 3.0)=37.0
Gradient with respect to x0: None
Gradient with respect to x1: None
Gradient with respect to x0 (df/dx0(2,3)): 7.0
Gradient with respect to x1 (df/dx1(2,3)): 29.0
---vector version---
x=tensor([2., 3.], dtype=torch.float16, requires_grad=True)
f(x)=37.0
f'(x)=None
Gradient vector: f'(x)=tensor([ 7., 29.], dtype=torch.float16)


In [5]:
# Vector function example

x = torch.tensor([2,3], requires_grad=True, dtype=torch.float16)

def vector_fn(x: torch.Tensor) -> torch.Tensor:
    return torch.stack([
        x[0] ** 2 + x[1] * 4,
        x[1] ** 3 + torch.log(x[0])
    ])

y = vector_fn(x)

print(f"x = {x}")
print(f"f(x) = {y}")

jacobian = torch.zeros(2,2)

for i in range(2):
    print(f"iteration: {i}")
    # Zero out previous gradients
    if x.grad is not None:
        x.grad.zero_()
    
    # Compute gradient for each component
    y[i].backward(
        retain_graph=True #must set retain_graph=True for gradient computation more than once
    ) 
    jacobian[i] = x.grad
    print(f"{i}th row/gradient of {i}th function: {jacobian[i]}")

print(jacobian)


x = tensor([2., 3.], dtype=torch.float16, requires_grad=True)
f(x) = tensor([16.0000, 27.6875], dtype=torch.float16, grad_fn=<StackBackward0>)
iteration: 0
0th row/gradient of 0th function: tensor([4., 4.])
iteration: 1
1th row/gradient of 1th function: tensor([ 0.5000, 27.0000])
tensor([[ 4.0000,  4.0000],
        [ 0.5000, 27.0000]])


In [None]:
# Sum square loss

class SumSquareLoss(nn.Module):
    """Sum-of-squared-errors loss: sum((targets - predictions) ** 2)."""

    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets):
        """Return the squared differences between targets and predictions, summed over all elements."""
        loss = torch.sum((targets - predictions) ** 2) #sum, not mean
        return loss


# Backward-compatible alias: this class was originally published under the
# name MeanSquareLoss even though it sums rather than averages.
MeanSquareLoss = SumSquareLoss
    
# Toy batch: only the predictions need gradients; the targets are constants.
targets = torch.tensor([2, 3, 4], dtype=torch.float16)
predictions = torch.tensor([1, 2, 3], dtype=torch.float16, requires_grad=True)

loss_function = MeanSquareLoss()
loss = loss_function(predictions, targets)
print(f"loss value: {loss}")

# Back-propagate from the scalar loss to fill predictions.grad.
loss.backward()

print(f"gradients: {predictions.grad}")

loss value: 3.0
gradients: tensor([-2., -2., -2.], dtype=torch.float16)


Let's look at more complicated functions below:

In [None]:
# Mean square loss

class MeanSquareLoss(nn.Module):
    """Mean-squared-error loss: mean((targets - predictions) ** 2)."""

    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets):
        """Return the squared differences averaged over all elements."""
        squared_error = (targets - predictions) ** 2
        # Averaging (rather than summing) is what makes this the *mean* square loss.
        return squared_error.mean()
    
# Toy batch: only the predictions need gradients; the targets are constants.
targets = torch.tensor([2, 3, 4], dtype=torch.float16)
predictions = torch.tensor([1, 2, 3], dtype=torch.float16, requires_grad=True)

loss_function = MeanSquareLoss()
loss = loss_function(predictions, targets)
print(f"loss value: {loss}")

# Back-propagate from the scalar loss to fill predictions.grad.
loss.backward()

print(f"gradients: {predictions.grad}")

loss value: 1.0
gradients: tensor([-0.6665, -0.6665, -0.6665], dtype=torch.float16)


In [8]:
# More complex loss function

class ComplexCustomLoss(nn.Module):
    """Weighted blend of two losses: alpha * MSE + beta * mean absolute error."""

    def __init__(self, alpha = 0.65, beta = 0.35):
        super().__init__()
        # Weights applied to the squared-error term and the absolute-error term.
        self.alpha, self.beta = alpha, beta

    def forward(self, predictions, targets):
        """Return alpha * mean squared error + beta * mean absolute error."""
        mse_term = ((targets - predictions) ** 2).mean()
        l1_term = (predictions - targets).abs().mean()
        return self.alpha * mse_term + self.beta * l1_term
    

# Toy batch: only the predictions need gradients; the targets are constants.
targets = torch.tensor([2, 3, 7], dtype=torch.float16)
predictions = torch.tensor([1, 2, 3], dtype=torch.float16, requires_grad=True)

# Same weights as the class defaults, passed explicitly for readability.
loss_function = ComplexCustomLoss(alpha=0.65, beta=0.35)
loss = loss_function(predictions, targets)

print(f"loss value: {loss}")

# Back-propagate from the scalar loss to fill predictions.grad.
loss.backward()

print(f"gradients evaluated to: {predictions.grad}")

loss value: 4.6015625
gradients evaluated to: tensor([-0.5498, -0.5498, -1.8496], dtype=torch.float16)
