In [0]:
import torch

In [0]:
tensor1 = torch.Tensor([[1, 2, 3],
                      [4, 5, 6]])

tensor1

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [0]:
tensor2 = torch.Tensor([[7, 8, 9],
                      [10, 11, 12]])

tensor2

tensor([[ 7.,  8.,  9.],
        [10., 11., 12.]])

When the ".requires_grad" property of a tensor is set to True, autograd tracks the computations performed on that tensor in the forward phase and calculates gradients for it in the backward phase

In [0]:
tensor1.requires_grad

False

In [0]:
tensor2.requires_grad

False

Enable history tracking for a tensor so that gradients are calculated with respect to that tensor.

In other words, to set the ".requires_grad" flag/property to True, use ".requires_grad_()"

In [0]:
tensor1.requires_grad_()

tensor([[1., 2., 3.],
        [4., 5., 6.]], requires_grad=True)

In [0]:
tensor1.requires_grad

True

In [0]:
tensor2.requires_grad

False

The gradients calculated using automatic differentiation with respect to a tensor are stored in the grad matrix associated with that tensor

This will be None because we have set up a tensor but not used it within a computation graph, i.e. we haven't performed a forward or backward pass

In [0]:
print(tensor1.grad)

None


A computation graph in PyTorch is made up of Tensors and Functions, which together form a directed acyclic graph. Tensors can be thought of as the nodes and Functions as the transformations performed on the tensors.

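Every tensor has a ".grad_fn" attribute that references the function which created that tensor, i.e. the function that performed the computation. It is None for tensors that were created directly rather than by an operation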

In [0]:
# tensor1 was created directly (not by an operation), so it has no grad_fn.
print(tensor1.grad_fn)

None


In [0]:
output_tensor = tensor1 * tensor2

When you create a tensor using an operation, "requires_grad" property of the resulting tensor is based on the input tensors that we used to create this output tensor

Here "requires_grad" is true because tensor1 was used to create this output_tensor and requires_grad for tensor1 is already set to true

In [0]:
output_tensor.requires_grad

True

.grad is None because we haven't done any backward pass

In [0]:
print(output_tensor.grad)

None


In [0]:
print(output_tensor.grad_fn)

<MulBackward0 object at 0x7f28eead7390>


In [0]:
print(tensor2.grad_fn)

None


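Why does a tensor created by an operation have a grad_fn associated with it? Because grad_fn records the operation that produced the tensor, which is what the backward pass uses to propagate gradients through the graph.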

In [0]:
output_tensor = (tensor1 * tensor2).mean()
print(output_tensor.grad_fn)

<MeanBackward0 object at 0x7f28eead7f28>


Even though tensor1 has been part of multiple computation graphs, its grad matrix is None because we haven't done a backward pass yet

In [0]:
print(tensor1.grad)

None


Gradient calculation i.e. vector of partial derivatives can be calculated only when we call ".backward()" function on an output

In [0]:
output_tensor.backward()

These gradients are the partial derivatives of output_tensor with respect to each element of tensor1

In [0]:
print(tensor1.grad)

tensor([[1.1667, 1.3333, 1.5000],
        [1.6667, 1.8333, 2.0000]])


Since the gradients are partial derivatives with respect to tensor1, the gradient will exactly match the shape of the tensor

In [0]:
tensor1.grad.shape, tensor1.shape

(torch.Size([2, 3]), torch.Size([2, 3]))

tensor2 has no gradient because requires_grad is set to False

In [0]:
print(tensor2.grad)

print(tensor2.requires_grad)

None
False


There are also no gradients stored on the output tensor: it is the tensor whose partial derivatives with respect to tensor1 we calculated, and it is not a leaf tensor, so autograd does not retain a .grad for it by default

In [0]:
print(output_tensor.grad)

None


In [0]:
new_tensor = tensor1 * 3
print(new_tensor.requires_grad)

True


In [0]:
new_tensor

tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]], grad_fn=<MulBackward0>)

History tracking can be turned off, even for tensors with requires_grad set to True, by using torch.no_grad()

In [0]:
# Operations executed inside torch.no_grad() are not recorded by autograd,
# so the result does not require grad even though tensor1 does.
with torch.no_grad():
    new_tensor = tensor1 * 3
    print('new_tensor = ', new_tensor)

    # The inputs keep their own requires_grad flags; only the result differs.
    print('requires_grad for tensor1 = ', tensor1.requires_grad)
    print('requires_grad for tensor2 = ', tensor2.requires_grad)
    print('requires_grad for new_tensor = ', new_tensor.requires_grad)

new_tensor =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]])
requires_grad for tensor1 =  True
requires_grad for tensor2 =  False
requires_grad for new_tensor =  False


requires_grad is set to False because it was created within a torch.no_grad block

In [0]:
print('requires_grad for new_tensor = ', new_tensor.requires_grad)

requires_grad for new_tensor =  False


Using decorators to set requires_grad to False

In [0]:
def calculate(t):
    """Return ``t`` multiplied by 2.

    Undecorated counterpart of ``calculate_with_no_grad``: the
    multiplication runs under whatever grad mode is currently active.
    """
    doubled = t * 2
    return doubled

With the @torch.no_grad() decorator, history will not be tracked inside the function even if the input's requires_grad property is set to True

In [0]:
@torch.no_grad()
def calculate_with_no_grad(t):
    """Return ``t`` multiplied by 2 with autograd tracking disabled.

    The ``torch.no_grad()`` decorator runs the whole body in no-grad mode,
    so the result carries no ``grad_fn`` even when ``t`` requires grad.
    """
    doubled = t * 2
    return doubled

In [0]:
result_tensor = calculate(tensor1)

result_tensor

tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.]], grad_fn=<MulBackward0>)

In [0]:
result_tensor.requires_grad

True

In [0]:
result_tensor_no_grad = calculate_with_no_grad(tensor1)

result_tensor_no_grad

tensor([[ 2.,  4.,  6.],
        [ 8., 10., 12.]])

Turning gradient tracking back on within a torch.no_grad() block using torch.enable_grad()

In [0]:
# The outer context disables autograd tracking; the nested enable_grad()
# context switches it back on for the statements inside it.
with torch.no_grad():
    new_tensor_no_grad = tensor1 * 3
    print('new_tensor_no_grad = ', new_tensor_no_grad)

    with torch.enable_grad():
        new_tensor_grad = tensor1 * 3
        print('new_tensor_grad = ', new_tensor_grad)

new_tensor_no_grad =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]])
new_tensor_grad =  tensor([[ 3.,  6.,  9.],
        [12., 15., 18.]], grad_fn=<MulBackward0>)


Setting requires_grad to True when instantiating a tensor

In [0]:
tensor_one = torch.tensor([[1.0, 2.0],
                          [3.0, 4.0]], requires_grad=True)

tensor_one

tensor([[1., 2.],
        [3., 4.]], requires_grad=True)

By default requires_grad is set to False

In [0]:
tensor_two = torch.Tensor([[5, 6],
                          [7, 8]])

tensor_two

tensor([[5., 6.],
        [7., 8.]])

In [0]:
tensor_one.requires_grad

True

In [0]:
tensor_two.requires_grad_()

tensor([[5., 6.],
        [7., 8.]], requires_grad=True)

Executing the forward pass 

In [0]:
final_tensor = (tensor_one + tensor_two).mean()

final_tensor

tensor(9., grad_fn=<MeanBackward0>)

In [0]:
final_tensor.requires_grad

True

In [0]:
print(tensor_one.grad)

None


In [0]:
print(tensor_two.grad)

None


When we call ".backward()", gradients are calculated

In [0]:
final_tensor.backward()

In [0]:
print(tensor_one.grad)

tensor([[0.2500, 0.2500],
        [0.2500, 0.2500]])


In [0]:
print(tensor_two.grad)

tensor([[0.2500, 0.2500],
        [0.2500, 0.2500]])


If you want to create a copy of a tensor which is detached from the computation graph, then call .detach().

Detached tensor will always have requires_grad set to False

In [0]:
detached_tensor = tensor_one.detach()

detached_tensor

tensor([[1., 2.],
        [3., 4.]])

Remember tensor_one is part of the computation graph and has requires_grad set to True

In [0]:
tensor_one

tensor([[1., 2.],
        [3., 4.]], requires_grad=True)

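After we do a forward and backward pass, we will see that detached_tensor has no gradients calculated. Note that gradients accumulate across backward calls: tensor_one.grad already held 0.25 per element from the earlier final_tensor.backward(), so this pass adds another 0.25 per element, giving 0.5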

In [0]:
mean_tensor = (tensor_one + detached_tensor).mean()

mean_tensor.backward()

In [0]:
tensor_one.grad

tensor([[0.5000, 0.5000],
        [0.5000, 0.5000]])

In [0]:
print(detached_tensor.grad)

None


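This is old stuff but good to know: the Variable wrapper is deprecated (since PyTorch 0.4 Variables and Tensors have been merged, and Variable(...) simply returns a tensor)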

In [0]:
import torch
from torch.autograd import Variable

In [0]:
var = Variable(torch.FloatTensor([9]))
var

tensor([9.])

In [0]:
var.requires_grad

False

In [0]:
var.requires_grad_()

tensor([9.], requires_grad=True)

In [0]:
w1 = Variable(torch.FloatTensor([3]), requires_grad = True)
w2 = Variable(torch.FloatTensor([7]), requires_grad = True)

In [0]:
w1

tensor([3.], requires_grad=True)

In [0]:
w2

tensor([7.], requires_grad=True)

In [0]:
result_var = var * w1

result_var

tensor([27.], grad_fn=<MulBackward0>)

In [0]:
result_var.requires_grad

True

In [0]:
result_var.backward()

In [0]:
w1.grad

tensor([9.])

In [0]:
var.grad

tensor([3.])