In [12]:
import torch
from IPython.core.debugger import set_trace

### Tracing Basics 

In [28]:
def my_relu(x: torch.FloatTensor) -> torch.FloatTensor:
    """Scalar ReLU written with plain Python control flow.

    The comparison is evaluated eagerly on the tensor's value, so a debugger
    breakpoint (e.g. ``set_trace()``) can be placed here to step through the
    branch. ``x < 0`` is used as a Python bool, so ``x`` is expected to hold
    a single element.

    Args:
        x: Input tensor with one element.

    Returns:
        ``x * 0`` when ``x`` is negative (the result stays connected to ``x``
        in the autograd graph), otherwise ``x`` itself, unchanged.
    """
    is_negative = x < 0
    if not is_negative:
        return x
    print('if branch x:', x)
    return x * 0

# Start from a negative scalar so the `x < 0` branch of my_relu is taken.
x = torch.tensor(-2.0, requires_grad=True)

# Forward pass, then backpropagate from the scalar result to populate x.grad.
res = my_relu(x)
res.backward()

print('val: ', res.item())
print('grad: ', x.grad.item())

if branch x: tensor(-2., requires_grad=True)
val:  -0.0
grad:  0.0


### Tensors and GPU

In [31]:
# 3x3 matrix of uniform random values; no seed is set, so the values differ per run.
a = torch.rand((3, 3))
a.size()

torch.Size([3, 3])

In [33]:
# Element-wise sum of `a` and the matrix product of `a` with its transpose.
a + torch.matmul(a, a.t())

tensor([[1.3346, 1.8376, 0.9870],
        [1.7223, 2.4487, 1.2176],
        [1.5101, 0.8725, 1.4726]])

In [34]:
# Prefer the GPU, but fall back to the CPU so this cell still runs on machines
# without CUDA (a hard-coded 'cuda' device makes `a.to(...)` raise there).
# The name `cuda` is kept so that any later cell referring to it keeps working.
cuda = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the tensor to the selected device; `a` is rebound to the moved tensor.
a = a.to(cuda)
a + a @ a.t()  # Runs on the GPU when one is available

tensor([[1.3346, 1.8376, 0.9870],
        [1.7223, 2.4487, 1.2176],
        [1.5101, 0.8725, 1.4726]], device='cuda:0')

In [37]:
# Show `a` in CPU memory; the result is not assigned, so `a` itself stays where it is.
a.to('cpu')  # a.cuda() is the counterpart that moves a tensor to the GPU

tensor([[0.4766, 0.7130, 0.3498],
        [0.5977, 0.9711, 0.4211],
        [0.8729, 0.0760, 0.4772]])

## torch autograd

In [19]:
# To get the gradient of some function with respect to x, create x with requires_grad=True.
x = torch.rand(3, requires_grad=True)
print(x)

# Every operation on a tracked tensor records a grad_fn node in the computation graph.
y = x + 2
print(y)

# Stack more computation on top: element-wise z = 2 * y^2.
z = y * y * 2
print(z)

# Reduce to a scalar so that backward() can be called without arguments.
z = torch.mean(z)
print(z)

# Backpropagate to compute dz/dx; the result is stored in x.grad.
z.backward()
print(x.grad)

tensor([0.2507, 0.9057, 0.8503], requires_grad=True)
tensor([2.2507, 2.9057, 2.8503], grad_fn=<AddBackward0>)
tensor([10.1316, 16.8861, 16.2487], grad_fn=<MulBackward0>)
tensor(14.4221, grad_fn=<MeanBackward0>)
tensor([3.0010, 3.8743, 3.8004])


In [7]:
# To get the gradient of some function with respect to x, create x with requires_grad=True.
x = torch.rand(3, requires_grad=True)
print(x)

# Every operation on a tracked tensor records a grad_fn node in the computation graph.
y = x + 2
print(y)

# Stack more computation on top: element-wise z = 2 * y^2.
z = y * y * 2
print(z)

# z is left as a vector here (no scalar reduction such as z.mean()), so backward()
# needs a tensor of the same shape as z.
# Autograd does not materialise the Jacobian J = dz/dx; it computes the
# vector-Jacobian product v^T . J, i.e. each dz_i/dx is weighted by v_i.
v = torch.tensor([0.1, 1.0, 0.001], dtype=torch.float32)
z.backward(gradient=v)

# The weighted gradient with respect to x is stored in x.grad.
print(x.grad)

tensor([0.1876, 0.8791, 0.2681], requires_grad=True)
tensor([2.1876, 2.8791, 2.2681], grad_fn=<AddBackward0>)
tensor([ 9.5712, 16.5780, 10.2889], grad_fn=<MulBackward0>)
tensor([8.7504e-01, 1.1516e+01, 9.0725e-03])


### Make sure to clear gradient

In [11]:
# Dummy training loop illustrating how gradients behave across repeated backward() calls.
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = torch.sum(weights * 3)

    model_output.backward()

    # backward() adds into weights.grad rather than overwriting it, so stale
    # gradients from earlier iterations would be mixed into this one.
    print(weights.grad)

    # Reset the gradient before the next iteration. Comment this line out to
    # see the accumulation: the printed values then grow 3, 6, 9.
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
