Autograd: the PyTorch module that performs automatic differentiation for tensor computations. It enables the gradient computation used by optimization algorithms such as gradient descent.

In [1]:
def dy_dx(x):
  """Return 2*x, the hand-written derivative of y = x**2 evaluated at x."""
  slope_factor = 2
  return slope_factor * x

In [2]:
dy_dx(3)  # dy_dx returns 2*x, so this evaluates to 6

6

In [3]:
import torch

x = torch.tensor(4.0, requires_grad=True) # requires_grad=True tells PyTorch that we want to compute the gradient w.r.t. this tensor
x

# internally PyTorch builds a computation graph that it uses to calculate gradients

tensor(4., requires_grad=True)

In [4]:
# y = x^2; the grad_fn shown in the output records the operation that produced y
y = x**2
y

tensor(16., grad_fn=<PowBackward0>)

In [5]:
# usage: output_tensor.backward() computes the gradient of the output w.r.t. the tracked inputs

y.backward()

In [6]:
x.grad  # the stored gradient dy/dx = 2*x, evaluated at x = 4

tensor(8.)

In [7]:
# chain of two operations: c = sin(a^2)
a = torch.tensor(16.0, requires_grad=True)  # requires_grad needs a floating-point tensor
b = a **2
c = torch.sin(b)

In [8]:
# show the leaf tensor and both intermediate results, one per line
print(a, b, c, sep="\n")

tensor(16., requires_grad=True)
tensor(256., grad_fn=<PowBackward0>)
tensor(-0.9992, grad_fn=<SinBackward0>)


In [9]:
c.backward()  # chain rule through sin(a^2): dc/da = cos(a^2) * 2a

In [10]:
a.grad  # dc/da evaluated at a = 16

tensor(-1.2733)

In [11]:
# gradients are only stored (in .grad) for leaf nodes of the computation graph (a DAG), not for intermediate nodes

## Training the simplest neural network and calculating its gradients

In [12]:
# inputs (plain tensors without requires_grad: the gradients are derived by hand further down)

x = torch.tensor(6.7)  # input feature
y = torch.tensor(0.0)  # true label

w = torch.tensor(1.0) # weight
b = torch.tensor(0.0)   # bias

In [13]:
# Binary Cross Entropy Loss for Scalar

def binary_cross_entropy_loss(y_pred, target, epsilon=1e-8):
  """Binary cross-entropy loss, computed element-wise.

  Args:
    y_pred: tensor of predicted probabilities; clamped away from 0 and 1
      before the logarithms are taken.
    target: tensor (or scalar) holding the true label(s).
    epsilon: smallest value y_pred is clamped to, preventing log(0).

  Returns:
    -(target * log(p) + (1 - target) * log(1 - p)), where p is the clamped y_pred.
  """
  dtype = y_pred.dtype if y_pred.is_floating_point() else torch.get_default_dtype()
  limits = torch.finfo(dtype)
  # The clamp bounds are applied in the tensor's own dtype. In float32,
  # 1 - 1e-8 rounds to exactly 1.0, so a saturated prediction would still
  # reach log(1 - y_pred) = log(0) and yield inf (or nan via 0 * -inf).
  # Keep the upper bound at least one machine epsilon below 1, and keep the
  # lower bound from rounding to 0 in low-precision dtypes.
  lower = max(epsilon, limits.tiny)
  upper = 1.0 - max(epsilon, limits.eps)
  y_pred = torch.clamp(y_pred, lower, upper)
  return -(target * torch.log(y_pred) + (1-target) * torch.log(1-y_pred))

In [14]:
# Forward Pass

z = w * x + b   # weighted sum (linear part)
y_pred = torch.sigmoid(z)  # sigmoid activation turns z into a predicted probability

loss = binary_cross_entropy_loss(y_pred, y)  # compare the prediction with the true label

In [15]:
loss  # binary cross-entropy of the single prediction

tensor(6.7012)

In [16]:
# Manual derivatives via the chain rule: dL/dw = dL/dy_pred * dy_pred/dz * dz/dw
# 1. dL/d(y_pred): loss w.r.t. the prediction (derivative of binary cross-entropy)
dL_dP = (y_pred - y)/ (y_pred *(1- y_pred))

# 2. dy_pred/dz: prediction w.r.t. the pre-activation z (derivative of the sigmoid)
dy_dz = y_pred * (1- y_pred)

# 3. dz/dw and dz/db: z w.r.t. the weight and the bias
dz_dW = x
dz_db = 1            # the bias is added directly to z, so dz/db = 1

# multiply the three factors together
dL_dW = dL_dP * dy_dz * dz_dW
dL_db = dL_dP * dy_dz * dz_db

In [17]:
# report both hand-derived gradients, one per line
print(
  f"manual Gradient of loss w.r.t weights: {dL_dW}",
  f"manual Gradient of loss w.r.t bias: {dL_db}",
  sep="\n",
)

manual Gradient of loss w.r.t weights: 6.691762447357178
manual Gradient of loss w.r.t bias: 0.998770534992218


In [18]:
# same input and label as in the manual derivation; this time autograd computes the gradients
x = torch.tensor(6.7)
y = torch.tensor(0.0)

In [19]:
# requires_grad=True on the parameters so that backward() fills in w.grad and b.grad
w = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

In [20]:
# linear part; z gets a grad_fn because w and b require gradients
z = w*x + b
z

tensor(6.7000, grad_fn=<AddBackward0>)

In [21]:
# sigmoid activation: predicted probability, still attached to the computation graph
y_pred = torch.sigmoid(z)
y_pred

tensor(0.9988, grad_fn=<SigmoidBackward0>)

In [22]:
# same loss function as in the manual derivation, now applied to tracked tensors
loss = binary_cross_entropy_loss(y_pred, y)
loss

tensor(6.7012, grad_fn=<NegBackward0>)

In [23]:
loss.backward()  # backpropagate: computes dL/dw and dL/db

In [24]:
# gradients computed by autograd for the weight and the bias, one per line
print(w.grad, b.grad, sep="\n")

tensor(6.6918)
tensor(0.9988)


In [25]:
# for vector inputs

x = torch.tensor([3.8, 6.7, 8.9],requires_grad=True)  # a 3-element tensor; x.grad will have the same shape
x

tensor([3.8000, 6.7000, 8.9000], requires_grad=True)

In [39]:
# reduce to a scalar with mean(); gradient: dy/dx_i = (3*x_i**2 + 6*x_i - 4) / 3 for the three elements of x
# NOTE(review): the saved x.grad output below equals 3*x_i**2 + 6*x_i - 4 (no division by 3) — presumably from
# accumulated backward() calls or an earlier .sum() version; re-run on a fresh kernel to confirm
y = (x ** 3 + 3 * (x**2) - 4 * x + 5 ).mean()

In [40]:
y.backward()  # y is a scalar (a mean), so no explicit gradient argument is needed

In [41]:
x.grad  # one partial derivative per element of x

tensor([ 62.1200, 170.8700, 287.0300])

In [24]:
# clearing grad

# when the backward pass is run multiple times, gradients start accumulating
# e.g. if the first pass gives 3, the second makes it 6: earlier gradients are added to the current one
# so clear them after each pass using .grad.zero_()

In [44]:
# demonstration of gradient accumulation: .grad is never cleared inside this loop
x = torch.tensor(2.0,requires_grad=True)
for i in range(400):
  y = x**2
  y.backward()  # adds dy/dx = 2*x = 4 to x.grad instead of overwriting it
  print(x.grad)  # grows by 4 on every iteration
  # x.grad.zero_() is deliberately left out here so the accumulation stays visible

tensor(4.)
tensor(8.)
tensor(12.)
tensor(16.)
tensor(20.)
tensor(24.)
tensor(28.)
tensor(32.)
tensor(36.)
tensor(40.)
tensor(44.)
tensor(48.)
tensor(52.)
tensor(56.)
tensor(60.)
tensor(64.)
tensor(68.)
tensor(72.)
tensor(76.)
tensor(80.)
tensor(84.)
tensor(88.)
tensor(92.)
tensor(96.)
tensor(100.)
tensor(104.)
tensor(108.)
tensor(112.)
tensor(116.)
tensor(120.)
tensor(124.)
tensor(128.)
tensor(132.)
tensor(136.)
tensor(140.)
tensor(144.)
tensor(148.)
tensor(152.)
tensor(156.)
tensor(160.)
tensor(164.)
tensor(168.)
tensor(172.)
tensor(176.)
tensor(180.)
tensor(184.)
tensor(188.)
tensor(192.)
tensor(196.)
tensor(200.)
tensor(204.)
tensor(208.)
tensor(212.)
tensor(216.)
tensor(220.)
tensor(224.)
tensor(228.)
tensor(232.)
tensor(236.)
tensor(240.)
tensor(244.)
tensor(248.)
tensor(252.)
tensor(256.)
tensor(260.)
tensor(264.)
tensor(268.)
tensor(272.)
tensor(276.)
tensor(280.)
tensor(284.)
tensor(288.)
tensor(292.)
tensor(296.)
tensor(300.)
tensor(304.)
tensor(308.)
tensor(312.)
tensor(316.)

In [45]:
# same loop, but the gradient is cleared after every backward pass
x = torch.tensor(2.0,requires_grad=True)
for i in range(400):
  y = x**2
  y.backward()
  print(x.grad)  # stays at 4 because the previous gradient was cleared
  x.grad.zero_()  # reset the stored gradient in place before the next backward pass

tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)
tensor(4.)

### How to disable gradient tracking

In [24]:
# needed when gradient tracking is not required
# gradient calculation is needed for the forward and backward passes while training a neural network
# when we are only making predictions it is not needed, and it occupies memory
# so we turn it off when only the forward pass is run and no backpropagation follows

In [None]:
# 1. requires_grad_(False)
# 2. detach()
# 3. torch.no_grad()

In [None]:
# option 1: switch gradient tracking off in place on an existing tensor
x.grad  # not the last expression of the cell, so this value is not displayed
x.requires_grad_(False)


In [49]:
# option 2: detach()
x = torch.tensor(2.0,requires_grad=True)
z = x.detach()  # z is detached from the computation graph, so gradient tracking is off for it
y = x**2   # built from x: tracked
y1 = z**2  # built from the detached z: not tracked
print(x)
print(z)

tensor(2., requires_grad=True)
tensor(2.)


In [50]:
y.backward()  # works: y was built from x, which requires gradients
print(x.grad)

tensor(4.)


In [51]:
# y1 was built from the detached tensor z, so it has no grad_fn and backward() raises.
# The error is the point of this cell: catch and print it so that
# "Restart Kernel -> Run All" can continue past this demonstration.
try:
  y1.backward()
except RuntimeError as err:
  print(f"{type(err).__name__}: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [52]:
# option 3: operations run inside torch.no_grad() are not recorded for gradient computation
with torch.no_grad():
  y = x**2

In [53]:
# y was computed inside torch.no_grad(), so it has no grad_fn and backward() raises.
# The error is the point of this cell: catch and print it so that
# "Restart Kernel -> Run All" is not aborted by the demonstration.
try:
  y.backward()
except RuntimeError as err:
  print(f"{type(err).__name__}: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn