In [40]:
import torch, torchvision

In [41]:
## Autograd
 # torch.autograd is an automatic differentiation tool for use in NN training, backprop with gradient descent, etc.

In [42]:
## Demonstration of a single training step
# Load a ResNet-18 with pretrained weights, plus one random "image" and a random target to train against.
model = torchvision.models.resnet18(pretrained=True)
data = torch.rand(1, 3, 64, 64)   # batch of 1 image, 3 channels, 64x64 pixels
labels = torch.rand(1, 1000)      # random initial labels

# Forward pass: run the input through the model to get its prediction.
pred = model(data)

# Backward pass
# Reducing the error to a single value gives autograd something to differentiate,
# e.g. tensor(-520.9473, grad_fn=<SumBackward0>).
loss = torch.sum(pred - labels)
# backward() makes autograd compute the gradient for each model parameter and store it
# in that parameter's .grad attribute.
loss.backward()

# Optimization
# Build an optimizer over the model's parameters: Stochastic Gradient Descent with
# learning rate = 0.01 and momentum = 0.9.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
)
# step() adjusts each parameter using the gradient that .backward() stored in its .grad.
optimizer.step()

In [53]:
## How Autograd works under the hood
# requires_grad=True tells autograd to track every operation performed on the tensor.
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

## Imagine a and b are neural network parameters, and define the error as Q = 3a^3 - b^2
# So, the gradient with respect to a: dQ/da = 9a^2
# and the gradient with respect to b: dQ/db = -2b
Q = 3*a**3 - b**2

## We can call .backward() on Q, and the above gradients will be calculated.
# Q is a vector rather than a scalar, so .backward() needs an explicit `gradient` argument with
# the same shape as Q. Passing torch.ones(Q.shape) is equivalent to calling Q.sum().backward().
Q.backward(gradient=torch.ones(Q.shape))

# Let's check if our math is right.
# torch.allclose compares whole tensors with a floating-point tolerance, which is more robust than
# feeding an element-wise `==` through the builtin all(). .detach() keeps the expected values out
# of the autograd graph.
assert torch.allclose(a.grad, 9 * a.detach()**2), "dQ/da should equal 9a^2"
assert torch.allclose(b.grad, -2 * b.detach()), "dQ/db should equal -2b"

In [54]:
## Vector Calculus using Autograd
# let y> = f(x>) where y>, x> indicate that y and x are vectors

# The gradient of y> with respect to x> is a Jacobian matrix J. Assuming y> is an m-dimensional vector and x> is
# n-dimensional, row i of J holds the partials of yi: (dyi/dx1, dyi/dx2 ... dyi/dxn). It follows that
#   J = ( dy1/dx1 ... dy1/dxn
#            .     .     .
#         dym/dx1 ... dym/dxn )
#
# "Generally speaking", torch.autograd is a TI-84 that can compute a vector-Jacobian product, i.e. given a vector v>, it computes
# J.T * v>
#
# If v> is the gradient of a scalar function l = g(y>), i.e. v> = (dl/dy1 ... dl/dym).T,
# THEN (chain rule) the vector-Jacobian product J.T * v> IS THE GRADIENT OF l WITH RESPECT TO x>
# 
# J.T * v> = [[dy1/dx1 ... dym/dx1] ... [dy1/dxn ... dym/dxn]] * [dl/dy1 ... dl/dym].T == [dl/dx1 ... dl/dxn].T
# ***v> is what torch.ones(Q.shape) is above***
#
# !! Okay so v> = [1,1]. So [1,1] is the gradient of some scalar function l = g(y>). I think Q> == y> in this case. We define
# !! g as some function on Q> that produces a scalar. If g = sum(), then l = sum(Q>) = Q1 + Q2 (Q> has two dimensions),
# !! and taking the gradient of l with respect to Q> gives v> = [dl/dQ1, dl/dQ2] == [1,1].
# !! J.T * v> then turns that into the gradients of l with respect to the inputs (a and b), which is what ends up in .grad.
# !!
# TODO: !DF! re-review this

In [55]:
# TODO: !DF! Come back and read https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#computational-graph