In [3]:
import mxnet as mx
from mxnet import nd, autograd as ag
# Seed MXNet's random number generator so the random draw of `a` further down is repeatable.
mx.random.seed(1)

- Let’s say that we are interested in differentiating a function f = 2 * (x ** 2) with respect to parameter x.
- Once we compute the gradient of f with respect to x, we’ll need a place to store it. In MXNet, we can tell an NDArray that we plan to store a gradient by invoking its attach_grad() method.

In [5]:
# 2x2 input array; the gradients computed below are taken with respect to x.
x = nd.array([[1, 2], [3, 4]])

In [6]:
# Reserve the storage exposed as x.grad; backward() writes the gradient there.
x.attach_grad()

- Now we’re going to define the function f and MXNet will generate a computation graph on the fly.
- Note that building the computation graph requires a nontrivial amount of computation. So MXNet will only build the graph when explicitly told to do so. 
- We can instruct MXNet to start recording by placing code inside a `with autograd.record():` block (here `autograd` is imported as `ag`, so we write `with ag.record():`).

In [7]:
# Record the forward pass so MXNet builds the graph that z.backward() needs.
with ag.record():
    y = 2 * x
    z = x * y  # z = 2 * x**2, elementwise

In [8]:
# z is a 2x2 array, not a scalar; with no head gradient passed, every element's gradient is
# weighted by 1, so x.grad ends up holding dz/dx = 4 * x (see the output of the next cell).
z.backward()

In [9]:
# Elementwise derivative of z = 2 * x**2, i.e. 4 * x.
x.grad


[[ 4.  8.]
 [12. 16.]]
<NDArray 2x2 @cpu(0)>

In [42]:
# Head gradient: the weights applied to dz/dx, one per element of z.
head_gradient = nd.array([[10, 1.], [.1, .01]])

# Re-record the same computation, z = 2 * x**2.
with ag.record():
    y = 2 * x
    z = x * y

# x.grad becomes head_gradient * 4 * x (elementwise).
z.backward(head_gradient)
x.grad


[[40.    8.  ]
 [ 1.2   0.16]]
<NDArray 2x2 @cpu(0)>

In [49]:
# NOTE(review): IPython-only help lookup left over from exploration (nd.norm is used by f below);
# consider deleting this cell before sharing the notebook.
nd.norm?

In [18]:
# Draw three samples using nd.random_normal's default distribution parameters; `a` is the input to f below.
a = nd.random_normal(shape=(3,))
a


[-0.3469252  -0.22134334 -1.804719  ]
<NDArray 3 @cpu(0)>

In [19]:
# Reserve a.grad so the gradient of f(a) with respect to a can be stored.
a.attach_grad()

In [40]:
def f(a):
    """Scale ``a`` by a data-dependent constant using ordinary Python control flow.

    ``a`` is doubled until the norm of the result (``nd.norm``) is at least
    1000. If the elements of that result sum to a positive value it is
    returned as is; otherwise it is multiplied by 100 first. Every step
    multiplies by a constant, so the output is ``k * a`` for a scalar ``k``
    that depends on the values in ``a``.

    Parameters
    ----------
    a : NDArray
        Input array. Its norm must be non-zero.

    Returns
    -------
    NDArray
        ``a`` multiplied by the scalar selected by the loop and the branch.

    Raises
    ------
    ValueError
        If the norm of ``a`` is zero: doubling zeros never reaches the
        threshold, so the loop would not terminate.
    """
    b = a * 2

    # Guard against an endless loop: an all-zero array stays all-zero when doubled.
    if (nd.norm(b) == 0).asscalar():
        raise ValueError(
            "f(a) needs a non-zero input; doubling zeros never reaches the norm threshold"
        )

    # .asscalar() turns the one-element comparison result into a Python number
    # so that it can drive Python control flow.
    while (nd.norm(b) < 1000).asscalar():
        b = b * 2

    if (nd.sum(b) > 0).asscalar():
        c = b
    else:
        c = 100 * b
    return c

# Record the forward pass; the loop and branch inside f run as ordinary Python while recording.
with ag.record():
    c = f(a)

# Backpropagate through whichever path f actually took for this value of a.
c.backward()
a.grad


[102400. 102400. 102400.]
<NDArray 3 @cpu(0)>

In [41]:
# f only multiplies a by constants, so c = k * a and the gradient should equal k = c / a.
# NOTE(review): exact float equality holds in the output below but is fragile in general.
a.grad == c / a


[1. 1. 1.]
<NDArray 3 @cpu(0)>

In [56]:
# Per-element weights applied to dz/dx.
head_gradient = nd.array([[0.01, 1.0], [0.34, 0.2]])

# Same computation as before: z = 2 * x**2.
with ag.record():
    y = 2 * x
    z = y * x

# Stores head_gradient * 4 * x (elementwise) in x.grad.
z.backward(head_gradient)

In [55]:
# Show the input next to its gradient from the weighted backward pass.
x, x.grad

(
 [[1. 2.]
  [3. 4.]]
 <NDArray 2x2 @cpu(0)>, 
 [[0.04 8.  ]
  [4.08 3.2 ]]
 <NDArray 2x2 @cpu(0)>)