## Reverse Mode Auto-Diff Development

In [None]:
from typing import Callable

import numpy as np

from autodiff.mlp import MLP
from autodiff.value import Value

### Manual Backpropagation on an Expression

In [None]:
# Leaf nodes of the expression L = (a * b + c) * f
a, b = Value(2.0, label='a'), Value(-3.0, label='b')
c, f = Value(10.0, label='c'), Value(-2.0, label='f')

# Forward pass; every intermediate node is labelled as soon as it is created
e = a * b
e.label = 'e'
d = e + c
d.label = 'd'
L = d * f
L.label = 'L'

# Backward pass: apply the chain rule by hand, from the output to the leaves
L.dv = 1  # Seed: dL/dL = 1

# L = d * f, so dL/dd = f and dL/df = d
d.dv, f.dv = f.v, d.v

# d = e + c, so dd/dc = dd/de = 1 and both operands inherit dL/dd = f
c.dv = e.dv = d.dv

# e = a * b, so dL/da = dL/de * b = f * b and dL/db = dL/de * a = f * a
a.dv, b.dv = e.dv * b.v, e.dv * a.v

L.draw_graph()

### Manual Backpropagation on a Neuron

In [None]:
# A single tanh neuron with two inputs: o = tanh(x1 * w1 + x2 * w2 + b)
x1, x2 = Value(2.0, label='x1'), Value(0.0, label='x2')  # Inputs
w1, w2 = Value(-3.0, label='w1'), Value(1.0, label='w2')  # Weights
b = Value(6.8813735870195432, label='b')  # Bias

# Forward pass; every intermediate node is labelled as soon as it is created
x1w1 = x1 * w1
x1w1.label = 'x1 * w1'
x2w2 = x2 * w2
x2w2.label = 'x2 * w2'
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1 * w1 + x2 * w2'
z = x1w1x2w2 + b
z.label = 'z'
o = z.tanh()
o.label = 'o'

# Backward pass: apply the chain rule by hand, from the output to the leaves
o.dv = 1  # Seed: do/do = 1

# o = tanh(z), so do/dz = 1 - tanh(z)**2 = 1 - o**2
z.dv = 1 - o.v**2

# z = (x1 * w1 + x2 * w2) + b: addition has local derivative 1, so both
# operands inherit do/dz unchanged
x1w1x2w2.dv = b.dv = z.dv

# x1w1x2w2 = x1w1 + x2w2: addition again passes the gradient straight through
x1w1.dv = x2w2.dv = x1w1x2w2.dv

# x1w1 = x1 * w1, so do/dx1 = do/dx1w1 * w1 and do/dw1 = do/dx1w1 * x1
x1.dv, w1.dv = x1w1.dv * w1.v, x1w1.dv * x1.v

# x2w2 = x2 * w2, so do/dx2 = do/dx2w2 * w2 and do/dw2 = do/dx2w2 * x2
x2.dv, w2.dv = x2w2.dv * w2.v, x2w2.dv * x2.v

o.draw_graph()

In [None]:
print('Before: ', o.v)

# Single optimisation step: nudge the trainable parameters a small distance
# along their gradients, which increases o (gradient ascent).
w1.v += 0.1 * w1.dv
w2.v += 0.1 * w2.dv
b.v += 0.1 * b.dv

# Forward pass with the updated parameters. The two products are reused for
# the sum instead of being recomputed, so no duplicate nodes are created.
x1w1 = x1 * w1
x2w2 = x2 * w2
x1w1x2w2 = x1w1 + x2w2
n = x1w1x2w2 + b  # Pre-activation of the neuron
o = n.tanh()

print('After: ', o.v)

### Testing the Backpropagation

In [None]:
def finite_difference_gradient(
    f: Callable[["Value"], "Value"],
    x: "Value",
    eps: float = 1e-8,
) -> "Value":
    """Estimate df/dx at ``x`` with the central difference formula.

    Computes ``(f(x + eps) - f(x - eps)) / (2 * eps)``. The function only
    relies on arithmetic operators, so it works for any ``x`` that supports
    adding and subtracting a float and any ``f`` whose result supports
    subtraction and division by a float.

    Args:
        f: Function of a single variable to differentiate.
        x: Point at which the derivative is estimated.
        eps: Half-width of the difference stencil. Smaller values reduce the
            truncation error but amplify floating-point cancellation.

    Returns:
        The estimated derivative, of the same type as the output of ``f``.
    """
    return (f(x + eps) - f(x - eps)) / (2 * eps)

def f(x):
    """Test function built from power, sine, negation, addition and exp."""
    return (-(x**2).sin() + 5).exp()


# Evaluate the test function at x = 2 and backpropagate from its output
x = Value(2)
y = f(x)
y.backward()
y.draw_graph()

In [None]:
# The gradient stored on x by backward() must agree with the numerical
# central-difference estimate of df/dx.
assert np.isclose(
    finite_difference_gradient(f, x).v,
    x.dv,
    atol=1e-4,
)

### Automatic Backpropagation on a Neuron

In [None]:
# A single tanh neuron with two inputs: o = tanh(x1 * w1 + x2 * w2 + b)
x1, x2 = Value(2.0, label='x1'), Value(0.0, label='x2')  # Inputs
w1, w2 = Value(-3.0, label='w1'), Value(1.0, label='w2')  # Weights
b = Value(6.8813735870195432, label='b')  # Bias

# Forward pass; every intermediate node is labelled as soon as it is created
x1w1 = x1 * w1
x1w1.label = 'x1 * w1'
x2w2 = x2 * w2
x2w2.label = 'x2 * w2'
x1w1x2w2 = x1w1 + x2w2
x1w1x2w2.label = 'x1 * w1 + x2 * w2'
z = x1w1x2w2 + b
z.label = 'z'
o = z.tanh()
o.label = 'o'

# Backward pass: a single call replaces the hand-written chain rule
o.backward()
o.draw_graph()

### Backpropagation on an MLP

In [None]:
x = [2.0, 3.0, -1.0]

# Size the network's input from the sample so it cannot drift out of sync with
# the data: the sample has three features, while the input size was previously
# hard-coded to 2.
# NOTE(review): assumes the first MLP argument is the number of inputs and the
# list gives the layer widths; confirm against autodiff.mlp.
n = MLP(len(x), [4, 4, 1])
print(len(n.parameters()))
n(x).draw_graph()

In [None]:
# Toy training set: four samples with three features each
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
# Target label for each sample, in the same order as xs
ys = [1.0, -1.0, -1.0, 1.0]

def loss(ys, ypreds):
    """Return half the sum of squared errors between targets and predictions.

    This is not a mean: the total is not divided by the number of samples.
    The factor 0.5 makes the derivative with respect to each prediction
    simply ``ypred - y``.

    Args:
        ys: Iterable of target values.
        ypreds: Iterable of predictions, paired with ``ys`` by position.

    Returns:
        ``0.5 * sum((y - ypred)**2)`` over the paired elements.
    """
    return 0.5 * sum((y - ypred) ** 2 for y, ypred in zip(ys, ypreds))

max_iters = 10
lr = 0.01  # Learning rate

# Training loop: plain gradient descent on the loss
for k in range(max_iters):
    # Forward pass over the whole data set
    ypreds = [n(sample) for sample in xs]
    l = loss(ys, ypreds)

    # Backward pass. Gradients are reset first; presumably zero_grad() clears
    # the parameter gradients left over from the previous iteration.
    n.zero_grad()
    l.backward()

    # Update weights by stepping against the gradient
    for param in n.parameters():
        param.v -= lr * param.dv

    # The reported loss is the one computed before this iteration's update
    print(f'Iteration {k}: loss = {l.v}')

# Predictions. Re-run the forward pass so they reflect the final weights
# rather than the weights from before the last update.
ypreds = [n(sample) for sample in xs]
print('Predictions:', ypreds)