# Example 25.3

Using Analytic Gradient for Backpropagation

$$
Z_{n,m} = f_1( X_{n,d}.W_{1\;d,m}) \\
O_{n,p} = f_2(Z_{n,m}.W_{2\;m,p}) \\ 
$$

$$
\mathcal{E} = \frac{1}{n}\sum^{n}_{i=1}\frac{1}{2} \left( \mathbf Y_{i} - \mathbf O_{i}\right)^2 
$$

$$
\mathcal{E} = \frac{1}{n}\sum^{n}_{i=1}\sum^{p}_{j=1}\left( -Y_{i,j}\log O_{i,j}\right)
$$

$$
\left(\frac{\partial\mathcal{E}}{\partial net_2}\right)_{n,p}=\left(\frac{\partial\mathcal E}{\partial f_2}\right)_{n,p}\odot\left(\frac{\partial f_2}{\partial net_2}\right)_{n,p}
$$

$$
\left(\frac{\partial \mathcal{E}}{\partial net_1}\right)_{n,m}=\left(\left(\frac{\partial\mathcal{E}}{\partial net_2}\right)_{n,p}\cdot
\left(W_2^T\right)_{p,m}\right)\odot\left(\frac{\partial f_1}{\partial net_1}\right)_{n,m}
$$


$$
\left(\nabla_{W_2}\mathcal{E}\right)_{m,p} = \left(Z^T\right)_{m,n} \cdot \left(\frac{\partial\mathcal{E}}{\partial net_2}\right)_{n,p}
$$

$$
\left(\nabla_{W_1}\mathcal{E}\right)_{d,m} = \left(X^T\right)_{d,n} \cdot \left(\frac{\partial\mathcal{E}}{\partial net_1}\right)_{n,m}
$$

$$
\left(\frac{\partial\mathcal{E}}{\partial net_2}\right)_{n,p}=\frac{1}{n}( O - Y )_{n,p}
$$

In [3]:
Backpropagation import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

def f1(x): return np.tanh(x)
def df1(x): return 1 - np.power(np.tanh(x),2)
def f2(x): return x
def df2(x): return 1

def aug(X):
    n = len(X)
    X1 = np.ones((n,1))
    return np.hstack((X1,X))

def predict(X,W1,W2):
    Z = f1( aug(X) @ W1)
    O = f2( aug(Z) @ W2)
    return O

def update_pars(Y,X,W1,W2):
    Xt = aug(X)
    Z = f1( aug(X) @ W1)
    Zt = aug(Z)
    dZ = df1(Xt @ W1)
    O = f2(Zt @ W2)
    G2 = (O - Y)/len(Y)
    dW2 = Zt.T @ G2
    W2 -= eta * dW2
    G1 = (G2 @ W2[1:].T)*dZ
    dW1 = Xt.T @ G1
    W1 -= eta * dW1


SyntaxError: invalid syntax (1441051073.py, line 1)

In [None]:
n,d,m,p = 25,1,10,1

X = np.random.uniform(-10,10,n).reshape((n,d))
Y = np.sin(X) # n,d

np.random.seed(11)
W1 = np.random.randn(d+1,m)
W2 = np.random.randn(m+1,p)

eta = 1e-1  # step size (learning rate)
num_steps = int(1e5)

for i in tqdm(range(num_steps)):
    # idx = np.random.choice(np.arange(n),5)
    # update_pars(Y[idx],X[idx],W1,W2)
    update_pars(Y,X,W1,W2)

t = np.arange(-10,10,0.1)
yt = np.sin(t)
yp = predict(t[:,None],W1,W2)

plt.plot(t,yp,'b')
plt.plot(t,yt,':r')
plt.scatter(X,Y,ec='red',fc='none')
plt.show()


-------------------------------------
# Example 25.3

Using Numerical Gradient for Backpropagation

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

def f1(x): return np.tanh(x)
# def df1(x): return 1 - np.power(np.tanh(x),2)
def f2(x): return x
# def df2(x): return 1

def aug(X):
    n = len(X)
    X1 = np.ones((n,1))
    return np.hstack((X1,X))

def predict(X,W1,W2):
    Z = f1( aug(X) @ W1)
    O = f2( aug(Z) @ W2)
    return O

def loss(Y,X,W1,W2):
    Ypr = predict(X,W1,W2)
    return np.mean(np.power(Ypr-Y,2))

def update(x,i,j):
    h=1e-5
    x0 = x[i,j].copy()
    x[i,j] = x0 + h
    f2 = loss(Y,X,W1,W2)
    x[i,j] = x0 - h
    f1 = loss(Y,X,W1,W2)
    g = (f2-f1)/(2*h)
    x0 = x0 - eta*g
    x[i,j] = x0


In [None]:
n,d,m,p = 25,1,10,1

np.random.seed(111)

X = np.random.uniform(-np.pi,np.pi,n).reshape((n,d))
Y = np.sin(X) # n,d

W1 = np.random.randn(d+1,m)
W2 = np.random.randn(m+1,p)

num_steps = int(1e4)
eta = 1e-2

print(f"loss = {loss(Y,X,W1,W2).round(3)}")


In [None]:
for i in tqdm(range(num_steps)):

    a,b = W1.shape
    for i in range(a):
        for j in range(b):
            update(W1,i,j)
    a,b = W2.shape
    for i in range(a):
        for j in range(b):
            update(W2,i,j)

t = np.arange(-2*np.pi,2*np.pi,0.1)
yt = np.sin(t)
yp = predict(t[:,None],W1,W2)

print(f"training loss = {loss(Y,X,W1,W2).round(3)}")
print(f"testing loss = {loss(yt[:,None],t[:,None],W1,W2).round(3)}")

plt.plot(t,yp,'b')
plt.plot(t,yt,':r')
plt.scatter(X,Y,ec='red',fc='none')
plt.show()


$$
Z_{n,m} = f_1( X_{n,d}.W_{1\;d,m}) \\
O_{n,p} = f_2(Z_{n,m}.W_{2\;m,p}) \\ 
$$

$$c = \sqrt{a^2 + b^2}$$