# Recitation 4: Backpropagation
_Date_: 09/25/2025

## Implement feedforward and backpropagation using `numpy` from scratch

In [1]:
import numpy as np

In [2]:
# =======================
# DO NOT MODIFY THIS CELL
# =======================
def sigmoid(x):
    return (1 / (1 + np.exp(-x)))


def sigmoid_deriv(z):
    return z * (1-z)


def softmax(X, theta = 1.0, axis = None):
    """
    Compute the softmax of each element along an axis of X.

    Parameters
    ----------
    X: ND-Array. Probably should be floats.
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the
        first non-singleton axis.

    Returns an array the same size as X. The result will sum to 1
    along the specified axis.
    """

    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis
    if axis is None:
        axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1)

    # multiply y against the theta parameter,
    y = y * float(theta)

    # subtract the max for numerical stability
    y = y - np.expand_dims(np.max(y, axis = axis), axis)

    # exponentiate y
    y = np.exp(y)

    # take the sum along the specified axis
    ax_sum = np.expand_dims(np.sum(y, axis = axis), axis)

    # finally: divide elementwise
    p = y / ax_sum

    # flatten if X was 1D
    if len(X.shape) == 1: p = p.flatten()

    return p

In [3]:
# Given both input matrix and truth matrix
x = np.array([
    [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 1, 1, 0, 0, 0, 0, 0]
])

y = np.array([
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 1, 0, 0]
])

In [4]:
# Dimensions for input and label matrices
n, d = x.shape
_, c = y.shape

# Hidden dimensions
h1, h2, h3 = 8, 6, 4

### Exercise: Fill in the dimensions for weight matrices

In [5]:
# Initializing 3 weight matrices in the neural net
w1 = np.ones((h1, d))
w1[1, range(6)] = 0.3
w1[3, range(6)] = 0.5

# Second weight matrix
w2 = np.ones((h2, h1))
w2[2,range(5)] = 0.2
w2[0, range(5)] = 0.6

# Third weight matrix
w3 = np.ones((h3, h2))
w3[1, range(4)] = 0.7

### Exercise: Implement feedforward pass
Given the input matrix `X`, implement
* 2 hidden score matrices `z1`, `z2`
* Logits `o` and
* a predicted score matrix `y_hat` for each class

using weight matrices and sigmoid function as the activation function. 

In [6]:
z1 = sigmoid(x @ w1.T)
z2 = sigmoid(z1 @ w2.T)
o = z2 @ w3.T
y_hat = softmax(o, axis=1)

print(f"After softmax:\n{'-' * 20}\n{y_hat}")

After softmax:
--------------------
[[0.30258923 0.09223232 0.30258923 0.30258923]
 [0.30268056 0.09195832 0.30268056 0.30268056]
 [0.30258923 0.09223232 0.30258923 0.30258923]]


### Exercise: Implement backpropagation pass
You need to implement three components in the backpropagation pass (in sequence):
* Error signal $D^{(i)}$
* Cache matrix that combines current error signal and associated weight matrix $$C = D \cdot \Theta$$
* Gradient matrix w.s.t specific weight using cache matrix and hidden matrix $$\nabla \Theta = D^T \cdot Z = (F'(Z) \odot C)^T \cdot Z$$

In [7]:
# Last layer: loss -> W3
D_o = y_hat - y
cache_z2 = D_o @ w3
grad_w3 = D_o.T @ z2

# Second layer
D_2 = sigmoid_deriv(z2) * cache_z2
cache_z1 = D_2 @ w2
grad_w2 = D_2.T @ z1

# First layer
D_1 = sigmoid_deriv(z1) * cache_z1
grad_w1 = D_1.T @ x

### Exercise: Implement weight update
Assume using Gradient Descent, and the learning rate is fixed to $0.1$, update the weight for next iteration. 

In [8]:
lr = 0.1
new_w1 = w1 - lr*grad_w1
new_w2 = w2 - lr*grad_w2
new_w3 = w3 - lr*grad_w3

## Implement using `torch`

In [9]:
import torch
import torch.nn as nn

In [10]:
x_ = torch.tensor(x, dtype=torch.float)
y_ = torch.tensor(y, dtype=torch.float)
w1_ = torch.tensor(w1, dtype=torch.float, requires_grad=True)
w2_ = torch.tensor(w2, dtype=torch.float, requires_grad=True)
w3_ = torch.tensor(w3, dtype=torch.float, requires_grad=True)
F = nn.Sigmoid()
softmax_fn = nn.Softmax(dim=1)

In [11]:
z1_ = F(x_ @ w1_.T)
z2_ = F(z1_ @ w2_.T)
o_ = z2_ @ w3_.T
y_hat_ = softmax_fn(o_)

In [12]:
y_hat_

tensor([[0.3026, 0.0922, 0.3026, 0.3026],
        [0.3027, 0.0920, 0.3027, 0.3027],
        [0.3026, 0.0922, 0.3026, 0.3026]], grad_fn=<SoftmaxBackward0>)

In [13]:
ce_loss = nn.functional.cross_entropy(o_, y_)
ce_loss

tensor(1.9884, grad_fn=<DivBackward1>)

In [14]:
ce_loss.backward()

In [15]:
w1_.grad

tensor([[2.2689e-05, 2.2689e-05, 3.4619e-05, 1.1742e-04, 1.1742e-04, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.3148e-04, 1.3148e-04, 1.5748e-04, 2.5587e-04, 2.5587e-04, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.2689e-05, 2.2689e-05, 3.4619e-05, 1.1742e-04, 1.1742e-04, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [9.1951e-05, 9.1951e-05, 1.1429e-04, 2.1989e-04, 2.1989e-04, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.2689e-05, 2.2689e-05, 3.4619e-05, 1.1742e-04, 1.1742e-04, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [7.8520e-05, 7.8520e-05, 1.1608e-04, 3.6965e-04, 3.6965e-04, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [7.8520e-05, 7.8520e-05, 1.1608e-04, 3.6965e-04, 3.6965e-04, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [7.8520e-05, 7.8520

In [16]:
grad_w1

array([[6.80683602e-05, 6.80683602e-05, 1.03859519e-04, 3.52263257e-04,
        3.52263257e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [3.94447798e-04, 3.94447798e-04, 4.72437831e-04, 7.67592433e-04,
        7.67592433e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [6.80683602e-05, 6.80683602e-05, 1.03859519e-04, 3.52263257e-04,
        3.52263257e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [2.75859494e-04, 2.75859494e-04, 3.42882340e-04, 6.59651347e-04,
        6.59651347e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [6.80683602e-05, 6.80683602e-05, 1.03859519e-04, 3.52263257e-04,
        3.52263257e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [2.35562921e-04, 2.35562921e-04, 3.48236358e-04, 1.10895296e-03,
   