# Recitation 4: Backpropagation
_Date_: 09/25/2025

## Implement feedforward and backpropagation using `numpy` from scratch

In [None]:
import numpy as np

In [None]:
# =======================
# DO NOT MODIFY THIS CELL
# =======================
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), evaluated with NumPy.

    Works elementwise on arrays (the result has the same shape as ``x``)
    and also accepts plain Python scalars.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def sigmoid_deriv(z):
    """Return ``z * (1 - z)``, elementwise.

    This is the derivative of the sigmoid expressed in terms of the sigmoid's
    *output*: pass in ``z = sigmoid(x)``, not the raw pre-activation ``x``.
    """
    one_minus_z = 1 - z
    return z * one_minus_z


def softmax(X, theta = 1.0, axis = None):
    """
    Compute the softmax of each element along an axis of X.

    Parameters
    ----------
    X: ND-Array (or anything `np.atleast_2d` accepts, e.g. a list).
        Probably should be floats.
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the
        first non-singleton axis of the (at least 2-D) input; if every
        axis has length 1, axis 0 is used.

    Returns an array the same size as X (inputs with fewer than two
    dimensions are promoted to 2-D, and 1-D inputs are flattened back).
    The result will sum to 1 along the specified axis.
    """

    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis: first one with more than one entry, falling back to 0 so
    # that single-element inputs do not raise StopIteration
    if axis is None:
        axis = next((i for i, size in enumerate(y.shape) if size > 1), 0)

    # multiply y against the theta parameter
    y = y * float(theta)

    # subtract the max for numerical stability (softmax is shift-invariant)
    y = y - np.max(y, axis = axis, keepdims = True)

    # exponentiate y
    y = np.exp(y)

    # normalise by the sum along the specified axis
    p = y / np.sum(y, axis = axis, keepdims = True)

    # flatten if X was 1D; np.ndim also works when X is a plain list
    if np.ndim(X) == 1:
        p = p.flatten()

    return p

In [None]:
# Given both input matrix and truth matrix
# x: 3 rows by 10 columns, every entry is 0 or 1 (one row per sample)
x = np.array([
    [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 1, 1, 0, 0, 0, 0, 0]
])

# y: 3 rows by 4 columns; each row is one-hot (exactly one 1).
# The last two rows share the same label column.
y = np.array([
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 1, 0, 0]
])

In [None]:
# Read the sizes off the data: x has shape (n, d); y contributes its
# column count c (its row count is discarded into `_`)
n, d = x.shape
_, c = y.shape

# Sizes of the hidden layers
# NOTE(review): h3 is not referenced anywhere else in the visible notebook,
# which describes only two hidden matrices (z1, z2) — confirm it is needed
h1 = 8
h2 = 6
h3 = 4

### Exercise: Fill in the dimensions for weight matrices

In [None]:
# Initializing 3 weight matrices in the neural net
# Each `?` below is an exercise placeholder: replace it with the right
# dimension (n, d, c, h1, h2, ...) before running this cell.
# All entries start at 1; a few rows are then partially overwritten.

# First weight matrix
# (the indexing below needs at least 4 rows and 6 columns)
w1 = np.ones((?, ?))
w1[1, range(6)] = 0.3  # row 1, columns 0-5
w1[3, range(6)] = 0.5  # row 3, columns 0-5

# Second weight matrix
# (the indexing below needs at least 3 rows and 5 columns)
w2 = np.ones((?, ?))
w2[2,range(5)] = 0.2  # row 2, columns 0-4
w2[0, range(5)] = 0.6  # row 0, columns 0-4

# Third weight matrix
# (the indexing below needs at least 2 rows and 4 columns)
w3 = np.ones((?, ?))
w3[1, range(4)] = 0.7  # row 1, columns 0-3

### Exercise: Implement feedforward pass
Given the input matrix `x`, implement
* 2 hidden score matrices `z1`, `z2`,
* logits `o`, and
* a predicted score matrix `y_hat` with one score per class,

using the weight matrices, with the sigmoid function as the activation function and softmax to turn the logits `o` into `y_hat`.

In [None]:
# Exercise placeholders: replace each `...` with the forward-pass expression.
z1 = ...     # first hidden score matrix
z2 = ...     # second hidden score matrix
o = ...      # logits
y_hat = ...  # predicted score matrix, one score per class

# Until the placeholders are filled in, this prints the Ellipsis object.
print(f"After softmax:\n{'-' * 20}\n{y_hat}")

### Exercise: Implement backpropagation pass
You need to implement three components in the backpropagation pass (in sequence):
* Error signal $D^{(i)}$
* Cache matrix that combines current error signal and associated weight matrix $$C = D \cdot \Theta$$
* Gradient matrix w.r.t. a specific weight matrix, computed from the cache matrix and the hidden matrix $$\nabla \Theta = D^T \cdot Z = (F'(Z) \odot C)^T \cdot Z$$

In [None]:
# Exercise placeholders: replace each `...` following the three steps listed
# in the text above (error signal D, cache C = D . Theta, gradient D^T . Z).
# The layers are processed from the output back towards the input.

# Last layer: loss -> W3
D_o = ...       # error signal at the output
cache_z2 = ...  # cache matrix; presumably D_o combined with w3 — confirm
grad_w3 = ...   # gradient w.r.t. w3

# Second layer
D_2 = ...       # error signal for this layer
cache_z1 = ...  # cache matrix; presumably D_2 combined with w2 — confirm
grad_w2 = ...   # gradient w.r.t. w2

# First layer
# No cache matrix here: there is no earlier weight matrix to pass the
# error signal back to.
D_1 = ...       # error signal for this layer
grad_w1 = ...   # gradient w.r.t. w1

### Exercise: Implement weight update
Assuming gradient descent with a fixed learning rate of $0.1$, update the weights for the next iteration.

In [None]:
# Fixed learning rate for the gradient-descent step
lr = 0.1
# Exercise placeholders: replace each `...` with the updated weight matrix,
# i.e. the current weight moved against its gradient, scaled by lr.
new_w1 = ...
new_w2 = ...
new_w3 = ...

## Implement using `torch`

In [None]:
import torch
import torch.nn as nn

In [None]:
# Exercise placeholders for the torch version of the setup.
# The trailing underscore marks the torch counterpart of a NumPy name above.
x_ = ...   # presumably x as a torch tensor — confirm the expected dtype
y_ = ...   # presumably y as a torch tensor — confirm the expected dtype
# A later cell reads `w1_.grad`, so the weights must be tensors that track
# gradients (e.g. created with requires_grad=True).
w1_ = ...
w2_ = ...
w3_ = ...
F = ...           # activation function; presumably the torch sigmoid — confirm
softmax_fn = ...  # softmax callable that produces y_hat_ in the next cell

In [None]:
# Exercise placeholders: the torch forward pass, mirroring z1, z2, o and
# y_hat from the NumPy section.
z1_ = ...     # first hidden score matrix
z2_ = ...     # second hidden score matrix
o_ = ...      # logits
y_hat_ = ...  # predicted scores, for comparison with y_hat

In [None]:
# Display the torch predictions (compare with the NumPy y_hat printed earlier)
y_hat_

In [None]:
# Exercise placeholder: the loss to differentiate (presumably cross-entropy,
# going by the name — confirm). The next cell calls `ce_loss.backward()` with
# no arguments, so this must be a scalar tensor.
ce_loss = ...
ce_loss

In [None]:
# Run autograd: fills `.grad` on the tensors that track gradients
ce_loss.backward()

In [None]:
# Gradient of ce_loss w.r.t. w1_, as computed by autograd
w1_.grad

In [None]:
# Hand-derived NumPy gradient for w1, shown for comparison with w1_.grad
grad_w1