# Backpropagation

> A feedforward neural network with two hidden layers and a single linear output unit, built for regression: it is trained with Mean Squared Error (MSE) loss and uses ReLU activations in the hidden layers.

## Architecture


- ReLU activation on both hidden layers (the output layer is linear)
- 2 hidden layers, each with 2 neurons
- 3 input features
- 1 output unit
- Mean Squared Error (MSE) loss
- Backpropagation implemented manually

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
# Network hyperparameters and initial parameter values.
# Seeding NumPy's global generator makes the random initial weights reproducible.
np.random.seed(0)

# Layer widths: 3 inputs -> 2 hidden units -> 2 hidden units -> 1 output
input_size, hidden1_size, hidden2_size, output_size = 3, 2, 2, 1
lr = 1e-2  # learning rate

# Weight matrices, sampled from a standard normal distribution.
# They are drawn in the order W1, W2, W3 so the seeded values stay the same.
W1 = np.random.randn(input_size, hidden1_size)
W2 = np.random.randn(hidden1_size, hidden2_size)
W3 = np.random.randn(hidden2_size, output_size)

# Biases start at zero and are stored as row vectors (one entry per unit).
b1 = np.zeros(shape=(1, hidden1_size))
b2 = np.zeros(shape=(1, hidden2_size))
b3 = np.zeros(shape=(1, output_size))

In [12]:
# ReLU and its derivative
def relu(x):
    """Apply the rectified linear unit element-wise, i.e. max(0, x)."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return the element-wise derivative of ReLU evaluated at ``x``.

    The result is 1.0 where ``x > 0`` and 0.0 everywhere else (including at
    exactly 0), as floats.

    ``np.greater`` is used rather than the ``>`` operator so that plain Python
    numbers and lists are accepted as well as arrays, matching what ``relu``
    accepts. With ``>``, a Python scalar produced a ``bool`` (which has no
    ``astype``) and a list raised ``TypeError``.
    """
    return np.greater(x, 0).astype(float)

# Mean Squared Error and its derivative
def mse(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions."""
    residual = y_true - y_pred
    squared_error = residual ** 2
    return np.mean(squared_error)

def mse_derivative(y_true, y_pred):
    """Return the gradient of the mean squared error with respect to ``y_pred``.

    Each element is ``2 * (y_pred - y_true) / N``, where ``N`` is the number of
    elements in the difference, i.e. the number of terms that
    ``np.mean((y_true - y_pred) ** 2)`` averages over.

    ``N`` is taken from the difference rather than from ``y_true`` so that the
    scale stays correct when the two inputs broadcast against each other (for
    example a single target compared with several predictions). For inputs of
    identical shape the result is the same as dividing by ``y_true.size``.
    Using ``np.subtract`` / ``np.size`` also lets plain Python numbers and
    lists be passed.
    """
    diff = np.subtract(y_pred, y_true)
    return 2 * diff / np.size(diff)

In [15]:
# Training loop
def train(X, y, epochs=1000):
    """Train the network with full-batch gradient descent.

    Each epoch runs a forward pass (two ReLU hidden layers followed by a linear
    output layer), backpropagates the MSE loss by hand, and takes one
    gradient-descent step of size ``lr``. The module-level parameters
    ``W1, b1, W2, b2, W3, b3`` are updated in place. The loss is printed every
    100 epochs. Nothing is returned.

    Args:
        X: Input matrix with one sample per row; it is used as ``X @ W1``, so
            it needs as many columns as ``W1`` has rows.
        y: Target values. A 1-D ``y`` is reshaped to a single column so that it
            lines up row-by-row with the predictions.
        epochs: Number of gradient-descent iterations to run.
    """
    global W1, b1, W2, b2, W3, b3

    # A 1-D target vector would broadcast against the 2-D predictions
    # (e.g. shape (n,) against (n, 1) gives (n, n)), which produces a wrong
    # loss and makes the in-place weight update fail. Treat it as one column.
    y = np.asarray(y)
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    for epoch in range(epochs):
        # Forward pass
        z1 = X @ W1 + b1
        a1 = relu(z1)

        z2 = a1 @ W2 + b2
        a2 = relu(z2)

        z3 = a2 @ W3 + b3
        y_pred = z3  # linear output layer

        # Backward pass: start from the gradient of the loss w.r.t. the output
        dL_dy = mse_derivative(y, y_pred)

        # Layer 3 (linear, so dL/dz3 is dL/dy)
        dL_dW3 = a2.T @ dL_dy
        dL_db3 = np.sum(dL_dy, axis=0, keepdims=True)

        # Layer 2: propagate through W3, then through the ReLU at z2
        dL_da2 = dL_dy @ W3.T
        dL_dz2 = dL_da2 * relu_derivative(z2)
        dL_dW2 = a1.T @ dL_dz2
        dL_db2 = np.sum(dL_dz2, axis=0, keepdims=True)

        # Layer 1: propagate through W2, then through the ReLU at z1
        dL_da1 = dL_dz2 @ W2.T
        dL_dz1 = dL_da1 * relu_derivative(z1)
        dL_dW1 = X.T @ dL_dz1
        dL_db1 = np.sum(dL_dz1, axis=0, keepdims=True)

        # Update weights and biases. All gradients above were computed from
        # the pre-update parameters, so the update order does not matter.
        W3 -= lr * dL_dW3
        b3 -= lr * dL_db3
        W2 -= lr * dL_dW2
        b2 -= lr * dL_db2
        W1 -= lr * dL_dW1
        b1 -= lr * dL_db1

        # Print loss occasionally. The loss is only needed for this message,
        # so it is computed only on the epochs that report it; y_pred was
        # produced before the update, so the value is the pre-update loss.
        if epoch % 100 == 0:
            loss = mse(y, y_pred)
            print(f"Epoch {epoch}, Loss: {loss:.4f}")



In [16]:
# Example usage: fit the network to random data
X = np.random.random_sample((10, 3))  # 10 samples, 3 features, uniform in [0, 1)
y = np.random.random_sample((10, 1))  # one target value per sample

train(X, y, epochs=1000)

Epoch 0, Loss: 0.0672
Epoch 100, Loss: 0.0543
Epoch 200, Loss: 0.0537
Epoch 300, Loss: 0.0536
Epoch 400, Loss: 0.0535
Epoch 500, Loss: 0.0535
Epoch 600, Loss: 0.0534
Epoch 700, Loss: 0.0534
Epoch 800, Loss: 0.0534
Epoch 900, Loss: 0.0534


## References

- [Gradient Descent](https://en.wikipedia.org/wiki/Gradient_descent)