# Backpropagation Implementation from Scratch
## Neural Network for XOR Problem

This notebook implements a simple 2-layer neural network with backpropagation to solve the XOR problem.

In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## 1. Neural Network Class

Implements a 2-layer neural network with sigmoid activation function.

In [21]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights with small random values
        self.W1 = np.random.randn(input_size, hidden_size) * 0.5
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.5
        self.b2 = np.zeros((1, output_size))
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def sigmoid_derivative(self, a):
        return a * (1 - a)
    
    def forward(self, X):
        # Hidden layer
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.sigmoid(self.z1)
        # Output layer
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

## 2. MSE Loss Class

Mean Squared Error loss with backpropagation method to compute gradients.

In [22]:
class MSELoss:
    def __init__(self):
        self.gradients = {}
    
    def __call__(self, y_pred, y_true):
        return 0.5 * np.mean((y_true - y_pred) ** 2)
    
    def backpropagate(self, network, X, y_true):
        m = X.shape[0]
        # Output layer gradient
        delta2 = -(y_true - network.a2) * network.sigmoid_derivative(network.a2)
        # Gradients for W2 and b2
        self.gradients['dW2'] = (network.a1.T @ delta2) / m
        self.gradients['db2'] = np.sum(delta2, axis=0, keepdims=True) / m
        # Hidden layer gradient
        delta1 = (delta2 @ network.W2.T) * network.sigmoid_derivative(network.a1)
        # Gradients for W1 and b1
        self.gradients['dW1'] = (X.T @ delta1) / m
        self.gradients['db1'] = np.sum(delta1, axis=0, keepdims=True) / m

## 3. SGD Optimizer Class

Stochastic Gradient Descent optimizer that updates network weights.

In [23]:
class SGDOptimizer: # this is fake SGD (not stochastic for this simplistic data)
    def __init__(self, learning_rate=0.5):
        self.lr = learning_rate
    
    def step(self, network, gradients):
        # Update weights and biases
        network.W2 -= self.lr * gradients['dW2']
        network.b2 -= self.lr * gradients['db2']
        network.W1 -= self.lr * gradients['dW1']
        network.b1 -= self.lr * gradients['db1']

## 4. Prepare XOR Dataset

The XOR problem is a classic test for neural networks because it's not linearly separable.

In [24]:
# XOR dataset
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

print("XOR Truth Table:")
print("-" * 30)
for i in range(len(X)):
    print(f"Input: {X[i]} --> Expected Output: {y[i][0]}")

XOR Truth Table:
------------------------------
Input: [0 0] --> Expected Output: 0
Input: [0 1] --> Expected Output: 1
Input: [1 0] --> Expected Output: 1
Input: [1 1] --> Expected Output: 0


## 5. Initialize Network, Loss, and Optimizer

In [28]:
# Initialize network, loss, and optimizer
network = NeuralNetwork(input_size=2, hidden_size=3, output_size=1)
criterion = MSELoss()
optimizer = SGDOptimizer(learning_rate=0.5)

print("Network initialized with:")
print(f"  Input size: 2")
print(f"  Hidden size: 3")
print(f"  Output size: 1")
print(f"  Learning rate: 0.5")

Network initialized with:
  Input size: 2
  Hidden size: 3
  Output size: 1
  Learning rate: 0.5


## 6. Training Loop

Train the network for 10,000 epochs.

In [29]:
print("Training Neural Network on XOR Problem")
print("=" * 50)

# Training loop
for epoch in range(10000):
    # Forward pass
    predictions = network.forward(X)
    loss = criterion(predictions, y)
    
    # Backward pass. This part is only computing the gradients
    criterion.backpropagate(network, X, y)
    
    # Update weights
    optimizer.step(network, criterion.gradients)
    
    # Print progress
    if (epoch + 1) % 2000 == 0:
        print(f"Epoch {epoch + 1:5d}, Loss: {loss:.6f}")

print("\n" + "-" * 50)
print("Training Complete!")
print("=" * 50)

Training Neural Network on XOR Problem
Epoch  2000, Loss: 0.124454
Epoch  4000, Loss: 0.067873
Epoch  6000, Loss: 0.006640
Epoch  8000, Loss: 0.002464
Epoch 10000, Loss: 0.001420

--------------------------------------------------
Training Complete!


## 7. Evaluate Final Results

In [30]:
# Final predictions
print("\nFinal Results:")
print("-" * 50)
for i in range(len(X)):
    print(f"Input: {X[i]} --> Output: {predictions[i][0]:.4f} (Expected: {y[i][0]})")

print("\n" + "=" * 50)
print("Interpretation:")
print("-" * 50)
print("Values close to 0.0 represent False/0")
print("Values close to 1.0 represent True/1")
print("\nThe network has successfully learned the XOR function!")


Final Results:
--------------------------------------------------
Input: [0 0] --> Output: 0.0315 (Expected: 0)
Input: [0 1] --> Output: 0.9529 (Expected: 1)
Input: [1 0] --> Output: 0.9374 (Expected: 1)
Input: [1 1] --> Output: 0.0650 (Expected: 0)

Interpretation:
--------------------------------------------------
Values close to 0.0 represent False/0
Values close to 1.0 represent True/1

The network has successfully learned the XOR function!


## 8. Get the Network Weights (Optional)

In [10]:
print("\nLearned Weights:")
print("=" * 50)
print("\nW1 (Input --> Hidden):")
print(network.W1)
print("\nb1 (Hidden bias):")
print(network.b1)
print("\nW2 (Hidden --> Output):")
print(network.W2)
print("\nb2 (Output bias):")
print(network.b2)


Learned Weights:

W1 (Input --> Hidden):
[[ 6.2385695  -0.69638346  4.28320814]
 [ 6.22016816 -0.53386542  4.28597905]]

b1 (Hidden bias):
[[-2.68924575 -0.01274379 -6.5648958 ]]

W2 (Hidden --> Output):
[[ 8.84436629]
 [ 0.15700147]
 [-9.40872799]]

b2 (Output bias):
[[-4.15376572]]


## Regression Example with Parametric Rectified Linear Unit Activations

In [33]:
def train_test_split(x,y,test_size=0.2):
    ind = np.random.permutation(range(len(x)))
    bins = np.array_split(ind,1//test_size)
    testidx = bins[0]
    trainidx = np.delete(range(len(x)),testidx)
    return x[trainidx], x[testidx], y[trainidx], y[testidx]

In [34]:
class PReLU:
    def __init__(self, input_size):
        # Initialize alpha parameters (one per neuron)
        self.alpha = np.ones((1, input_size)) * 0.25
        
    def forward(self, z):
        self.z = z
        return np.where(z > 0, z, self.alpha * z)
    
    def backward(self, grad_output):
        # Gradient w.r.t input
        grad_z = np.where(self.z > 0, 1, self.alpha) * grad_output
        
        # Gradient w.r.t alpha (for updating learnable parameter)
        grad_alpha = np.where(self.z > 0, 0, self.z) * grad_output
        grad_alpha = np.sum(grad_alpha, axis=0, keepdims=True)
        
        return grad_z, grad_alpha


class NeuralNetworkPReLU:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights with He initialization (good for ReLU-like activations)
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))
        
        # PReLU activation for hidden layer
        # note that this is an object based on the PReLU
        self.prelu = PReLU(hidden_size)
    
    def forward(self, X):
        # Hidden layer with PReLU
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.prelu.forward(self.z1)
        
        # Output layer (linear activation for regression)
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.z2  # No activation function for regression output
        
        return self.a2
    
    def predict(self, X):
        return self.forward(X)


class MSELoss:
    def __init__(self):
        self.gradients = {}
    
    def __call__(self, y_pred, y_true):
        return np.mean((y_pred - y_true) ** 2)
    
    def backpropagate(self, network, X, y_true):
        m = X.shape[0]
        
        # Output layer gradient (derivative of MSE + linear activation)
        delta2 = (network.a2 - y_true) / m
        
        # Gradients for output layer
        self.gradients['dW2'] = network.a1.T @ delta2
        self.gradients['db2'] = np.sum(delta2, axis=0, keepdims=True)
        
        # Backpropagate through PReLU
        delta1_pre = delta2 @ network.W2.T
        delta1, grad_alpha = network.prelu.backward(delta1_pre)
        
        # Gradients for hidden layer
        self.gradients['dW1'] = X.T @ delta1
        self.gradients['db1'] = np.sum(delta1, axis=0, keepdims=True)
        
        # Gradient for PReLU alpha parameter
        self.gradients['d_alpha'] = grad_alpha


class SGDOptimizer: # (Again this is not stochastic)
    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate
    
    def step(self, network, gradients):
        network.W2 -= self.lr * gradients['dW2']
        network.b2 -= self.lr * gradients['db2']
        network.W1 -= self.lr * gradients['dW1']
        network.b1 -= self.lr * gradients['db1']
        
        # Update PReLU alpha parameter
        network.prelu.alpha -= self.lr * gradients['d_alpha']


def load_concrete_data():

    url = "https://github.com/dvasiliu/AML/blob/main/Data%20Sets/concrete.csv?raw=true"
    df = pd.read_csv(url)
    
    # Extract features and target
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1:].values

    return X, y


def train_network(network, criterion, optimizer, X_train, y_train, 
                  X_test, y_test, epochs=6000, print_every=500):
    train_losses = []
    test_losses = []
    
    for epoch in range(epochs):
        # Forward pass
        predictions = network.forward(X_train)
        train_loss = criterion(predictions, y_train)
        
        # Backward pass
        criterion.backpropagate(network, X_train, y_train)
        
        # Update parameters
        optimizer.step(network, criterion.gradients)
        
        # Test loss
        test_predictions = network.predict(X_test)
        test_loss = criterion(test_predictions, y_test)
        
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        
        # Print progress
        if (epoch + 1) % print_every == 0:
            print(f"Epoch {epoch + 1:5d} | Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f}")
    
    return train_losses, test_losses


def calculate_metrics(y_true, y_pred):
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true - y_pred))
    
    # R-squared
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }



print("=" * 70)
print("Neural Network for Concrete Compressive Strength Prediction")
print("Using PReLU Activation Function")
print("=" * 70)
print()

# Load data
print("Loading Concrete Compressive Strength Dataset...")
X, y = load_concrete_data()
print(f"Dataset shape: X={X.shape}, y={y.shape}")
print(f"Target statistics: mean={y.mean():.2f}, std={y.std():.2f} MPa")
print()

# Split data:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

print(f"Train set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print()

# Normalize features only (NOT target values)
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

print("Data normalized using StandardScaler (features only)")
print()

# Initialize network
input_size = X_train.shape[1]  # 8 features
hidden_size = 20  # Number of neurons in hidden layer
output_size = 1   # Single output for regression

network = NeuralNetworkPReLU(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size
)

print(f"Network Architecture:")
print(f"  Input layer:  {input_size} neurons")
print(f"  Hidden layer: {hidden_size} neurons (PReLU activation)")
print(f"  Output layer: {output_size} neuron (Linear activation)")
print(f"  Total parameters: {input_size * hidden_size + hidden_size + hidden_size * output_size + output_size + hidden_size}")
print()

# Initialize loss and optimizer
criterion = MSELoss()
optimizer = SGDOptimizer(learning_rate=0.01)

# Train the network
print("Training Started...")
print("-" * 70)

train_losses, test_losses = train_network(
    network, criterion, optimizer,
    X_train, y_train, X_test, y_test,
    epochs=5000,
    print_every=500
)

print("-" * 70)
print("Training Complete!")
print()

# Evaluate on test set
print("=" * 70)
print("Model Evaluation on Test Set")
print("=" * 70)
print()

test_predictions = network.predict(X_test)

metrics = calculate_metrics(y_test, test_predictions)

print(f"Mean Squared Error (MSE):  {metrics['MSE']:.4f}")
print(f"Root Mean Squared Error (RMSE): {metrics['RMSE']:.4f} MPa")
print(f"Mean Absolute Error (MAE): {metrics['MAE']:.4f} MPa")
print(f"R2 Score: {metrics['R2']:.4f}")
print()

# Show learned PReLU alpha values
print("=" * 70)
print("Learned PReLU Alpha Parameters")
print("=" * 70)
print()
print("Alpha values (one per hidden neuron):")
print(network.prelu.alpha)
print(f"\nMean alpha: {network.prelu.alpha.mean():.4f}")
print(f"Std alpha:  {network.prelu.alpha.std():.4f}")
print()

# Show sample predictions
print("=" * 70)
print("Sample Predictions")
print("=" * 70)
print()
n_samples = min(10, len(X_test))
print(f"{'Predicted':<12} {'Actual':<12} {'Error':<12}")
print("-" * 36)
for i in range(n_samples):
    pred = test_predictions[i, 0]
    actual = y_test[i, 0]
    error = pred - actual
    print(f"{pred:<12.2f} {actual:<12.2f} {error:<12.2f}")

print()
print("=" * 70)
print("Training Summary")
print("=" * 70)
print()
print(f"Final Training Loss:   {train_losses[-1]:.6f}")
print(f"Final Test Loss: {test_losses[-1]:.6f}")
print(f"Test RMSE: {metrics['RMSE']:.2f} MPa")

Neural Network for Concrete Compressive Strength Prediction
Using PReLU Activation Function

Loading Concrete Compressive Strength Dataset...
Dataset shape: X=(1030, 8), y=(1030, 1)
Target statistics: mean=35.82, std=16.70 MPa

Train set: 772 samples
Test set: 258 samples

Data normalized using StandardScaler (features only)

Network Architecture:
  Input layer:  8 neurons
  Hidden layer: 20 neurons (PReLU activation)
  Output layer: 1 neuron (Linear activation)
  Total parameters: 221

Training Started...
----------------------------------------------------------------------
Epoch   500 | Train Loss: 39.3940 | Test Loss: 58.9851
Epoch  1000 | Train Loss: 29.7349 | Test Loss: 49.6617
Epoch  1500 | Train Loss: 23.6967 | Test Loss: 43.3676
Epoch  2000 | Train Loss: 21.9393 | Test Loss: 42.0992
Epoch  2500 | Train Loss: 21.1856 | Test Loss: 41.9581
Epoch  3000 | Train Loss: 20.6560 | Test Loss: 41.8888
Epoch  3500 | Train Loss: 20.5149 | Test Loss: 42.1711
Epoch  4000 | Train Loss: 20.883