In [1]:
import numpy as np
%load_ext nb_js_diagrammers

In [2]:
%%mermaid_magic -h 850
graph RL
    %% Backward-pass view of a network with 2 inputs, 4 hidden units and
    %% 2 outputs. The chart is laid out right-to-left and every solid arrow
    %% starts at the Loss side and points back towards the inputs.

    %% Output Layer
    subgraph OL[Output Layer]
        direction TB
        L[Loss]
        O2[y₂]
        O1[y₁]
    end

    %% Hidden Layer
    subgraph HL[Hidden Layer]
        direction TB
        H4[h₄]
        H3[h₃]
        H2[h₂]
        H1[h₁]
    end

    %% Input Layer
    subgraph IL[Input Layer]
        direction TB
        I1[x₁]
        I2[x₂]
    end

    %% Backward flow arrows with gradient notations
    %% Loss feeds both output nodes
    L --> O2
    L --> O1
    
    %% Output -> hidden edges: one edge per (output, hidden) pair, all
    %% labelled with the gradient of the second weight matrix
    O2 -->|∂L/∂W₂| H4
    O2 -->|∂L/∂W₂| H3
    O2 -->|∂L/∂W₂| H2
    O2 -->|∂L/∂W₂| H1
    
    O1 -->|∂L/∂W₂| H4
    O1 -->|∂L/∂W₂| H3
    O1 -->|∂L/∂W₂| H2
    O1 -->|∂L/∂W₂| H1
    
    %% Hidden -> input edges: one edge per (hidden, input) pair, all
    %% labelled with the gradient of the first weight matrix
    H4 -->|∂L/∂W₁| I1
    H4 -->|∂L/∂W₁| I2
    H3 -->|∂L/∂W₁| I1
    H3 -->|∂L/∂W₁| I2
    H2 -->|∂L/∂W₁| I1
    H2 -->|∂L/∂W₁| I2
    H1 -->|∂L/∂W₁| I1
    H1 -->|∂L/∂W₁| I2

    %% Add gradient flow annotations
    %% Free-standing note boxes, attached below with dotted arrows
    annBP1[Gradient Flow:<br>Output → Hidden<br>∂L/∂W₂]
    annBP2[Gradient Flow:<br>Hidden → Input<br>∂L/∂W₁]
    annLoss[Cross-Entropy Loss<br>∂L/∂y]

    annBP1 -.-> HL
    annBP2 -.-> IL
    annLoss -.-> L

    %% Styling
    %% One colour class per layer plus one for the loss node
    classDef inputClass fill:#f9f,stroke:#333,stroke-width:2px
    classDef hiddenClass fill:#bbf,stroke:#333,stroke-width:2px
    classDef outputClass fill:#bfb,stroke:#333,stroke-width:2px
    classDef lossClass fill:#fbb,stroke:#333,stroke-width:2px
    
    class I1,I2 inputClass
    class H1,H2,H3,H4 hiddenClass
    class O1,O2 outputClass
    class L lossClass

---

In [3]:
# ========================================================================
#             Forward Pass
# ========================================================================
# Set random seed for reproducibility
# NOTE: this seeds NumPy's global RNG. generate_sample_data,
# initialize_weights and generate_target_data below all draw from that
# global state, so their results also depend on the order they are called in.
np.random.seed(42)

def generate_sample_data(n_samples=6, n_features=2):
    """
    Generate random integer sample data.

    Values are drawn uniformly from the integers 1 to 10 (inclusive) using
    NumPy's global random state, so call np.random.seed(...) beforehand when
    reproducible output is needed.

    Args:
        n_samples: Number of samples (rows) to generate
        n_features: Number of features (columns) per sample. Defaults to 2,
            which matches the default input_size of initialize_weights.

    Returns:
        X: Integer input features array of shape (n_samples, n_features)
    """
    # randint's upper bound is exclusive, so 11 yields values up to 10
    X = np.random.randint(1, 11, size=(n_samples, n_features))
    return X

def initialize_weights(input_size=2, hidden_size=4, output_size=2):
    """
    Build the starting parameters of a one-hidden-layer network.

    Weight matrices are sampled from a standard normal distribution and
    scaled by 0.01; bias rows start at zero. W1 is sampled before W2, both
    from NumPy's global random state.

    Args:
        input_size: Number of input features
        hidden_size: Number of nodes in hidden layer
        output_size: Number of output nodes

    Returns:
        Dictionary with 'W1' of shape (input_size, hidden_size), 'b1' of
        shape (1, hidden_size), 'W2' of shape (hidden_size, output_size)
        and 'b2' of shape (1, output_size)
    """
    weight_scale = 0.01

    # Input -> hidden
    w_in_hidden = np.random.randn(input_size, hidden_size) * weight_scale
    bias_hidden = np.zeros((1, hidden_size))

    # Hidden -> output
    w_hidden_out = np.random.randn(hidden_size, output_size) * weight_scale
    bias_out = np.zeros((1, output_size))

    return {
        'W1': w_in_hidden,
        'b1': bias_hidden,
        'W2': w_hidden_out,
        'b2': bias_out,
    }

def relu(z):
    """
    Rectified linear unit, applied element-wise.

    Args:
        z: Pre-activation value(s)

    Returns:
        Element-wise maximum of 0 and z
    """
    rectified = np.maximum(0, z)
    return rectified

def softmax(z):
    """
    Row-wise softmax.

    Args:
        z: Scores of shape (batch_size, n_classes)

    Returns:
        Array of the same shape; each row holds probabilities that sum to 1
    """
    # Shifting every row by its own maximum does not change the result but
    # keeps np.exp from overflowing on large scores
    row_max = np.max(z, axis=1, keepdims=True)
    unnormalized = np.exp(z - row_max)
    row_total = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_total

def forward_propagation(X, params):
    """
    Run one forward pass: linear -> ReLU -> linear -> softmax.

    Args:
        X: Input features array
        params: Dictionary with the entries 'W1', 'b1', 'W2' and 'b2'

    Returns:
        Dictionary with the hidden pre-activation 'Z1', the hidden
        activation 'A1', the output pre-activation 'Z2' and the softmax
        output 'A2'
    """
    # Pull all four parameters out up front
    w_hidden, b_hidden, w_out, b_out = (
        params[key] for key in ('W1', 'b1', 'W2', 'b2')
    )

    # Hidden layer: affine map followed by ReLU
    hidden_pre = np.dot(X, w_hidden) + b_hidden
    hidden_act = relu(hidden_pre)

    # Output layer: affine map followed by softmax
    output_pre = np.dot(hidden_act, w_out) + b_out
    output_prob = softmax(output_pre)

    return {
        'Z1': hidden_pre,
        'A1': hidden_act,
        'Z2': output_pre,
        'A2': output_prob,
    }


def print_step_by_step(X, params, cache):
    """
    Print detailed step-by-step calculations for feed forward neural network
    
    Args:
        X: Input features
        params: Network parameters
        cache: Activation values from forward propagation
    """
    print("\nStep-by-Step Calculations:")
    print("-" * 50)
    
    # Input layer
    print("\nInput Layer:")
    print(f"Input features (X):\n\n{X}")
    
    # Hidden layer calculations
    print("\nHidden Layer Calculations:")
    print(f"Weights (W1):\n\n{params['W1']}")
    print(f"Biases (b1):\n\n{params['b1']}")
    
    print(f"Weights (W1) shape: {params['W1'].shape}")
    print(f"Biases (b1) shape: {params['b1'].shape}")
    print()
    
    print(f"Linear transformation (Z1 = X·W1 + b1):\n\n{cache['Z1']}")
    
    print()
    print(f"Activation (A1 = relu(Z1)):\n\n{cache['A1']}")
    print()
    
    # Output layer calculations
    print("\nOutput Layer Calculations:")
    print(f"Weights (W2):\n\n{params['W2']}")
    print(f"Biases (b2):\n\n{params['b2']}")
    print()
    
    print(f"Linear transformation (Z2 = A1·W2 + b2):\n\n{cache['Z2']}")
    
    print()
    print(f"Final Output (A2 = softmax(Z2)):\n\n{cache['A2']}")

def generate_target_data(n_samples=6, n_classes=2):
    """
    Draw random class labels together with their one-hot encoding.

    Labels come from NumPy's global random state.

    Args:
        n_samples: Number of samples
        n_classes: Number of classes (output nodes)

    Returns:
        y: One-hot encoded targets of shape (n_samples, n_classes)
        y_labels: Integer class labels of shape (n_samples,), each in the
            range 0 to n_classes-1
    """
    y_labels = np.random.randint(0, n_classes, size=n_samples)

    # Row k of the identity matrix is the one-hot vector for class k, so
    # indexing it with the labels yields one encoded row per sample
    y = np.eye(n_classes)[y_labels]
    return y, y_labels

def predict(X, params):
    """
    Classify samples with the given network parameters.

    Args:
        X: Input features
        params: Network parameters

    Returns:
        predictions: Predicted class labels (index of the largest
            probability in each row)
        probabilities: Class probabilities from the output layer
    """
    # The softmax output of a forward pass is the probability table
    probabilities = forward_propagation(X, params)['A2']

    # Most probable class per sample
    predictions = np.argmax(probabilities, axis=1)

    return predictions, probabilities


def evaluate_predictions(predictions, y_true, probabilities):
    """
    Print the predictions next to the true labels, plus the accuracy.

    Args:
        predictions: Predicted class labels
        y_true: True class labels
        probabilities: Predicted probabilities
    """
    # Header lines, then the raw probability table
    for line in ("\nPrediction Results:", "-" * 50, "\nPredicted Probabilities:"):
        print(line)
    print(probabilities)

    print("\nPredicted Classes:", predictions)
    print("True Classes:", y_true)

    # Accuracy is the fraction of positions where prediction and label agree
    is_correct = predictions == y_true
    accuracy = np.mean(is_correct)
    print(f"\nAccuracy: {accuracy:.2%}")

In [None]:
# Perform forward propagation
# Re-seed NumPy's global RNG so the data, weights and targets drawn in this
# cell do not depend on what ran before it
np.random.seed(1010)
X = generate_sample_data()
params = initialize_weights()
# NOTE(review): `cache` is not read again in this cell; predict() below runs
# its own forward pass on the same X and params
cache = forward_propagation(X, params)
# Generate target variables
# y is the one-hot matrix, y_true the matching integer class labels
y, y_true = generate_target_data()

# Make predictions
predictions, probabilities = predict(X, params)

# Evaluate predictions
# params are still the freshly initialised weights and the targets are random,
# so the accuracy printed here reflects an untrained network
evaluate_predictions(predictions, y_true, probabilities)

In [4]:
def compute_loss(y_pred, y_true):
    """
    Compute the mean cross-entropy loss and the matching gradient signal.

    Args:
        y_pred: Predicted probabilities from softmax, shape
            (batch_size, n_classes)
        y_true: True labels (one-hot encoded), same shape as y_pred

    Returns:
        loss: Cross-entropy loss averaged over the batch
        d_loss: y_pred - y_true. When y_pred is a softmax output this is
            the per-sample gradient of the cross-entropy loss with respect
            to the pre-softmax scores, not with respect to y_pred itself.
            It is not divided by the batch size.
    """
    y_pred = np.asarray(y_pred, dtype=float)

    # Clip only the copy fed to log(): this avoids log(0) = -inf without
    # perturbing the probabilities the gradient is computed from
    epsilon = 1e-15
    clipped = np.clip(y_pred, epsilon, 1 - epsilon)

    # Compute cross-entropy loss
    loss = -np.mean(np.sum(y_true * np.log(clipped), axis=1))

    # Combined softmax + cross-entropy gradient, taken from the unclipped
    # predictions so that a perfect prediction yields exactly zero
    d_loss = y_pred - y_true

    return loss, d_loss


def relu_derivative(Z):
    """
    Element-wise derivative of ReLU.

    Args:
        Z: Pre-activation values that were fed to ReLU

    Returns:
        Boolean mask that is True where Z is strictly positive and False
        elsewhere (including at exactly 0)
    """
    is_active = Z > 0
    return is_active


def backward_propagation(X, y, params, cache):
    """
    Perform backward propagation to compute gradients

    The network is linear -> ReLU -> linear -> softmax with a cross-entropy
    loss. Per-sample gradients are propagated backwards unscaled; the
    division by the batch size happens where each parameter gradient is
    formed.

    Args:
        X: Input features, one sample per row
        y: True labels (one-hot encoded)
        params: Network parameters; only 'W2' is read here
        cache: Cached values from forward propagation; 'Z1', 'A1' and 'A2'
            are read here

    Returns:
        gradients: Dictionary with 'dW1', 'db1', 'dW2' and 'db2', each the
            batch-averaged gradient of the loss for that parameter
    """
    # Batch size, used to average the parameter gradients
    m = X.shape[0]

    # Unpack only the values the gradients depend on
    W2 = params['W2']
    Z1, A1, A2 = cache['Z1'], cache['A1'], cache['A2']

    # Step 1: Output layer
    # compute_loss returns (predictions - targets). For softmax followed by
    # cross-entropy that difference already is dL/dZ2, so there is no
    # separate dL/dA2 step to chain through.
    _, dZ2 = compute_loss(A2, y)

    # Step 2: Hidden to Output layer gradients
    dW2 = (1/m) * np.dot(A1.T, dZ2)                   # dL/dW2
    db2 = (1/m) * np.sum(dZ2, axis=0, keepdims=True)  # dL/db2

    # Step 3: Input to Hidden layer gradients
    # Push the output error back through W2 to get dL/dA1
    dA1 = np.dot(dZ2, W2.T)

    # ReLU passes the gradient only where its input was positive
    dZ1 = dA1 * relu_derivative(Z1)

    dW1 = (1/m) * np.dot(X.T, dZ1)                    # dL/dW1
    db1 = (1/m) * np.sum(dZ1, axis=0, keepdims=True)  # dL/db1

    # Store gradients in dictionary
    gradients = {
        'dW1': dW1, 'db1': db1,
        'dW2': dW2, 'db2': db2
    }

    return gradients


def update_parameters(params, gradients, learning_rate):
    """
    Apply one gradient-descent step to every parameter.

    The arrays stored in params are modified in place and the same
    dictionary object is returned.

    Args:
        params: Current network parameters ('W1', 'b1', 'W2', 'b2')
        gradients: Computed gradients ('dW1', 'db1', 'dW2', 'db2')
        learning_rate: Learning rate for gradient descent

    Returns:
        Updated parameters
    """
    # (parameter key, gradient key) pairs, in update order
    update_plan = (
        ('W1', 'dW1'),
        ('b1', 'db1'),
        ('W2', 'dW2'),
        ('b2', 'db2'),
    )
    for param_key, grad_key in update_plan:
        params[param_key] -= learning_rate * gradients[grad_key]

    return params

def print_gradients(gradients):
    """
    Display every gradient array under its own heading.

    Args:
        gradients: Dictionary containing computed gradients
            ('dW1', 'db1', 'dW2', 'db2')
    """
    print("\nGradient Information:")
    print("-" * 50)

    # (heading, dictionary key) pairs in display order
    sections = (
        ("\nGradients for W1 (Input → Hidden):", 'dW1'),
        ("\nGradients for b1 (Hidden bias):", 'db1'),
        ("\nGradients for W2 (Hidden → Output):", 'dW2'),
        ("\nGradients for b2 (Output bias):", 'db2'),
    )
    for heading, key in sections:
        print(heading)
        print(gradients[key])

In [5]:
# Example usage:
# One manual training step: forward pass, loss, backward pass, parameter update.
# No seed is set in this cell, so the values drawn below depend on the state
# NumPy's global RNG was left in by the cells that ran before it.

# Generate sample data
X = generate_sample_data()
# Only the one-hot targets are used here; the integer labels are discarded
y, _ = generate_target_data()
    
# Initialize parameters
params = initialize_weights()

# Forward propagation
cache = forward_propagation(X, params)

# Compute initial loss
# The gradient returned alongside the loss is ignored here;
# backward_propagation obtains it again itself
loss, _ = compute_loss(cache['A2'], y)
print(f"Initial loss: {loss:.4f}")

# Perform backward propagation
gradients = backward_propagation(X, y, params, cache)

# Print gradients
print_gradients(gradients)

# Update parameters
learning_rate = 0.01
# update_parameters changes the arrays inside `params` in place and returns
# the same dict, so `cache` and `gradients` above describe the pre-update
# weights from this point on
params = update_parameters(params, gradients, learning_rate)

Initial loss: 0.6931

Gradient Information:
--------------------------------------------------

Gradients for W1 (Input → Hidden):
[[-0.00110582  0.          0.          0.        ]
 [-0.00110572  0.          0.          0.        ]]

Gradients for b1 (Hidden bias):
[[2.51779054e-08 0.00000000e+00 0.00000000e+00 0.00000000e+00]]

Gradients for W2 (Hidden → Output):
[[-0.0001896  0.0001896]
 [ 0.         0.       ]
 [ 0.         0.       ]
 [ 0.         0.       ]]

Gradients for b2 (Output bias):
[[ 0.16666477 -0.16666477]]
