# Two-Layered Neural Network


In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Initialize parameters function for the 2-layer neural network
def initialize_parameters(n_x, n_h, n_y):
    """
    Builds the starting weights and biases of a 2-layer neural network.

    The global NumPy random generator is re-seeded with 1 on every call, so
    repeated calls with the same sizes return identical weights.

    Arguments:
    n_x -- size of the input layer (number of features)     ; e.g. a 24 x 24 x 3 image gives n_x = 24*24*3 = 1728
    n_h -- size of the hidden layer (number of hidden units)
    n_y -- size of the output layer (number of output units)

    Returns:
    parameters -- dictionary with keys "W1", "b1", "W2", "b2" where
                  W1 has shape (n_h, n_x), b1 has shape (n_h, 1),
                  W2 has shape (n_y, n_h), b2 has shape (n_y, 1)
    """
    np.random.seed(1)
    scale = 0.01  # Keeps the initial weights small

    # Draw order matters for reproducibility: W1 is sampled before W2.
    first_layer_weights = scale * np.random.randn(n_h, n_x)
    second_layer_weights = scale * np.random.randn(n_y, n_h)

    return {
        "W1": first_layer_weights,
        "b1": np.zeros((n_h, 1)),  # Biases start at zero
        "W2": second_layer_weights,
        "b2": np.zeros((n_y, 1)),
    }


# Forward step for a single layer: linear transform followed by the activation
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implements the forward propagation for a single layer, including the activation function.

    Arguments:
    A_prev -- activations from the previous layer (or input data)
    W -- weight matrix of the current layer
    b -- bias vector of the current layer
    activation -- activation function to be used ("sigmoid" or "relu")

    Returns:
    A -- activation of the current layer
    cache -- tuple (A_prev, W, b, Z) containing values needed for backward propagation

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Reject unsupported names up front; otherwise A would never be assigned
    # and the function would fail later with an opaque UnboundLocalError.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            f"Unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    Z = np.dot(W, A_prev) + b  # Linear forward step

    if activation == "sigmoid":
        A = 1 / (1 + np.exp(-Z))  # Sigmoid activation
    else:
        A = np.maximum(0, Z)  # ReLU activation

    cache = (A_prev, W, b, Z)  # Cache values for backpropagation
    return A, cache


# Compute the cost function (binary cross-entropy)
def compute_cost(A2, Y):
    """
    Computes the cost (loss) for the neural network using binary cross-entropy.

    Arguments:
    A2 -- predicted output from the network (final layer activation)
    Y -- true labels, indexed as Y.shape[1] examples

    Returns:
    cost -- the cross-entropy cost averaged over the examples (0-d array)
    """
    m = Y.shape[1]  # Number of examples

    # The two log-likelihood terms must be ADDED element-wise before summing.
    # Passing the second term as a separate argument to np.sum would make
    # NumPy interpret it as the `axis` parameter and raise a TypeError.
    log_likelihood = Y * np.log(A2) + (1 - Y) * np.log(1 - A2)
    cost = -np.sum(log_likelihood) / m

    return np.squeeze(cost)  # Ensure cost is a scalar value


# Backward propagation function for a single layer
def linear_activation_backward(dA, cache, activation):
    """
    Implements backward propagation for a single layer, including the activation function.

    Arguments:
    dA -- gradient of the cost with respect to the activation of the current layer
    cache -- tuple containing values from forward propagation (A_prev, W, b, Z)
    activation -- the activation function used ("sigmoid" or "relu")

    Returns:
    dA_prev -- gradient of the cost with respect to the activation of the previous layer
    dW -- gradient of the cost with respect to the weights of the current layer
    db -- gradient of the cost with respect to the bias of the current layer

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    A_prev, W, _, Z = cache  # The bias is cached but not needed for the gradients
    m = A_prev.shape[1]  # Number of examples

    # Compute dZ based on the activation function used
    if activation == "relu":
        dZ = np.array(dA, copy=True)  # Copy so the caller's dA is not modified
        dZ[Z <= 0] = 0  # Derivative of ReLU is 0 wherever Z <= 0
    elif activation == "sigmoid":
        s = 1 / (1 + np.exp(-Z))
        dZ = dA * s * (1 - s)  # Derivative of sigmoid
    else:
        # Without this branch dZ would be unbound and the failure would be an
        # opaque UnboundLocalError further down.
        raise ValueError(
            f"Unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    # Calculate gradients
    dW = np.dot(dZ, A_prev.T) / m  # Gradient for weights
    db = np.sum(dZ, axis=1, keepdims=True) / m  # Gradient for biases
    dA_prev = np.dot(W.T, dZ)  # Gradient for the previous layer's activations

    return dA_prev, dW, db


# Function for updating parameters (gradient descent)
def update_parameters(parameters, grads, learning_rate):
    """
    Applies one gradient-descent step to the parameters of the 2-layer network.

    Each entry is updated with an augmented assignment, and the same
    dictionary object that was passed in is returned.

    Arguments:
    parameters -- dictionary containing current weights and biases (W1, b1, W2, b2)
    grads -- dictionary containing gradients (dW1, db1, dW2, db2)
    learning_rate -- learning rate for gradient descent

    Returns:
    parameters -- the same dictionary, after the gradient-descent step
    """
    # Each parameter "X" is paired with its gradient stored under "dX".
    for name in ("W1", "b1", "W2", "b2"):
        parameters[name] -= learning_rate * grads["d" + name]

    return parameters


# Two-layer neural network model
def two_layer_model(
    X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False
):
    """
    Implements a two-layer neural network: LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input data, shape (n_x, number of examples)
    Y -- true labels, shape (1, number of examples)
    layers_dims -- dimensions of the layers (n_x, n_h, n_y)
    learning_rate -- learning rate for gradient descent
    num_iterations -- number of iterations for training
    print_cost -- if True, prints the cost every 100 iterations

    Returns:
    parameters -- dictionary containing the updated parameters
    costs -- list of the costs recorded every 100 iterations (iteration 0, 100, 200, ...)
    """
    np.random.seed(1)
    costs = []  # Cost recorded every 100 iterations
    n_x, n_h, n_y = layers_dims  # Unpack layer dimensions

    # Initialize parameters
    parameters = initialize_parameters(n_x, n_h, n_y)

    # Gradient descent loop
    for i in range(num_iterations):
        # Forward propagation: LINEAR -> RELU -> LINEAR -> SIGMOID.
        # Weights are read from `parameters` on every iteration, so the loop
        # always sees the values returned by the latest update.
        A1, cache1 = linear_activation_forward(
            X, parameters["W1"], parameters["b1"], "relu"
        )
        A2, cache2 = linear_activation_forward(
            A1, parameters["W2"], parameters["b2"], "sigmoid"
        )

        # Compute the cost (loss)
        cost = compute_cost(A2, Y)

        # Backward propagation, starting from the gradient of the cost wrt A2
        dA2 = -(np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
        dA1, dW2, db2 = linear_activation_backward(dA2, cache2, "sigmoid")
        # The gradient with respect to the input X is not needed.
        _, dW1, db1 = linear_activation_backward(dA1, cache1, "relu")

        grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

        # Update parameters using gradient descent
        parameters = update_parameters(parameters, grads, learning_rate)

        # Record (and optionally print) the cost every 100 iterations
        if i % 100 == 0:
            if print_cost:
                print(f"Cost after iteration {i}: {cost}")
            costs.append(cost)

    return parameters, costs

# L-Layered Neural Network


In [None]:
import numpy as np


# Initialize parameters for L-layer deep neural network
def initialize_parameters_deep(layer_dims):
    """
    Builds the starting weights and biases for an L-layer neural network.

    The global NumPy random generator is re-seeded with 1 on every call, so
    the same layer_dims always yields the same weights.

    Arguments:
    layer_dims -- list containing the dimensions of each layer in the network
                  (the first entry is the input layer)

    Returns:
    parameters -- dictionary with "W<l>" of shape (layer_dims[l], layer_dims[l-1])
                  and "b<l>" of shape (layer_dims[l], 1) for l = 1 .. len(layer_dims) - 1
    """
    np.random.seed(1)
    parameters = {}

    # Pair every layer size with the size of the layer that feeds it;
    # layer numbering starts at 1 because index 0 is the input layer.
    size_pairs = zip(layer_dims, layer_dims[1:])
    for layer, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        parameters[f"W{layer}"] = np.random.randn(n_curr, n_prev) * 0.01
        parameters[f"b{layer}"] = np.zeros((n_curr, 1))

    return parameters


# Forward step for a single layer: linear transform followed by the activation
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implements the forward propagation for a single layer, including the activation function.

    Arguments:
    A_prev -- activations from the previous layer (or input data)
    W -- weight matrix of the current layer
    b -- bias vector of the current layer
    activation -- activation function to be used ("sigmoid" or "relu")

    Returns:
    A -- activation of the current layer
    cache -- tuple (A_prev, W, b, Z) containing values needed for backward propagation

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    Z = np.dot(W, A_prev) + b  # Linear forward step

    # Apply activation function
    if activation == "sigmoid":
        A = 1 / (1 + np.exp(-Z))  # Sigmoid activation
    elif activation == "relu":
        A = np.maximum(0, Z)  # ReLU activation
    else:
        # Fail with a clear message instead of leaving A unassigned, which
        # would surface as an UnboundLocalError at the return statement.
        raise ValueError(
            f"Unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    cache = (A_prev, W, b, Z)  # Cache values for backpropagation
    return A, cache


# Forward propagation for L-layer neural network
def L_model_forward(X, parameters):
    """
    Runs forward propagation through the L-layer neural network:
    [LINEAR -> RELU] for layers 1 .. L-1, then LINEAR -> SIGMOID for layer L.

    Arguments:
    X -- input data of shape (n_x, number of examples)
    parameters -- dictionary containing the weights "W<l>" and biases "b<l>" for all layers

    Returns:
    AL -- output of the last (Lth) layer (activation of the final layer)
    caches -- list with the cache returned by linear_activation_forward() for
              every layer, in layer order (used for backpropagation)
    """
    # Every layer contributes one W and one b entry, hence the halving.
    depth = len(parameters) // 2
    caches = []
    current = X  # The input data acts as the activation of "layer 0"

    # Hidden layers 1 .. L-1 use ReLU
    for layer in range(1, depth):
        current, hidden_cache = linear_activation_forward(
            current,
            parameters[f"W{layer}"],
            parameters[f"b{layer}"],
            activation="relu",
        )
        caches.append(hidden_cache)

    # Output layer L uses sigmoid
    AL, output_cache = linear_activation_forward(
        current,
        parameters[f"W{depth}"],
        parameters[f"b{depth}"],
        activation="sigmoid",
    )
    caches.append(output_cache)

    return AL, caches


# Backward propagation function for a single layer
def linear_activation_backward(dA, cache, activation):
    """
    Implements backward propagation for a single layer, including the activation function.

    Arguments:
    dA -- gradient of the cost with respect to the activation of the current layer
    cache -- tuple containing values from forward propagation (A_prev, W, b, Z)
    activation -- the activation function used ("sigmoid" or "relu")

    Returns:
    dA_prev -- gradient of the cost with respect to the activation of the previous layer
    dW -- gradient of the cost with respect to the weights of the current layer
    db -- gradient of the cost with respect to the bias of the current layer

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Validate first so an unsupported name never reaches the gradient math
    # (previously dZ stayed unassigned and an UnboundLocalError was raised).
    if activation not in ("relu", "sigmoid"):
        raise ValueError(
            f"Unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    A_prev, W, _, Z = cache  # The bias is cached but not needed for the gradients
    m = A_prev.shape[1]  # Number of examples

    if activation == "relu":
        dZ = np.array(dA, copy=True)  # Copy so the caller's dA is not modified
        dZ[Z <= 0] = 0  # Derivative of ReLU is 0 wherever Z <= 0
    else:
        s = 1 / (1 + np.exp(-Z))
        dZ = dA * s * (1 - s)  # Derivative of sigmoid

    # Calculate gradients
    dW = np.dot(dZ, A_prev.T) / m  # Gradient for weights
    db = np.sum(dZ, axis=1, keepdims=True) / m  # Gradient for biases
    dA_prev = np.dot(W.T, dZ)  # Gradient for the previous layer's activations

    return dA_prev, dW, db


# Compute cost function (binary cross-entropy)
def compute_cost(AL, Y):
    """
    Evaluates the binary cross-entropy cost for the L-layer neural network.

    Arguments:
    AL -- probability vector corresponding to the predictions, shape (1, number of examples)
    Y -- true "label" vector (1 for cat, 0 for non-cat), shape (1, number of examples)

    Returns:
    cost -- binary cross-entropy cost averaged over the examples (0-d array)
    """
    example_count = Y.shape[1]

    # Per-example log-likelihood: log(AL) where Y == 1, log(1 - AL) where Y == 0
    positive_term = Y * np.log(AL)
    negative_term = (1 - Y) * np.log(1 - AL)
    total_log_likelihood = np.sum(positive_term + negative_term)

    # Negate and average, then drop any singleton dimensions
    return np.squeeze(-total_log_likelihood / example_count)


# Backward propagation for L-layer neural network
def L_model_backward(AL, Y, caches):
    """
    Implements backward propagation for the L-layer neural network:
    SIGMOID -> LINEAR for the last layer, then RELU -> LINEAR for the others.

    Arguments:
    AL -- probability vector (predicted output from forward propagation)
    Y -- true "label" vector (reshaped internally to AL's shape)
    caches -- list of caches containing every cache of forward propagation,
              in layer order (caches[l] belongs to layer l + 1)

    Returns:
    grads -- dictionary with gradients for every layer:
             grads["dA" + str(l)] for l = 0 .. L-1,
             grads["dW" + str(l)] and grads["db" + str(l)] for l = 1 .. L
    """
    grads = {}
    L = len(caches)  # Number of layers
    Y = Y.reshape(AL.shape)  # Ensure Y has the same shape as AL

    # Derivative of the cross-entropy cost with respect to AL
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Last layer (Lth layer: SIGMOID -> LINEAR)
    grads[f"dA{L - 1}"], grads[f"dW{L}"], grads[f"db{L}"] = (
        linear_activation_backward(dAL, caches[L - 1], "sigmoid")
    )

    # Loop backward from l = L-2 down to 0 (RELU -> LINEAR). caches[l] is the
    # cache of layer l + 1, whose incoming gradient is grads["dA" + str(l + 1)].
    for l in reversed(range(L - 1)):
        dA_prev, dW, db = linear_activation_backward(
            grads[f"dA{l + 1}"], caches[l], "relu"
        )
        grads[f"dA{l}"] = dA_prev  # Gradient for the previous layer's activation
        grads[f"dW{l + 1}"] = dW  # Gradient for the current layer's weights
        grads[f"db{l + 1}"] = db  # Gradient for the current layer's biases

    return grads


# Update parameters for L-layer neural network
def update_parameters(parameters, grads, learning_rate):
    """
    Applies one gradient-descent step to every layer of the L-layer network.

    Each entry is updated with an augmented assignment, and the same
    dictionary object that was passed in is returned.

    Arguments:
    parameters -- dictionary containing the current weights "W<l>" and biases "b<l>"
    grads -- dictionary containing gradients "dW<l>" and "db<l>" for the weights and biases
    learning_rate -- learning rate for gradient descent

    Returns:
    parameters -- the same dictionary, after the gradient-descent step
    """
    # Each layer owns one W and one b entry, hence the halving.
    layer_count = len(parameters) // 2

    for layer in range(1, layer_count + 1):
        # Weights first, then biases; gradient keys are the parameter key prefixed with "d".
        for prefix in ("W", "b"):
            key = f"{prefix}{layer}"
            parameters[key] -= learning_rate * grads["d" + key]

    return parameters


# L-layer neural network model
def L_layer_model(
    X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False
):
    """
    Trains an L-layer neural network: [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID.

    Arguments:
    X -- input data, shape (n_x, number of examples)
    Y -- true "label" vector, shape (1, number of examples)
    layers_dims -- list containing the dimensions of each layer
    learning_rate -- learning rate for gradient descent
    num_iterations -- number of iterations for training the model
    print_cost -- if True, print the cost every 100 iterations and on the final iteration

    Returns:
    parameters -- parameters learned by the model
    costs -- list of costs recorded every 100 iterations (for plotting the learning curve)
    """
    np.random.seed(1)

    parameters = initialize_parameters_deep(layers_dims)
    costs = []
    final_iteration = num_iterations - 1

    for i in range(num_iterations):
        # One full gradient-descent step: forward pass, cost, backward pass, update
        AL, caches = L_model_forward(X, parameters)
        cost = compute_cost(AL, Y)
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)

        at_checkpoint = i % 100 == 0

        # Printing also covers the last iteration; recording does not.
        if print_cost and (at_checkpoint or i == final_iteration):
            print(f"Cost after iteration {i}: {cost}")
        if at_checkpoint:
            costs.append(cost)

    return parameters, costs