##### 【Problem 1 】 Implementing the fully connected layer as a class

In [1]:
import numpy as np

class FC:
    """
    Dense (fully connected) layer mapping n_nodes1 inputs to n_nodes2 outputs.

    Parameters
    ----------
    n_nodes1 : int
      Width of the preceding layer.
    n_nodes2 : int
      Width of this layer's output.
    initializer : object exposing ``W(n_nodes1, n_nodes2)`` and ``B(n_nodes2)``
    optimizer : object exposing ``update(layer)``
    """
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        # Parameters are drawn by the initializer (weights first, then biases).
        self.W = initializer.W(n_nodes1, n_nodes2)
        self.B = initializer.B(n_nodes2)
        self.optimizer = optimizer
        # Filled in by forward(): the input needed to compute dW later.
        self.X = None
        # Filled in by backward(): the most recent parameter gradients.
        self.dW = None
        self.dB = None

    def forward(self, X):
        """
        Compute the affine map ``X . W + B`` and remember ``X``.

        Parameters
        ----------
        X : ndarray of shape (batch_size, n_nodes1)
            Activations coming from the previous layer.

        Returns
        ----------
        A : ndarray of shape (batch_size, n_nodes2)
            Pre-activation output of this layer.
        """
        self.X = X
        weighted = np.dot(X, self.W)
        return weighted + self.B

    def backward(self, dA):
        """
        Compute gradients, apply the optimizer step, and return the input gradient.

        Parameters
        ----------
        dA : ndarray of shape (batch_size, n_nodes2)
            Gradient of the loss with respect to this layer's output.

        Returns
        ----------
        dZ : ndarray of shape (batch_size, n_nodes1)
            Gradient of the loss with respect to this layer's input.
        """
        # The input gradient uses the weights as they are *before* the update.
        grad_input = np.dot(dA, self.W.T)
        # Parameter gradients are summed over the batch.
        self.dW = np.dot(self.X.T, dA)
        self.dB = np.sum(dA, axis=0)
        # The optimizer reads dW/dB from this layer and modifies W/B.
        self.optimizer.update(self)
        return grad_input

##### 【Problem 2 】 Implementing the initialization method as a class

In [2]:
import numpy as np

class SimpleInitializer:
    """
    Initializer drawing weights from a zero-mean Gaussian of fixed width.

    Parameters
    ----------
    sigma : float
      Standard deviation used for every weight matrix.
    """
    def __init__(self, sigma):
        self.sigma = sigma

    def W(self, n_nodes1, n_nodes2):
        """
        Draw a weight matrix from N(0, sigma^2).

        Parameters
        ----------
        n_nodes1 : int
          Number of input nodes (rows).
        n_nodes2 : int
          Number of output nodes (columns).

        Returns
        ----------
        W : ndarray of shape (n_nodes1, n_nodes2)
        """
        shape = (n_nodes1, n_nodes2)
        return np.random.normal(loc=0, scale=self.sigma, size=shape)

    def B(self, n_nodes2):
        """
        Create an all-zero bias vector.

        Parameters
        ----------
        n_nodes2 : int
          Number of output nodes.

        Returns
        ----------
        B : ndarray of shape (n_nodes2,)
        """
        return np.zeros(n_nodes2)

##### 【Problem 3 】 Implementing the optimization method as a class

In [3]:
class SGD:
    """
    Plain stochastic gradient descent.

    Parameters
    ----------
    lr : float
      Step size applied to every gradient.
    """
    def __init__(self, lr):
        self.lr = lr

    def update(self, layer):
        """
        Take one gradient step on the layer's parameters.

        Parameters
        ----------
        layer : layer instance
            Must expose `W`, `B` and their gradients `dW`, `dB`.
        """
        # Same rule for both parameters: param -= lr * grad (weights first).
        for param_name, grad_name in (("W", "dW"), ("B", "dB")):
            param = getattr(layer, param_name)
            param -= self.lr * getattr(layer, grad_name)
            setattr(layer, param_name, param)

##### 【Problem 4 】 Implementing the activation functions as classes

In [4]:
import numpy as np

class ReLU:
    """
    Rectified Linear Unit activation function.

    f(x) = x for x > 0, otherwise 0.
    """
    def forward(self, A):
        """
        Forward propagation for the ReLU function.

        Parameters
        ----------
        A : ndarray of shape (batch_size, n_nodes)
            Input to the activation function.

        Returns
        ----------
        Z : ndarray of shape (batch_size, n_nodes)
            Output of the activation function.
        """
        # The mask records where the input was positive; backward() reuses it.
        self.mask = (A > 0)
        # np.maximum clamps instead of multiplying by the mask: `A * mask`
        # turns -inf into NaN (-inf * 0) and negatives into -0.0.
        Z = np.maximum(0, A)
        return Z

    def backward(self, dZ):
        """
        Backward propagation for the ReLU function.

        Parameters
        ----------
        dZ : ndarray of shape (batch_size, n_nodes)
            Gradient from the subsequent layer.

        Returns
        ----------
        dA : ndarray of shape (batch_size, n_nodes)
            Gradient to the previous layer; zero wherever the forward input
            was not positive.
        """
        dA = dZ * self.mask
        return dA

class Softmax:
    """
    Softmax activation function, paired with cross-entropy for backpropagation.
    """
    def forward(self, A):
        """
        Forward propagation for the softmax function.

        Parameters
        ----------
        A : ndarray of shape (batch_size, n_nodes)
            Input to the activation function.

        Returns
        ----------
        Z : ndarray of shape (batch_size, n_nodes)
            Row-wise probabilities (each row sums to 1).
        """
        # Subtracting the row maximum keeps np.exp from overflowing and does
        # not change the resulting probabilities.
        shifted = A - np.max(A, axis=1, keepdims=True)
        exp_A = np.exp(shifted)
        self.Z = exp_A / np.sum(exp_A, axis=1, keepdims=True)
        return self.Z

    def backward(self, Z, y):
        """
        Gradient of the mean cross-entropy loss with respect to the softmax input.

        Parameters
        ----------
        Z : ndarray of shape (batch_size, n_nodes)
            Output of the softmax function (probabilities).
        y : ndarray of shape (batch_size,)
            True labels (integers in [0, n_nodes)).

        Returns
        ----------
        dA : ndarray of shape (batch_size, n_nodes)
            ``(Z - one_hot(y)) / batch_size``.
        """
        batch_size = y.shape[0]
        one_hot = np.eye(Z.shape[1])[y]
        # The parentheses matter: the whole difference is averaged over the
        # batch, not just the one-hot term.
        dA = (Z - one_hot) / batch_size
        return dA

##### 【Problem 5 】 ReLU class creation

In [5]:
import numpy as np

class ReLU:
    """
    ReLU activation: passes positive values through and clamps the rest to 0.

    f(x) = {x if x > 0, 0 if x <= 0}
    """
    def forward(self, A):
        """
        Apply ReLU element-wise and remember which inputs were positive.

        Parameters
        ----------
        A : ndarray of shape (batch_size, n_nodes)
            Pre-activation values.

        Returns
        ----------
        Z : ndarray of shape (batch_size, n_nodes)
            Activated values.
        """
        positive = A > 0
        # Kept for backward(): the gradient only flows where the input was > 0.
        self.mask = positive
        return np.maximum(0, A)

    def backward(self, dZ):
        """
        Route the incoming gradient through the positions that were active.

        Parameters
        ----------
        dZ : ndarray of shape (batch_size, n_nodes)
            Gradient arriving from the next layer.

        Returns
        ----------
        dA : ndarray of shape (batch_size, n_nodes)
            Gradient with respect to the ReLU input.
        """
        return dZ * self.mask

##### 【Problem 6 】 Initial weight

In [6]:
import numpy as np

class XavierInitializer:
    """
    Xavier (Glorot) weight initializer.

    Weights are drawn from a zero-mean Gaussian whose standard deviation is
    1 / sqrt(n_nodes1), n_nodes1 being the size of the previous layer.
    """
    def W(self, n_nodes1, n_nodes2):
        """
        Draw a weight matrix with the Xavier standard deviation.

        Parameters
        ----------
        n_nodes1 : int
          Number of input nodes (rows).
        n_nodes2 : int
          Number of output nodes (columns).

        Returns
        ----------
        W : ndarray of shape (n_nodes1, n_nodes2)
        """
        return np.random.normal(
            loc=0,
            scale=1.0 / np.sqrt(n_nodes1),
            size=(n_nodes1, n_nodes2),
        )

    def B(self, n_nodes2):
        """
        Create an all-zero bias vector.

        Parameters
        ----------
        n_nodes2 : int
          Number of output nodes.

        Returns
        ----------
        B : ndarray of shape (n_nodes2,)
        """
        return np.zeros(n_nodes2)

class HeInitializer:
    """
    He weight initializer.

    Weights are drawn from a zero-mean Gaussian whose standard deviation is
    sqrt(2 / n_nodes1), n_nodes1 being the size of the previous layer.
    """
    def W(self, n_nodes1, n_nodes2):
        """
        Draw a weight matrix with the He standard deviation.

        Parameters
        ----------
        n_nodes1 : int
          Number of input nodes (rows).
        n_nodes2 : int
          Number of output nodes (columns).

        Returns
        ----------
        W : ndarray of shape (n_nodes1, n_nodes2)
        """
        return np.random.normal(
            loc=0,
            scale=np.sqrt(2.0 / n_nodes1),
            size=(n_nodes1, n_nodes2),
        )

    def B(self, n_nodes2):
        """
        Create an all-zero bias vector.

        Parameters
        ----------
        n_nodes2 : int
          Number of output nodes.

        Returns
        ----------
        B : ndarray of shape (n_nodes2,)
        """
        return np.zeros(n_nodes2)

##### 【Problem 7 】 Optimization method

In [7]:
import numpy as np

class SGD:
    """
    Vanilla stochastic gradient descent with a constant learning rate.

    Parameters
    ----------
    lr : float
      Learning rate.
    """
    def __init__(self, lr):
        self.lr = lr

    def update(self, layer):
        """
        Move the layer's parameters one step against their gradients.

        Parameters
        ----------
        layer : layer instance
            Must expose `W`, `B` and the matching gradients `dW`, `dB`.
        """
        step = self.lr
        layer.W -= step * layer.dW
        layer.B -= step * layer.dB

class AdaGrad:
    """
    AdaGrad optimizer.

    Each parameter gets its own effective learning rate: the base rate divided
    by the square root of the running sum of that parameter's squared gradients.
    The accumulators live on the optimizer, so one instance serves one layer.
    """
    def __init__(self, lr):
        self.lr = lr
        self.epsilon = 1e-7  # Keeps the denominator away from zero
        self.hw = None  # Running sum of squared weight gradients
        self.hb = None  # Running sum of squared bias gradients

    def update(self, layer):
        """
        Apply one AdaGrad step to the given layer.

        Parameters
        ----------
        layer : layer instance
            Must expose `W`, `B` and the matching gradients `dW`, `dB`.
        """
        # Accumulators are allocated on the first call, shaped like the parameters.
        if self.hw is None:
            self.hw = np.zeros_like(layer.W)
            self.hb = np.zeros_like(layer.B)

        self.hw += layer.dW * layer.dW
        self.hb += layer.dB * layer.dB

        w_scale = 1 / (np.sqrt(self.hw) + self.epsilon)
        b_scale = 1 / (np.sqrt(self.hb) + self.epsilon)
        layer.W -= self.lr * w_scale * layer.dW
        layer.B -= self.lr * b_scale * layer.dB

##### 【Problem 8 】 Class completion

In [8]:
import numpy as np
import matplotlib.pyplot as plt

class ScratchDeepNeuralNetrowkClassifier:
    """
    Feed-forward neural network classifier assembled from FC layers.

    Each entry of ``layer_configs`` is a dict with the keys ``'type'`` (only
    ``'FC'`` is supported), ``'n_nodes'``, ``'activation'`` (``'Tanh'``,
    ``'ReLU'`` or ``'Softmax'``), ``'initializer'`` and ``'optimizer'``.
    If the last configured layer is not a Softmax layer with ``n_output``
    nodes, such an output layer is appended automatically.

    Parameters
    ----------
    n_layers : int
        Nominal layer count; replaced by the real count when an output layer
        has to be appended.
    layer_configs : list of dict
        Layer descriptions, in forward order.
    n_features : int
        Number of input features.
    n_output : int
        Number of classes.
    random_state : int or None
        When given, seeds NumPy's global RNG before the weights are drawn.

    Attributes
    ----------
    layers : list
        FC layers in forward order.
    activations : list
        Activation objects, one per layer.
    loss_history : list of float
        Cross-entropy on the full training set after each epoch of ``fit``.
    """
    def __init__(self, n_layers, layer_configs, n_features, n_output, random_state=None):
        self.n_layers = n_layers
        self.layer_configs = layer_configs
        self.n_features = n_features
        self.n_output = n_output
        self.layers = []
        self.activations = []
        self.loss_history = []
        self.random_state = random_state
        if self.random_state is not None:
            np.random.seed(self.random_state)
        self._build_network()

    def _build_network(self):
        """Instantiate the layers and activations described by ``layer_configs``."""
        input_dim = self.n_features
        for config in self.layer_configs:
            layer_type = config['type']
            n_nodes = config['n_nodes']
            activation_type = config['activation']
            initializer = config['initializer']
            optimizer = config['optimizer']

            if layer_type != 'FC':
                raise ValueError(f"Unknown layer type: {layer_type}")

            self.layers.append(FC(input_dim, n_nodes, initializer, optimizer))
            if activation_type == 'Tanh':
                self.activations.append(Tanh())
            elif activation_type == 'Softmax':
                self.activations.append(Softmax())
            elif activation_type == 'ReLU':
                self.activations.append(ReLU())
            else:
                raise ValueError(f"Unknown activation type: {activation_type}")
            input_dim = n_nodes

        # Guarantee that the network ends in an n_output-wide Softmax layer.
        last = self.layer_configs[-1] if self.layer_configs else None
        ends_in_output_layer = (
            last is not None
            and last['n_nodes'] == self.n_output
            and last['activation'] == 'Softmax'
        )
        if not ends_in_output_layer:
            # Reuse the learning rate of the last configured optimizer if there is one.
            lr = last['optimizer'].lr if last is not None else 0.01
            self.layers.append(FC(input_dim, self.n_output, XavierInitializer(), AdaGrad(lr)))
            self.activations.append(Softmax())
            self.n_layers = len(self.layers)  # Update n_layers

    def fit(self, X, y, n_iter=100, batch_size=32, verbose=True):
        """
        Train the network with mini-batch gradient descent.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training inputs.
        y : ndarray of shape (n_samples,)
            Integer class labels.
        n_iter : int
            Number of epochs.
        batch_size : int
            Mini-batch size; the final batch of an epoch may be smaller.
        verbose : bool
            Print the training loss after every epoch.
        """
        n_samples = X.shape[0]
        self.loss_history = []

        for epoch in range(n_iter):
            # A fresh shuffle each epoch, so the batches differ between epochs.
            order = np.random.permutation(n_samples)
            X_shuffled = X[order]
            y_shuffled = y[order]

            for start in range(0, n_samples, batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                probs = self._forward(X_batch)
                self._backward(self._loss_function_backward(probs, y_batch))

            # The epoch loss is measured on the whole training set.
            loss = self._loss_function_forward(self._forward(X), y)
            self.loss_history.append(loss)

            if verbose:
                print(f"Epoch {epoch+1}/{n_iter}, Loss: {loss:.4f}")

    def predict(self, X):
        """Return the most probable class index for every row of ``X``."""
        Z = self._forward(X)
        return np.argmax(Z, axis=1)

    def predict_proba(self, X):
        """Return the class probabilities, shape (n_samples, n_output)."""
        return self._forward(X)

    def _forward(self, X):
        """Run ``X`` through every layer/activation pair and return the output."""
        Z = X
        for layer, activation in zip(self.layers, self.activations):
            A = layer.forward(Z)
            Z = activation.forward(A)
        return Z

    def _backward(self, dA):
        """
        Backpropagate through the network, updating every layer on the way.

        ``dA`` is the gradient with respect to the pre-activation of the last
        layer, i.e. the combined softmax/cross-entropy gradient, so the final
        activation's own ``backward`` is not called.  Hidden activations must
        implement ``backward(dZ)``.
        """
        for idx in range(len(self.layers) - 1, -1, -1):
            # Gradient w.r.t. this layer's input == output of activation idx-1.
            dZ = self.layers[idx].backward(dA)
            if idx > 0:
                dA = self.activations[idx - 1].backward(dZ)

    def _loss_function_forward(self, Z, y):
        """Mean cross-entropy of probabilities ``Z`` against integer labels ``y``."""
        batch_size = y.shape[0]
        # 1e-7 guards against log(0).
        loss = -np.sum(np.log(Z[np.arange(batch_size), y] + 1e-7)) / batch_size
        return loss

    def _loss_function_backward(self, Z, y):
        """Gradient of the mean cross-entropy w.r.t. the softmax input."""
        batch_size = y.shape[0]
        one_hot = np.eye(Z.shape[1])[y]
        # The whole difference is averaged over the batch, not only the one-hot term.
        dA = (Z - one_hot) / batch_size
        return dA

class FC:
    """
    Fully connected layer mapping n_in inputs to n_out outputs.

    The initializer supplies ``W`` and ``B``; the optimizer's ``update`` is
    invoked at the end of every backward pass.
    """
    def __init__(self, n_in, n_out, initializer, optimizer):
        self.W = initializer.W(n_in, n_out)
        self.B = initializer.B(n_out)
        self.optimizer = optimizer
        # Input cache and gradient slots, populated by forward()/backward().
        self.X = None
        self.dW = None
        self.dB = None

    def forward(self, X):
        """Return ``X . W + B``; both the input and the result are cached."""
        self.X = X
        pre_activation = np.dot(X, self.W) + self.B
        self.A = pre_activation
        return pre_activation

    def backward(self, dA):
        """Store dW/dB, run the optimizer step, and return the input gradient."""
        # Computed first so it uses the weights from before the update.
        grad_input = np.dot(dA, self.W.T)
        self.dB = np.sum(dA, axis=0)
        self.dW = np.dot(self.X.T, dA)
        self.optimizer.update(self)
        return grad_input

class SimpleInitializer:
    """Gaussian initializer with a fixed standard deviation ``sigma``."""
    def __init__(self, sigma):
        self.sigma = sigma

    def W(self, n_in, n_out):
        """Draw an (n_in, n_out) weight matrix from N(0, sigma^2)."""
        shape = (n_in, n_out)
        return np.random.normal(loc=0, scale=self.sigma, size=shape)

    def B(self, n_out):
        """Return a zero bias vector of length n_out."""
        bias = np.zeros(n_out)
        return bias

class XavierInitializer:
    """Xavier initializer: Gaussian weights with sigma = 1 / sqrt(n_in)."""
    def W(self, n_in, n_out):
        """Draw an (n_in, n_out) weight matrix."""
        return np.random.normal(loc=0, scale=1.0 / np.sqrt(n_in), size=(n_in, n_out))

    def B(self, n_out):
        """Return a zero bias vector of length n_out."""
        return np.zeros(n_out)

class HeInitializer:
    """He initializer: Gaussian weights with sigma = sqrt(2 / n_in)."""
    def W(self, n_in, n_out):
        """Draw an (n_in, n_out) weight matrix."""
        return np.random.normal(loc=0, scale=np.sqrt(2.0 / n_in), size=(n_in, n_out))

    def B(self, n_out):
        """Return a zero bias vector of length n_out."""
        return np.zeros(n_out)

class Tanh:
    """Hyperbolic-tangent activation."""
    def forward(self, A):
        """Apply tanh element-wise; the output is cached for backward()."""
        activated = np.tanh(A)
        self.Z = activated
        return activated

    def backward(self, dZ):
        """Scale the incoming gradient by tanh'(A) = 1 - tanh(A)^2."""
        slope = 1 - self.Z**2
        return dZ * slope

class ReLU:
    """ReLU activation: max(0, x) element-wise."""
    def forward(self, A):
        """Clamp negatives to zero and remember where the input was positive."""
        positive = A > 0
        self.mask = positive
        return np.maximum(0, A)

    def backward(self, dZ):
        """Pass the gradient only through positions that were positive."""
        return dZ * self.mask

class Softmax:
    """Softmax activation, paired with cross-entropy for backpropagation."""
    def forward(self, A):
        """Return row-wise softmax probabilities of ``A`` (batch_size, n_nodes)."""
        # Subtracting the row maximum avoids overflow in np.exp without
        # changing the probabilities.
        exp_A = np.exp(A - np.max(A, axis=1, keepdims=True))
        self.Z = exp_A / np.sum(exp_A, axis=1, keepdims=True)
        return self.Z

    def backward(self, Z, y):
        """
        Gradient of the mean cross-entropy loss w.r.t. the softmax input.

        ``Z`` holds the probabilities (batch_size, n_nodes) and ``y`` the
        integer labels (batch_size,).  Returns ``(Z - one_hot(y)) / batch_size``.
        """
        batch_size = y.shape[0]
        one_hot = np.eye(Z.shape[1])[y]
        # The parentheses matter: the whole difference is averaged over the
        # batch, not just the one-hot term.
        dA = (Z - one_hot) / batch_size
        return dA

class SGD:
    """Stochastic gradient descent with a constant learning rate ``lr``."""
    def __init__(self, lr):
        self.lr = lr

    def update(self, layer):
        """Subtract ``lr`` times the stored gradients from the layer's W and B."""
        rate = self.lr
        layer.W -= rate * layer.dW
        layer.B -= rate * layer.dB

class AdaGrad:
    """
    AdaGrad optimizer: per-parameter step sizes scaled by the accumulated
    squared gradients.  The accumulators are stored on the optimizer itself.
    """
    def __init__(self, lr):
        self.lr = lr
        self.epsilon = 1e-7  # keeps the denominator non-zero
        self.hw = None  # running sum of dW^2
        self.hb = None  # running sum of dB^2

    def update(self, layer):
        """Accumulate squared gradients and apply the scaled step to W and B."""
        # First call: allocate accumulators shaped like the parameters.
        if self.hw is None:
            self.hw = np.zeros_like(layer.W)
            self.hb = np.zeros_like(layer.B)

        self.hw += layer.dW * layer.dW
        self.hb += layer.dB * layer.dB

        w_scale = 1 / (np.sqrt(self.hw) + self.epsilon)
        b_scale = 1 / (np.sqrt(self.hb) + self.epsilon)
        layer.W -= self.lr * w_scale * layer.dW
        layer.B -= self.lr * b_scale * layer.dB

if __name__ == '__main__':
    # Example usage: build a 784-128-64-10 network and run it on random data.
    n_features = 784
    n_output = 10
    n_layers = 3

    # One dict per layer; every layer gets its own optimizer instance because
    # AdaGrad keeps its gradient accumulators on the optimizer object.
    layer_configs = [
        {'type': 'FC', 'n_nodes': 128, 'activation': 'ReLU', 'initializer': HeInitializer(), 'optimizer': AdaGrad(lr=0.01)},
        {'type': 'FC', 'n_nodes': 64, 'activation': 'ReLU', 'initializer': HeInitializer(), 'optimizer': AdaGrad(lr=0.01)},
        {'type': 'FC', 'n_nodes': n_output, 'activation': 'Softmax', 'initializer': XavierInitializer(), 'optimizer': AdaGrad(lr=0.01)}
    ]

    # random_state seeds NumPy's global RNG, so the random data drawn below
    # is reproducible as well.
    model = ScratchDeepNeuralNetrowkClassifier(n_layers, layer_configs, n_features, n_output, random_state=42)

    # Dummy training data
    X_train = np.random.rand(1000, n_features)
    y_train = np.random.randint(0, n_output, 1000)

    # Dummy test data
    X_test = np.random.rand(200, n_features)
    y_test = np.random.randint(0, n_output, 200)

    # Training is left disabled here, so the predictions below come from the
    # freshly initialised (untrained) weights.
    #model.fit(X_train, y_train, n_iter=10, batch_size=32, verbose=True)

    predictions = model.predict(X_test)
    print("Predictions (first 10):", predictions[:10])

    probabilities = model.predict_proba(X_test)
    print("Probabilities (first 2 samples):\n", probabilities[:2])

  

Predictions (first 10): [4 4 4 5 4 4 5 4 4 2]
Probabilities (first 2 samples):
 [[0.07877131 0.03817066 0.13181438 0.14385221 0.20104698 0.13713206
  0.07302206 0.06763242 0.0784162  0.05014172]
 [0.07066045 0.06668568 0.07441456 0.09451205 0.21325199 0.16557688
  0.09807867 0.10509788 0.06783265 0.04388918]]


【Problem 9 】 Learning and estimation

In [13]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

class ScratchDeepNeuralNetrowkClassifier:
    """
    Feed-forward neural network classifier assembled from FC layers.

    Each entry of ``layer_configs`` is a dict with the keys ``'type'`` (only
    ``'FC'`` is supported), ``'n_nodes'``, ``'activation'`` (``'Tanh'``,
    ``'ReLU'`` or ``'Softmax'``), ``'initializer'`` and ``'optimizer'``.
    If the last configured layer is not a Softmax layer with ``n_output``
    nodes, such an output layer is appended automatically.

    Parameters
    ----------
    n_layers : int
        Nominal layer count; replaced by the real count when an output layer
        has to be appended.
    layer_configs : list of dict
        Layer descriptions, in forward order.
    n_features : int
        Number of input features.
    n_output : int
        Number of classes.
    random_state : int or None
        When given, seeds NumPy's global RNG before the weights are drawn.

    Attributes
    ----------
    layers : list
        FC layers in forward order.
    activations : list
        Activation objects, one per layer.
    loss_history : list of float
        Cross-entropy on the full training set after each epoch of ``fit``.
    """
    def __init__(self, n_layers, layer_configs, n_features, n_output, random_state=None):
        self.n_layers = n_layers
        self.layer_configs = layer_configs
        self.n_features = n_features
        self.n_output = n_output
        self.layers = []
        self.activations = []
        self.loss_history = []
        self.random_state = random_state
        if self.random_state is not None:
            np.random.seed(self.random_state)
        self._build_network()

    def _build_network(self):
        """Instantiate the layers and activations described by ``layer_configs``."""
        input_dim = self.n_features
        for config in self.layer_configs:
            layer_type = config['type']
            n_nodes = config['n_nodes']
            activation_type = config['activation']
            initializer = config['initializer']
            optimizer = config['optimizer']

            if layer_type != 'FC':
                raise ValueError(f"Unknown layer type: {layer_type}")

            self.layers.append(FC(input_dim, n_nodes, initializer, optimizer))
            if activation_type == 'Tanh':
                self.activations.append(Tanh())
            elif activation_type == 'Softmax':
                self.activations.append(Softmax())
            elif activation_type == 'ReLU':
                self.activations.append(ReLU())
            else:
                raise ValueError(f"Unknown activation type: {activation_type}")
            input_dim = n_nodes

        # Guarantee that the network ends in an n_output-wide Softmax layer.
        last = self.layer_configs[-1] if self.layer_configs else None
        ends_in_output_layer = (
            last is not None
            and last['n_nodes'] == self.n_output
            and last['activation'] == 'Softmax'
        )
        if not ends_in_output_layer:
            # Reuse the learning rate of the last configured optimizer if there is one.
            lr = last['optimizer'].lr if last is not None else 0.01
            self.layers.append(FC(input_dim, self.n_output, XavierInitializer(), AdaGrad(lr)))
            self.activations.append(Softmax())
            self.n_layers = len(self.layers)  # Update n_layers

    def fit(self, X, y, n_iter=10, batch_size=64, verbose=True):
        """
        Train the network with mini-batch gradient descent.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training inputs.
        y : ndarray of shape (n_samples,)
            Integer class labels.
        n_iter : int
            Number of epochs.
        batch_size : int
            Mini-batch size; the final batch of an epoch may be smaller.
        verbose : bool
            Print the training loss after every epoch.
        """
        n_samples = X.shape[0]
        self.loss_history = []

        for epoch in range(n_iter):
            # A fresh shuffle each epoch, so the batches differ between epochs.
            order = np.random.permutation(n_samples)
            X_shuffled = X[order]
            y_shuffled = y[order]

            for start in range(0, n_samples, batch_size):
                X_batch = X_shuffled[start:start + batch_size]
                y_batch = y_shuffled[start:start + batch_size]

                probs = self._forward(X_batch)
                self._backward(self._loss_function_backward(probs, y_batch))

            # The epoch loss is measured on the whole training set.
            loss = self._loss_function_forward(self._forward(X), y)
            self.loss_history.append(loss)

            if verbose:
                print(f"Epoch {epoch+1}/{n_iter}, Loss: {loss:.4f}")

    def predict(self, X):
        """Return the most probable class index for every row of ``X``."""
        Z = self._forward(X)
        return np.argmax(Z, axis=1)

    def predict_proba(self, X):
        """Return the class probabilities, shape (n_samples, n_output)."""
        return self._forward(X)

    def _forward(self, X):
        """Run ``X`` through every layer/activation pair and return the output."""
        Z = X
        for layer, activation in zip(self.layers, self.activations):
            A = layer.forward(Z)
            Z = activation.forward(A)
        return Z

    def _backward(self, dA):
        """
        Backpropagate through the network, updating every layer on the way.

        ``dA`` is the gradient with respect to the pre-activation of the last
        layer, i.e. the combined softmax/cross-entropy gradient, so the final
        activation's own ``backward`` is not called.  Hidden activations must
        implement ``backward(dZ)``.
        """
        for idx in range(len(self.layers) - 1, -1, -1):
            # Gradient w.r.t. this layer's input == output of activation idx-1.
            dZ = self.layers[idx].backward(dA)
            if idx > 0:
                dA = self.activations[idx - 1].backward(dZ)

    def _loss_function_forward(self, Z, y):
        """Mean cross-entropy of probabilities ``Z`` against integer labels ``y``."""
        batch_size = y.shape[0]
        # 1e-7 guards against log(0).
        loss = -np.sum(np.log(Z[np.arange(batch_size), y] + 1e-7)) / batch_size
        return loss

    def _loss_function_backward(self, Z, y):
        """Gradient of the mean cross-entropy w.r.t. the softmax input."""
        batch_size = y.shape[0]
        one_hot = np.eye(Z.shape[1])[y]
        # The whole difference is averaged over the batch, not only the one-hot term.
        dA = (Z - one_hot) / batch_size
        return dA

class FC:
    """
    Fully connected layer mapping n_in inputs to n_out outputs.

    The initializer supplies ``W`` and ``B``; the optimizer's ``update`` is
    invoked at the end of every backward pass.
    """
    def __init__(self, n_in, n_out, initializer, optimizer):
        self.W = initializer.W(n_in, n_out)
        self.B = initializer.B(n_out)
        self.optimizer = optimizer
        # Input cache and gradient slots, populated by forward()/backward().
        self.X = None
        self.dW = None
        self.dB = None

    def forward(self, X):
        """Return ``X . W + B``; both the input and the result are cached."""
        self.X = X
        pre_activation = np.dot(X, self.W) + self.B
        self.A = pre_activation
        return pre_activation

    def backward(self, dA):
        """Store dW/dB, run the optimizer step, and return the input gradient."""
        # Computed first so it uses the weights from before the update.
        grad_input = np.dot(dA, self.W.T)
        self.dB = np.sum(dA, axis=0)
        self.dW = np.dot(self.X.T, dA)
        self.optimizer.update(self)
        return grad_input

class SimpleInitializer:
    """Gaussian initializer with a fixed standard deviation ``sigma``."""
    def __init__(self, sigma):
        self.sigma = sigma

    def W(self, n_in, n_out):
        """Draw an (n_in, n_out) weight matrix from N(0, sigma^2)."""
        shape = (n_in, n_out)
        return np.random.normal(loc=0, scale=self.sigma, size=shape)

    def B(self, n_out):
        """Return a zero bias vector of length n_out."""
        bias = np.zeros(n_out)
        return bias

class XavierInitializer:
    """Xavier initializer: Gaussian weights with sigma = 1 / sqrt(n_in)."""
    def W(self, n_in, n_out):
        """Draw an (n_in, n_out) weight matrix."""
        return np.random.normal(loc=0, scale=1.0 / np.sqrt(n_in), size=(n_in, n_out))

    def B(self, n_out):
        """Return a zero bias vector of length n_out."""
        return np.zeros(n_out)

class HeInitializer:
    """He initializer: Gaussian weights with sigma = sqrt(2 / n_in)."""
    def W(self, n_in, n_out):
        """Draw an (n_in, n_out) weight matrix."""
        return np.random.normal(loc=0, scale=np.sqrt(2.0 / n_in), size=(n_in, n_out))

    def B(self, n_out):
        """Return a zero bias vector of length n_out."""
        return np.zeros(n_out)

class Tanh:
    """Hyperbolic-tangent activation."""
    def forward(self, A):
        """Apply tanh element-wise; the output is cached for backward()."""
        activated = np.tanh(A)
        self.Z = activated
        return activated

    def backward(self, dZ):
        """Scale the incoming gradient by tanh'(A) = 1 - tanh(A)^2."""
        slope = 1 - self.Z**2
        return dZ * slope

class ReLU:
    """ReLU activation: max(0, x) element-wise."""
    def forward(self, A):
        """Clamp negatives to zero and remember where the input was positive."""
        positive = A > 0
        self.mask = positive
        return np.maximum(0, A)

    def backward(self, dZ):
        """Pass the gradient only through positions that were positive."""
        return dZ * self.mask

class Softmax:
    """Softmax activation, paired with cross-entropy for backpropagation."""
    def forward(self, A):
        """Return row-wise softmax probabilities of ``A`` (batch_size, n_nodes)."""
        # Subtracting the row maximum avoids overflow in np.exp without
        # changing the probabilities.
        exp_A = np.exp(A - np.max(A, axis=1, keepdims=True))
        self.Z = exp_A / np.sum(exp_A, axis=1, keepdims=True)
        return self.Z

    def backward(self, Z, y):
        """
        Gradient of the mean cross-entropy loss w.r.t. the softmax input.

        ``Z`` holds the probabilities (batch_size, n_nodes) and ``y`` the
        integer labels (batch_size,).  Returns ``(Z - one_hot(y)) / batch_size``.
        """
        batch_size = y.shape[0]
        one_hot = np.eye(Z.shape[1])[y]
        # The parentheses matter: the whole difference is averaged over the
        # batch, not just the one-hot term.
        dA = (Z - one_hot) / batch_size
        return dA

class SGD:
    """Stochastic gradient descent with a constant learning rate ``lr``."""
    def __init__(self, lr):
        self.lr = lr

    def update(self, layer):
        """Subtract ``lr`` times the stored gradients from the layer's W and B."""
        rate = self.lr
        layer.W -= rate * layer.dW
        layer.B -= rate * layer.dB

class AdaGrad:
    """
    AdaGrad optimizer: per-parameter step sizes scaled by the accumulated
    squared gradients.  The accumulators are stored on the optimizer itself.
    """
    def __init__(self, lr):
        self.lr = lr
        self.epsilon = 1e-7  # keeps the denominator non-zero
        self.hw = None  # running sum of dW^2
        self.hb = None  # running sum of dB^2

    def update(self, layer):
        """Accumulate squared gradients and apply the scaled step to W and B."""
        # First call: allocate accumulators shaped like the parameters.
        if self.hw is None:
            self.hw = np.zeros_like(layer.W)
            self.hb = np.zeros_like(layer.B)

        self.hw += layer.dW * layer.dW
        self.hb += layer.dB * layer.dB

        w_scale = 1 / (np.sqrt(self.hw) + self.epsilon)
        b_scale = 1 / (np.sqrt(self.hb) + self.epsilon)
        layer.W -= self.lr * w_scale * layer.dW
        layer.B -= self.lr * b_scale * layer.dB

# Load MNIST data as NumPy arrays (as_frame=False) rather than a DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data.astype(np.float32)
# Labels are cast to integers so they can be used to index probability rows.
y = mnist.target.astype(np.int64)

# Normalize pixel values to be between 0 and 1
X /= 255

# Split data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training data for validation (optional); this overwrites
# X_train / y_train with the remaining 80% of the first training split.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Network dimensions are derived from the data: one input per feature column,
# one output per distinct label.
n_features = X_train.shape[1]
n_output = len(np.unique(y))

def train_and_evaluate(layer_configs, n_features, n_output, X_train, y_train, X_test, y_test, n_iter=10, batch_size=64, random_state=42, verbose=True):
    """
    Build a network from ``layer_configs``, train it, plot the loss curve and
    report the accuracy on the test set.

    Returns
    ----------
    (accuracy, loss_history) : tuple
        Test accuracy and the per-epoch training loss recorded by the model.
    """
    # The layer count passed on includes one extra for the input layer.
    net = ScratchDeepNeuralNetrowkClassifier(
        len(layer_configs) + 1,
        layer_configs,
        n_features,
        n_output,
        random_state=random_state,
    )
    net.fit(X_train, y_train, n_iter=n_iter, batch_size=batch_size, verbose=verbose)

    predicted = net.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)

    # Loss curve: one point per epoch.
    history = net.loss_history
    plt.plot(history)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training Loss")
    plt.show()

    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy, history

# --- Problem 9 ---
# Three network configurations to compare. Every layer gets its own AdaGrad
# instance because the optimizer keeps per-layer gradient accumulators.
# The train_and_evaluate calls are commented out, so running this cell only
# prints the section headers.

# Network 1: Simple Network (2 layers, ReLU)
layer_config_1 = [
    {'type': 'FC', 'n_nodes': 128, 'activation': 'ReLU', 'initializer': HeInitializer(), 'optimizer': AdaGrad(lr=0.01)},
    {'type': 'FC', 'n_nodes': n_output, 'activation': 'Softmax', 'initializer': XavierInitializer(), 'optimizer': AdaGrad(lr=0.01)}
]
print("\nTraining Network 1 (2 layers, ReLU):")
#accuracy_1, loss_history_1 = train_and_evaluate(layer_config_1, n_features, n_output, X_train, y_train, X_test, y_test)

# Network 2: Deeper Network (3 layers, ReLU)
layer_config_2 = [
    {'type': 'FC', 'n_nodes': 256, 'activation': 'ReLU', 'initializer': HeInitializer(), 'optimizer': AdaGrad(lr=0.01)},
    {'type': 'FC', 'n_nodes': 128, 'activation': 'ReLU', 'initializer': HeInitializer(), 'optimizer': AdaGrad(lr=0.01)},
    {'type': 'FC', 'n_nodes': n_output, 'activation': 'Softmax', 'initializer': XavierInitializer(), 'optimizer': AdaGrad(lr=0.01)}
]
print("\nTraining Network 2 (3 layers, ReLU):")
#accuracy_2, loss_history_2 = train_and_evaluate(layer_config_2, n_features, n_output, X_train, y_train, X_test, y_test)

# Network 3: Simple Network (2 layers, Tanh); the Tanh layer uses Xavier initialization
layer_config_3 = [
    {'type': 'FC', 'n_nodes': 128, 'activation': 'Tanh', 'initializer': XavierInitializer(), 'optimizer': AdaGrad(lr=0.01)},
    {'type': 'FC', 'n_nodes': n_output, 'activation': 'Softmax', 'initializer': XavierInitializer(), 'optimizer': AdaGrad(lr=0.01)}
]
print("\nTraining Network 3 (2 layers, Tanh):")
#accuracy_3, loss_history_3 = train_and_evaluate(layer_config_3, n_features, n_output, X_train, y_train, X_test, y_test)

# --- End of Problem 9 ---

X_train shape: (44800, 784)
y_train shape: (44800,)
X_test shape: (14000, 784)
y_test shape: (14000,)

Training Network 1 (2 layers, ReLU):

Training Network 2 (3 layers, ReLU):

Training Network 3 (2 layers, Tanh):
