In [6]:
import numpy as np

RANDOM_SEED = 0

In [39]:
# 6.1.1: Calculating trace
# Re-create the generator from the fixed seed so this cell prints the same
# matrix every time it is run.
RNG = np.random.default_rng(seed=RANDOM_SEED)
A = RNG.standard_normal(size=(5, 5))
print("A =", A, "", sep="\n")

# "ii->" repeats the index (picks the diagonal) and has no output index (sums it).
trace = np.einsum("ii->", A)
print("deviation =", np.linalg.norm(trace - np.trace(A)))

A =
[[ 0.12573022 -0.13210486  0.64042265  0.10490012 -0.53566937]
 [ 0.36159505  1.30400005  0.94708096 -0.70373524 -1.26542147]
 [-0.62327446  0.04132598 -2.32503077 -0.21879166 -1.24591095]
 [-0.73226735 -0.54425898 -0.31630016  0.41163054  1.04251337]
 [-0.12853466  1.36646347 -0.66519467  0.35151007  0.90347018]]

deviation = 0.0


In [42]:
# 6.1.2: Calculating matrix product
# Both operands are drawn from one freshly seeded generator, A first, then B.
RNG = np.random.default_rng(seed=RANDOM_SEED)
A = RNG.standard_normal(size=(5, 5))
print("A =", A, "", sep="\n")

B = RNG.standard_normal(size=(5, 5))
print("B =", B, "", sep="\n")

# The shared index k is summed over; i and j survive into the output.
matmul = np.einsum("ik,kj->ij", A, B)
print("deviation =", np.linalg.norm(matmul - np.matmul(A, B)))

A =
[[ 0.12573022 -0.13210486  0.64042265  0.10490012 -0.53566937]
 [ 0.36159505  1.30400005  0.94708096 -0.70373524 -1.26542147]
 [-0.62327446  0.04132598 -2.32503077 -0.21879166 -1.24591095]
 [-0.73226735 -0.54425898 -0.31630016  0.41163054  1.04251337]
 [-0.12853466  1.36646347 -0.66519467  0.35151007  0.90347018]]

B =
[[ 0.0940123  -0.74349925 -0.92172538 -0.45772583  0.22019512]
 [-1.00961818 -0.20917557 -0.15922501  0.54084558  0.21465912]
 [ 0.35537271 -0.65382861 -0.12961363  0.78397547  1.49343115]
 [-1.25906553  1.51392377  1.34587542  0.7813114   0.26445563]
 [-0.31392281  1.45802068  1.96025832  1.80163487  1.31510376]]

deviation = 1.2729041056627091e-15


In [47]:
# 6.1.3: Calculating batchwise product
# A holds three 4x5 matrices and B three 5x6 matrices, drawn from one
# freshly seeded generator (A first, then B).
RNG = np.random.default_rng(seed=RANDOM_SEED)
A = RNG.standard_normal(size=(3, 4, 5))
print("A =", A, "", sep="\n")

B = RNG.standard_normal(size=(3, 5, 6))
print("B =", B, "", sep="\n")

# b is kept as a batch index, k is contracted within each batch entry.
batchmul = np.einsum("bik,bkj->bij", A, B)

# Reference result: one ordinary matrix product per batch entry.
manual_batchmul = np.array([A[b] @ B[b] for b in range(A.shape[0])])
print("deviation =", np.linalg.norm(batchmul - manual_batchmul))

A =
[[[ 0.12573022 -0.13210486  0.64042265  0.10490012 -0.53566937]
  [ 0.36159505  1.30400005  0.94708096 -0.70373524 -1.26542147]
  [-0.62327446  0.04132598 -2.32503077 -0.21879166 -1.24591095]
  [-0.73226735 -0.54425898 -0.31630016  0.41163054  1.04251337]]

 [[-0.12853466  1.36646347 -0.66519467  0.35151007  0.90347018]
  [ 0.0940123  -0.74349925 -0.92172538 -0.45772583  0.22019512]
  [-1.00961818 -0.20917557 -0.15922501  0.54084558  0.21465912]
  [ 0.35537271 -0.65382861 -0.12961363  0.78397547  1.49343115]]

 [[-1.25906553  1.51392377  1.34587542  0.7813114   0.26445563]
  [-0.31392281  1.45802068  1.96025832  1.80163487  1.31510376]
  [ 0.35738041 -1.20831863 -0.00445413  0.65647494 -1.28836146]
  [ 0.39512206  0.42986369  0.69604272 -1.18411797 -0.66170257]]]

B =
[[[-0.43643525 -1.16980191  1.73936788 -0.49591073  0.32896963
   -0.25857255]
  [ 1.58347288  1.32036099  0.63335262 -2.20350988  0.05202897
    0.68368619]
  [ 1.00396158 -0.61790704  1.82201136 -1.32043097 -0.66152

In [52]:
# Scratch check of the contraction pattern used by the convolution layer:
# i, j and c are summed out, leaving a (b, o) result.
t = np.arange(2 * 3 * 3 * 3).reshape(2, 3, 3, 3)
t2 = np.arange(3 * 3 * 3 * 4).reshape(3, 3, 3, 4)
np.einsum("bijc,ijco->bo", t, t2)

array([[24804, 25155, 25506, 25857],
       [62712, 63792, 64872, 65952]])

In [54]:
# Same contraction with all-ones operands: every entry of the result is the
# number of summed terms, 3 * 3 * 3 = 27.
t = np.ones((2, 3, 3, 3))
t2 = np.ones((3, 3, 3, 4))
np.einsum("bijc,ijco->bo", t, t2)

array([[27., 27., 27., 27.],
       [27., 27., 27., 27.]])

In [61]:
# Reducing with argmax over axis 1 drops that axis from the shape.
t = np.arange(2 * 3 * 3 * 3).reshape(2, 3, 3, 3)
t.argmax(axis=1).shape

(2, 3, 3)

## Activation Function Implementations:

Implementation of `activations.Linear`:

```python
class Linear(Activation):
    """Identity activation, f(z) = z."""

    def __init__(self):
        super().__init__()

    def forward(self, Z: np.ndarray) -> np.ndarray:
        """Apply the identity activation.

        Parameters
        ----------
        Z  input pre-activations (any shape)

        Returns
        -------
        `Z` itself (the same object, not a copy)
        """
        return Z

    def backward(self, Z: np.ndarray, dY: np.ndarray) -> np.ndarray:
        """Backpropagate through the identity activation.

        Parameters
        ----------
        Z   input that was given to `forward` (not used here)
        dY  gradient of loss w.r.t. the output of this layer
            same shape as `Z`

        Returns
        -------
        `dY` unchanged, because the derivative of f(z) = z is 1
        """
        return dY

```

Implementation of `activations.Sigmoid`:

```python
class Sigmoid(Activation):
    """Logistic sigmoid activation, f(z) = 1 / (1 + exp(-z))."""

    def __init__(self):
        super().__init__()

    def forward(self, Z: np.ndarray) -> np.ndarray:
        """Forward pass for sigmoid function:
        f(z) = 1 / (1 + exp(-z))

        Parameters
        ----------
        Z  input pre-activations (any shape)

        Returns
        -------
        f(z) as described above applied elementwise to `Z`
        """
        # exp(-|z|) always lies in (0, 1], so neither branch below can
        # overflow, unlike a direct exp(-z) for large negative z.
        decay = np.exp(-np.abs(Z))
        # z >= 0:  1 / (1 + exp(-z))
        # z <  0:  exp(z) / (1 + exp(z))   (algebraically the same value)
        return np.where(Z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def backward(self, Z: np.ndarray, dY: np.ndarray) -> np.ndarray:
        """Backward pass for sigmoid.

        Parameters
        ----------
        Z   input to `forward` method
        dY  gradient of loss w.r.t. the output of this layer
            same shape as `Z`

        Returns
        -------
        gradient of loss w.r.t. input of this layer
        """
        # f'(z) = f(z) * (1 - f(z)), applied elementwise (chain rule with dY).
        sigma = self.forward(Z)
        return dY * sigma * (1.0 - sigma)

```

Implementation of `activations.ReLU`:

```python
class ReLU(Activation):
    """Rectified linear unit activation, f(z) = max(z, 0)."""

    def __init__(self):
        super().__init__()

    def forward(self, Z: np.ndarray) -> np.ndarray:
        """Apply the rectifier elementwise:
        f(z) = z if z >= 0
               0 otherwise

        Parameters
        ----------
        Z  input pre-activations (any shape)

        Returns
        -------
        array of the same shape as `Z` with negative entries replaced by 0
        """
        rectified = np.maximum(Z, 0)
        return rectified

    def backward(self, Z: np.ndarray, dY: np.ndarray) -> np.ndarray:
        """Backpropagate through the rectifier.

        Parameters
        ----------
        Z   input that was given to `forward`
        dY  gradient of loss w.r.t. the output of this layer
            same shape as `Z`

        Returns
        -------
        gradient of loss w.r.t. input of this layer: `dY` where `Z` is
        strictly positive, 0 elsewhere (including at `Z == 0`)
        """
        is_active = Z > 0
        return dY * is_active

```

Implementation of `activations.SoftMax`:

```python
class SoftMax(Activation):
    """Softmax activation, normalizing along the last axis."""

    def __init__(self):
        super().__init__()

    def forward(self, Z: np.ndarray) -> np.ndarray:
        """Forward pass for softmax activation.
        Hint: The naive implementation might not be numerically stable.

        Parameters
        ----------
        Z  input pre-activations (any shape); the last axis indexes the
           entries that are normalized together

        Returns
        -------
        array of the same shape as `Z` whose entries along the last axis are
        exp(z_i) / sum_j exp(z_j)
        """
        # Subtracting the per-row maximum leaves the result unchanged but
        # keeps every exponent <= 0, so np.exp cannot overflow.
        shifted = Z - np.max(Z, axis=-1, keepdims=True)
        exponentiated = np.exp(shifted)
        return exponentiated / np.sum(exponentiated, axis=-1, keepdims=True)

    def backward(self, Z: np.ndarray, dY: np.ndarray) -> np.ndarray:
        """Backward pass for softmax activation.

        Parameters
        ----------
        Z   input to `forward` method
        dY  gradient of loss w.r.t. the output of this layer
            same shape as `Z`

        Returns
        -------
        gradient of loss w.r.t. input of this layer, same shape as `Z`
        """
        # The Jacobian of softmax for one row is diag(s) - s s^T.  Multiplying
        # dY by it gives, per entry i:
        #     dZ_i = s_i * dY_i - s_i * sum_j(dY_j * s_j)
        # which is evaluated directly here instead of materializing a k x k
        # matrix per sample.  Working along the last axis makes this valid
        # for any number of leading (batch) dimensions.
        sigma = self.forward(Z)
        weighted = dY * sigma
        return weighted - sigma * np.sum(weighted, axis=-1, keepdims=True)

```


## Layer Implementations:

Implementation of `layers.FullyConnected`:

```python
class FullyConnected(Layer):
    """Dense layer: computes activation(X @ W + b) for a learned weight
    matrix W and bias row b.
    """

    def __init__(
        self, n_out: int, activation: str, weight_init="xavier_uniform"
    ) -> None:

        super().__init__()
        # the input width is unknown until the first batch arrives
        self.n_in = None
        self.n_out = n_out
        self.activation = initialize_activation(activation)

        # callable that produces an initial weight array for a given shape
        self.init_weights = initialize_weights(weight_init, activation=activation)

    def _init_parameters(self, X_shape: Tuple[int, int]) -> None:
        """Create the weights, bias, cache and gradient containers.

        `X_shape[1]` is taken as the input dimension of the layer.
        """
        self.n_in = X_shape[1]

        ### BEGIN YOUR CODE ###

        weights = self.init_weights((self.n_in, self.n_out))
        bias = np.zeros((1, self.n_out))

        # DO NOT CHANGE THE KEYS
        self.parameters = OrderedDict(W=weights, b=bias)
        # values stored by `forward` for use in `backward`
        self.cache = OrderedDict(Z=[], X=[])
        # one zero-filled gradient per parameter
        # MUST HAVE THE SAME KEYS AS `self.parameters`
        self.gradients = OrderedDict(
            W=np.zeros_like(weights), b=np.zeros_like(bias)
        )

        ### END YOUR CODE ###

    def forward(self, X: np.ndarray) -> np.ndarray:
        """Affine transform followed by the activation function.

        The pre-activation `Z` and the input `X` are written to `self.cache`
        so that `backward` can use them.

        Parameters
        ----------
        X  input matrix of shape (batch_size, input_dim)

        Returns
        -------
        a matrix of shape (batch_size, output_dim)
        """
        # parameters are created lazily from the first input's shape
        if self.n_in is None:
            self._init_parameters(X.shape)

        ### BEGIN YOUR CODE ###

        weights = self.parameters["W"]
        bias = self.parameters["b"]

        # the (1, output_dim) bias row broadcasts over the batch dimension
        Z = np.matmul(X, weights) + bias
        out = self.activation(Z)

        # remember what the backward pass needs
        self.cache.update(Z=Z, X=X)

        ### END YOUR CODE ###

        return out

    def backward(self, dLdY: np.ndarray) -> np.ndarray:
        """Backpropagate through the activation and the affine transform.

        Stores the weight and bias gradients in `self.gradients` and returns
        the gradient with respect to the layer input.

        Parameters
        ----------
        dLdY  gradient of the loss with respect to the output of this layer
              shape (batch_size, output_dim)

        Returns
        -------
        gradient of the loss with respect to the input of this layer
        shape (batch_size, input_dim)
        """
        ### BEGIN YOUR CODE ###

        weights = self.parameters["W"]
        pre_activation = self.cache["Z"]
        layer_input = self.cache["X"]

        # gradient w.r.t. the pre-activation Z
        dZ = self.activation.backward(pre_activation, dLdY)

        weight_grad = np.matmul(layer_input.T, dZ)
        # summing over the batch axis with keepdims gives a (1, output_dim)
        # row, matching the shape of the bias
        bias_grad = dZ.sum(axis=0, keepdims=True)
        dX = np.matmul(dZ, weights.T)

        # gradients are stored under the same keys as their parameters
        self.gradients["W"] = weight_grad
        self.gradients["b"] = bias_grad

        ### END YOUR CODE ###

        return dX

```

Implementation of `layers.Pool2D`:

```python
class Pool2D(Layer):
    """Pooling layer, implements max and average pooling."""

    def __init__(
        self,
        kernel_shape: Tuple[int, int],
        mode: str = "max",
        stride: int = 1,
        pad: Union[int, Literal["same"], Literal["valid"]] = 0,
    ) -> None:
        """Configure the pooling window, stride, padding and pooling mode.

        Parameters
        ----------
        kernel_shape  (height, width) of the pooling window; a single int is
                      expanded to a square window
        mode          "max" or "average"
        stride        step between consecutive windows
        pad           "same", "valid", or an int applied to both spatial axes

        Raises
        ------
        ValueError  if `pad` or `mode` is not one of the supported values
        """
        # NOTE(review): unlike the other layers shown alongside this class,
        # this constructor does not call super().__init__() -- confirm that
        # this is intentional.
        if isinstance(kernel_shape, int):
            kernel_shape = (kernel_shape, kernel_shape)

        self.kernel_shape = kernel_shape
        self.stride = stride

        if pad == "same":
            self.pad = ((kernel_shape[0] - 1) // 2, (kernel_shape[1] - 1) // 2)
        elif pad == "valid":
            self.pad = (0, 0)
        elif isinstance(pad, int):
            self.pad = (pad, pad)
        else:
            raise ValueError("Invalid Pad mode found in self.pad.")

        self.mode = mode

        if mode == "max":
            self.pool_fn = np.max
            self.arg_pool_fn = np.argmax
        elif mode == "average":
            self.pool_fn = np.mean
        else:
            # Fail at construction time instead of with an AttributeError on
            # the missing `pool_fn` during the first forward pass.
            raise ValueError(
                f"Invalid pooling mode {mode!r}; expected 'max' or 'average'."
            )

        self.cache = {
            "out_rows": [],
            "out_cols": [],
            "X_pad": [],
            "p": [],
            "pool_shape": [],
        }
        self.parameters = {}
        self.gradients = {}

    def forward(self, X: np.ndarray) -> np.ndarray:
        """Forward pass: use the pooling function to aggregate local information
        in the input. This layer typically reduces the spatial dimensionality of
        the input while keeping the number of feature maps the same.

        The output size and the zero-padded input are cached for the backward
        pass.

        Parameters
        ----------
        X  input array of shape (batch_size, in_rows, in_cols, channels)

        Returns
        -------
        pooled array of shape (batch_size, out_rows, out_cols, channels)
        """
        ### BEGIN YOUR CODE ###

        n_examples, in_rows, in_cols, in_channels = X.shape
        kernel_height, kernel_width = self.kernel_shape
        out_rows = int((in_rows + 2 * self.pad[0] - kernel_height) / self.stride) + 1
        out_cols = int((in_cols + 2 * self.pad[1] - kernel_width) / self.stride) + 1

        # zero-pad the two spatial axes only; batch and channel axes are untouched
        X_pad = np.pad(X, ((0,), (self.pad[0],), (self.pad[1],), (0,)))
        X_pool = np.zeros((n_examples, out_rows, out_cols, in_channels))

        for row in range(out_rows):
            y = row * self.stride
            for col in range(out_cols):
                x = col * self.stride
                # reduce each window over its two spatial axes
                window = X_pad[:, y:y + kernel_height, x:x + kernel_width, :]
                X_pool[:, row, col, :] = self.pool_fn(window, axis=(1, 2))

        # cache any values required for backprop
        self.cache["out_rows"] = out_rows
        self.cache["out_cols"] = out_cols
        self.cache["X_pad"] = X_pad

        ### END YOUR CODE ###

        return X_pool

    def backward(self, dLdY: np.ndarray) -> np.ndarray:
        """Backward pass for pooling layer.

        Parameters
        ----------
        dLdY  gradient of loss with respect to the output of this layer
              shape (batch_size, out_rows, out_cols, channels)

        Returns
        -------
        gradient of loss with respect to the input of this layer
        shape (batch_size, in_rows, in_cols, channels)
        """
        ### BEGIN YOUR CODE ###

        # unpack the cache
        out_rows = self.cache["out_rows"]
        out_cols = self.cache["out_cols"]
        X_pad = self.cache["X_pad"]

        h, w = self.kernel_shape
        n_examples, in_rows, in_cols, in_channels = X_pad.shape
        # recover the unpadded spatial size from the padded input
        in_rows = in_rows - 2 * self.pad[0]
        in_cols = in_cols - 2 * self.pad[1]

        # The gradient buffer must be floating point: with an integer input,
        # a buffer of the input's dtype would make the in-place float
        # accumulation below raise a casting error.
        if np.issubdtype(X_pad.dtype, np.inexact):
            grad_dtype = X_pad.dtype
        else:
            grad_dtype = np.result_type(dLdY.dtype, np.float64)
        dX_pad = np.zeros(X_pad.shape, dtype=grad_dtype)

        for row in range(out_rows):
            y = row * self.stride
            for col in range(out_cols):
                x = col * self.stride
                # upstream gradient for this output position, shaped
                # (batch, 1, 1, channels) so it broadcasts over the window
                upstream = dLdY[:, row:row + 1, col:col + 1, :]
                if self.mode == "max":
                    # flatten each window so argmax yields one index per
                    # (example, channel) pair
                    flattened_pad = X_pad[:, y:y + h, x:x + w, :].reshape(
                        (n_examples, h * w, in_channels)
                    )
                    kernel_idxs = self.arg_pool_fn(flattened_pad, axis=1)
                    batch_idxs, channel_idxs = np.indices((n_examples, in_channels))
                    # one-hot mask marking the (first) maximum of each window
                    mask = np.zeros_like(flattened_pad)
                    mask[batch_idxs, kernel_idxs, channel_idxs] = 1
                    mask = mask.reshape((n_examples, h, w, in_channels))
                    # only the selected element receives the gradient
                    dX_pad[:, y:y + h, x:x + w, :] += mask * upstream
                elif self.mode == "average":
                    # every element of the window contributed equally
                    dX_pad[:, y:y + h, x:x + w, :] += upstream / (h * w)

        # strip the padding again
        dX = dX_pad[:, self.pad[0]:self.pad[0] + in_rows, self.pad[1]:self.pad[1] + in_cols, :]

        # this layer has no parameters, so `self.gradients` stays empty

        ### END YOUR CODE ###

        return dX

```

Implementation of `layers.Conv2D.__init__`:

```python
    def __init__(
        self,
        n_out: int,
        kernel_shape: Tuple[int, int],
        activation: str,
        stride: int = 1,
        pad: str = "same",
        weight_init: str = "xavier_uniform",
    ) -> None:
        """Store the layer configuration.

        No parameters are created here; `n_in` stays `None` until
        `_init_parameters` is called with an input shape.
        """
        super().__init__()

        # channel counts: input side is filled in later
        self.n_in, self.n_out = None, n_out

        # geometry of the convolution
        self.kernel_shape = kernel_shape
        self.stride = stride
        # kept as given for now; `_init_parameters` turns it into a pair of ints
        self.pad = pad

        # helpers looked up by name
        self.activation = initialize_activation(activation)
        self.init_weights = initialize_weights(weight_init, activation=activation)

```

Implementation of `layers.Conv2D._init_parameters`:

```python
    def _init_parameters(self, X_shape: Tuple[int, int, int, int]) -> None:
        """Create weights, bias, cache and gradients, then resolve `self.pad`
        into a (row_pad, col_pad) pair.

        `X_shape[3]` is taken as the number of input channels.

        Raises
        ------
        ValueError  if `self.pad` is neither "same", "valid", nor an int
        """
        self.n_in = X_shape[3]

        # filter bank layout: (kernel_rows, kernel_cols, in_channels, out_channels)
        W_shape = self.kernel_shape + (self.n_in, self.n_out)
        weights = self.init_weights(W_shape)
        bias = np.zeros((1, self.n_out))

        # DO NOT CHANGE THE KEYS
        self.parameters = OrderedDict(W=weights, b=bias)
        # values stored by `forward` for use in `backward`
        self.cache = OrderedDict(Z=[], X=[])
        # one zero-filled gradient per parameter
        # MUST HAVE THE SAME KEYS AS `self.parameters`
        self.gradients = OrderedDict(
            W=np.zeros_like(weights), b=np.zeros_like(bias)
        )

        requested_pad = self.pad
        if requested_pad == "same":
            kernel_rows, kernel_cols = W_shape[0], W_shape[1]
            self.pad = ((kernel_rows - 1) // 2, (kernel_cols - 1) // 2)
        elif requested_pad == "valid":
            self.pad = (0, 0)
        elif isinstance(requested_pad, int):
            self.pad = (requested_pad, requested_pad)
        else:
            raise ValueError("Invalid Pad mode found in self.pad.")

```

Implementation of `layers.Conv2D.forward`:

```python
    def forward(self, X: np.ndarray) -> np.ndarray:
        """Convolve `X` with the layer's filters, add the bias and apply the
        activation function.

        The input is zero-padded on its two spatial axes by `self.pad` and the
        filter is moved in steps of `self.stride`. The unpadded input `X` and
        the pre-activation `Z` are stored in `self.cache` for the backward
        pass.

        Parameters
        ----------
        X  input with shape (batch_size, in_rows, in_cols, in_channels)

        Returns
        -------
        output feature maps with shape (batch_size, out_rows, out_cols, out_channels)
        """
        # parameters are created lazily from the first input's shape
        if self.n_in is None:
            self._init_parameters(X.shape)

        weights = self.parameters["W"]
        bias = self.parameters["b"]

        k_rows, k_cols, _, n_filters = weights.shape
        batch, in_rows, in_cols, _ = X.shape
        pad_rows, pad_cols = self.pad[0], self.pad[1]
        step = self.stride

        ### BEGIN YOUR CODE ###

        # pad only the spatial axes; batch and channel axes are untouched
        padded = np.pad(X, ((0,), (pad_rows,), (pad_cols,), (0,)))

        out_rows = int((in_rows + 2 * pad_rows - k_rows) / step) + 1
        out_cols = int((in_cols + 2 * pad_cols - k_cols) / step) + 1

        Z = np.zeros((batch, out_rows, out_cols, n_filters))

        # visit the output positions in row-major order
        for row, col in np.ndindex(out_rows, out_cols):
            top, left = row * step, col * step
            window = padded[:, top:top + k_rows, left:left + k_cols, :]
            # contract window and filters over (i, j, c); the (1, n_filters)
            # bias row broadcasts over the batch
            Z[:, row, col, :] = np.einsum("bijc,ijco->bo", window, weights) + bias

        out = self.activation(Z)

        # cache any values required for backprop
        self.cache["X"] = X
        self.cache["Z"] = Z

        ### END YOUR CODE ###

        return out

```

Implementation of `layers.Conv2D.backward`:

```python
    def backward(self, dLdY: np.ndarray) -> np.ndarray:
        """Backward pass for conv layer. Computes the gradients of the loss
        with respect to the input feature maps as well as the filter weights
        and biases.

        The weight and bias gradients are stored in `self.gradients`; the
        input gradient is returned.

        Parameters
        ----------
        dLdY  gradient of loss with respect to output of this layer
              shape (batch_size, out_rows, out_cols, out_channels)

        Returns
        -------
        gradient of the loss with respect to the input of this layer
        shape (batch_size, in_rows, in_cols, in_channels)
        """
        ### BEGIN YOUR CODE ###

        # unpack the cache
        W = self.parameters["W"]
        Z = self.cache["Z"]
        X = self.cache["X"]
        # the cache holds the unpadded input, so rebuild the padded version
        X_pad = np.pad(X, ((0,), (self.pad[0],), (self.pad[1],), (0,)))

        kernel_height, kernel_width, in_channels, out_channels = W.shape
        n_examples, in_rows, in_cols, in_channels = X.shape

        out_rows = int((in_rows + 2 * self.pad[0] - kernel_height) / self.stride) + 1
        out_cols = int((in_cols + 2 * self.pad[1] - kernel_width) / self.stride) + 1

        # gradient w.r.t. the pre-activation
        dZ = self.activation.backward(Z, dLdY)
        # the bias is shared by every example and position: sum those axes out
        db = np.sum(dZ, axis=(0, 1, 2)).reshape(1, -1)
        dW = np.zeros_like(W)

        # The input-gradient buffer must be able to hold the float products
        # accumulated below.  Reusing an integer input dtype (e.g. raw image
        # data) would make the in-place `+=` raise a casting error.
        if np.issubdtype(X_pad.dtype, np.inexact):
            grad_dtype = X_pad.dtype
        else:
            grad_dtype = np.result_type(dZ.dtype, W.dtype)
        dX_pad = np.zeros(X_pad.shape, dtype=grad_dtype)

        for row in range(out_rows):
            y = row * self.stride
            for col in range(out_cols):
                x = col * self.stride
                # dZ at this output position, kept as (batch, 1, 1, out_channels)
                dZ_here = dZ[:, row:row + 1, col:col + 1, :]
                window = X_pad[:, y:y + kernel_height, x:x + kernel_width, :]
                # each filter weight sees the matching window element, summed
                # over the batch
                dW += np.einsum("bijo,bklc->klco", dZ_here, window)
                # each window element receives dZ weighted by the filter,
                # summed over output channels
                dX_pad[:, y:y + kernel_height, x:x + kernel_width, :] += np.einsum(
                    "bijo,klco->bklc", dZ_here, W
                )

        # strip the padding again
        dX = dX_pad[:, self.pad[0]:self.pad[0] + in_rows, self.pad[1]:self.pad[1] + in_cols, :]

        # gradients are stored under the same keys as their parameters
        self.gradients["W"] = dW
        self.gradients["b"] = db

        ### END YOUR CODE ###

        return dX

```


## Loss Function Implementations:

Implementation of `losses.CrossEntropy`:

```python
class CrossEntropy(Loss):
    """Cross entropy loss function."""

    def __init__(self, name: str) -> None:
        self.name = name

    def __call__(self, Y: np.ndarray, Y_hat: np.ndarray) -> float:
        """Shorthand for `forward(Y, Y_hat)`."""
        return self.forward(Y, Y_hat)

    def forward(self, Y: np.ndarray, Y_hat: np.ndarray) -> float:
        """Computes the loss for predictions `Y_hat` given one-hot encoded labels
        `Y`.

        Parameters
        ----------
        Y      one-hot encoded labels of shape (batch_size, num_classes)
        Y_hat  model predictions in range (0, 1) of shape (batch_size, num_classes)

        Returns
        -------
        a single float representing the loss, averaged over the batch
        """
        # 5e-324 is the smallest positive float64.  Adding it leaves every
        # ordinary probability unchanged but turns an exact 0 into a value
        # whose log is finite, so `0 * log(0)` cannot produce NaN.
        smallest_positive = 5e-324
        return - np.sum(Y * np.log(Y_hat + smallest_positive)) / Y.shape[0]

    def backward(self, Y: np.ndarray, Y_hat: np.ndarray) -> np.ndarray:
        """Backward pass of cross-entropy loss.
        NOTE: This is correct ONLY when the activation function of the output
        layer is SoftMax.

        Parameters
        ----------
        Y      one-hot encoded labels of shape (batch_size, num_classes)
        Y_hat  model predictions in range (0, 1) of shape (batch_size, num_classes)

        Returns
        -------
        the gradient of the cross-entropy loss with respect to the vector of
        predictions, `Y_hat`
        """
        # dL/dY_hat = -Y / (Y_hat * batch_size).
        denominator = Y_hat * Y.shape[0]
        grad = np.zeros(
            np.broadcast(Y, denominator).shape,
            dtype=np.result_type(Y, denominator, np.float64),
        )
        # Entries with Y == 0 do not contribute to the loss, so their gradient
        # is exactly 0.  Dividing only where Y != 0 avoids the 0/0 = NaN that
        # appears when a prediction has underflowed to exactly 0 for a class
        # that is not the label.
        np.divide(-Y, denominator, out=grad, where=(Y != 0))
        return grad

```


## Model Implementations:

Implementation of `models.NeuralNetwork.forward`:

```python
    def forward(self, X: np.ndarray) -> np.ndarray:
        """Run the input through every layer of the network in order.

        Parameters
        ----------
        X  design matrix whose shape must match the input shape required by
           the first layer

        Returns
        -------
        output of the last layer (or `X` itself if there are no layers)
        """
        ### YOUR CODE HERE ###
        # Each layer consumes the previous layer's output.
        activations = X
        for layer in self.layers:
            activations = layer.forward(activations)
        return activations

```

Implementation of `models.NeuralNetwork.backward`:

```python
    def backward(self, target: np.ndarray, out: np.ndarray) -> float:
        """Compute the loss and backpropagate its gradient through all layers.

        The loss is evaluated here (not in `self.forward`). Its gradient with
        respect to `out` is then handed to each layer's `backward` method,
        from the last layer to the first; each layer returns the gradient to
        pass on to the layer before it.

        Note: Both input arrays have the same shape.

        Parameters
        ----------
        target  the targets we are trying to fit to (e.g., training labels)
        out     the predictions of the model on training data

        Returns
        -------
        the loss of the model given the training inputs and targets
        """
        ### YOUR CODE HERE ###
        # Loss value first, then its gradient w.r.t. the predictions.
        loss_value = self.loss(target, out)
        upstream_grad = self.loss.backward(target, out)

        # Walk the layers back to front, threading the gradient through.
        for layer in reversed(self.layers):
            upstream_grad = layer.backward(upstream_grad)
        return loss_value

```

Implementation of `models.NeuralNetwork.predict`:

```python
    def predict(self, X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, float]:
        """Compute predictions for `X` and the loss of those predictions
        against `Y`.

        The loss comes from `self.backward`, so calling this method also runs
        a backward pass.

        Parameters
        ----------
        X  input features
        Y  targets (same length as `X`)

        Returns
        -------
        a tuple of the prediction and loss
        """
        ### YOUR CODE HERE ###
        predictions = self.forward(X)
        # `backward` returns the loss as its result
        loss_value = self.backward(Y, predictions)
        return predictions, loss_value

```

