##3.2 Object-Oriented Design for Implementation

In [None]:
pip install d2l

In [None]:
import time
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l

###3.2.1 Utilities

In [None]:
def add_to_class(Class): #save
  """Register functions as methods in created class.

  Returns a decorator that attaches the decorated object to `Class` under
  the object's own `__name__`. The decorator hands the object back so the
  decorated module-level name stays bound to it instead of becoming `None`.
  """
  def wrapper(obj):
    setattr(Class, obj.__name__, obj)
    # A decorator's return value replaces the decorated name; return the
    # object itself so that name remains usable.
    return obj
  return wrapper

In [None]:
class A:
  """Toy class whose instances carry a single attribute `b` equal to 1."""

  def __init__(self):
    self.b = 1


# Instance created before any extra method is attached to the class.
a = A()

In [None]:
@add_to_class(A)
def do(self):
  """Print the value of the instance attribute `b`."""
  print('Class attribute "b" is', self.b)

# `a` was created before `do` was attached to class A; the method is still
# found because it is looked up on the class.
a.do()

In [None]:
class HyperParameters: #save
  """The base class of hyperparameters."""
  def save_hyperparameters(self, ignore=[]):
    """Placeholder for saving hyperparameters; not implemented here.

    Args:
      ignore: names to skip. Unused (and never mutated) in this stub.

    Raises:
      NotImplementedError: always.
    """
    # `NotImplemented` is a comparison sentinel, not an exception; raising
    # it produces a TypeError. NotImplementedError is the intended signal.
    raise NotImplementedError

In [None]:
# Call the fully implemented HyperParameters class saved in d2l
class B(d2l.HyperParameters):
  """Demo of `save_hyperparameters` with the argument `c` ignored."""
  def __init__(self, a, b, c):
    # After this call the prints below read self.a and self.b, and check
    # that no attribute `c` was created.
    self.save_hyperparameters(ignore=['c'])
    print('self.a =', self.a, 'self.b =', self.b)
    print('There is no self.c =', not hasattr(self, 'c'))

b = B(a=1, b=2, c=3)

In [None]:
class ProgressBoard(d2l.HyperParameters): #save
  """The board that plots data points in animation."""
  def __init__(self, xlabel=None, ylabel=None, xlim=None,
               ylim=None, xscale='linear', yscale='linear',
               ls=['-', '--', '-.', ':'], colors=['C0', 'C1', 'C2', 'C3'],
               fig = None, axes = None, figsize=(3.5, 2.5), display=True):
    # NOTE: the list defaults for `ls` and `colors` are shared between
    # instances; they are only stored here, so treat them as read-only.
    self.save_hyperparameters()

  def draw(self, x, y, label, every_n=1):
    """Placeholder for plotting the point (x, y) under `label`.

    Raises:
      NotImplementedError: always; this class is only an interface sketch.
    """
    # `raise NotImplemented` would raise TypeError, because NotImplemented
    # is not an exception class.
    raise NotImplementedError

In [None]:
# Draw sin(x) and cos(x) on a d2l progress board for x = 0, 0.1, ..., 9.9.
# The two curves are drawn with different `every_n` values.
board = d2l.ProgressBoard('x')
for x in np.arange(0, 10, 0.1):
  board.draw(x, np.sin(x), 'sin', every_n=2)
  board.draw(x, np.cos(x), 'cos', every_n=10)

###3.2.2 Models

In [None]:
class Module(nn.Module, d2l.HyperParameters): #save
  """The base class of models.

  Subclasses provide `loss` and `configure_optimizers`, and either define
  `self.net` or override `forward`. `plot` needs `self.trainer`, which
  `Trainer.prepare_model` assigns.
  """
  def __init__(self, plot_train_per_epoch=2, plot_valid_per_epoch=1):
    super().__init__()
    self.save_hyperparameters()
    # The ProgressBoard class defined in this file is only a stub whose
    # draw() raises, so use the fully implemented board from d2l.
    self.board = d2l.ProgressBoard()

  def loss(self, y_hat, y):
    """Return the loss for predictions `y_hat` and targets `y`.

    Raises:
      NotImplementedError: always; subclasses must override.
    """
    raise NotImplementedError

  def forward(self, x):
    """Apply `self.net` to `x`; requires that `self.net` exists."""
    assert hasattr(self, 'net'), 'Neural network is not defined'
    return self.net(x)

  def plot(self, key, value, train):
    """Plot a point in animation.

    Args:
      key: metric name; prefixed with 'train_' or 'val_' for the label.
      value: tensor holding the metric; moved to CPU and converted to numpy.
      train: True for a training point, False for a validation point.
    """
    assert hasattr(self, 'trainer'), 'Trainer is not inited'
    self.board.xlabel = 'epoch'
    if train:
      # x position: number of training batches seen, in units of epochs.
      x = self.trainer.train_batch_idx / \
          self.trainer.num_train_batches
      n = self.trainer.num_train_batches / \
          self.plot_train_per_epoch
    else:
      x = self.trainer.epoch + 1
      n = self.trainer.num_val_batches / \
          self.plot_valid_per_epoch
    self.board.draw(x, value.to(d2l.cpu()).detach().numpy(),
                    ('train_' if train else 'val_') + key,
                    every_n=int(n))

  def training_step(self, batch):
    """Compute, plot and return the loss on one training batch.

    The last element of `batch` is the target; the rest are model inputs.
    """
    l = self.loss(self(*batch[:-1]), batch[-1])
    self.plot('loss', l, train=True)
    return l

  def validation_step(self, batch):
    """Compute and plot the loss on one validation batch."""
    l = self.loss(self(*batch[:-1]), batch[-1])
    self.plot('loss', l, train=False)

  def configure_optimizers(self):
    """Return the optimizer for this model.

    Raises:
      NotImplementedError: always; subclasses must override.
    """
    raise NotImplementedError

###3.2.3 Data

In [None]:
class DataModule(d2l.HyperParameters): #save
  """The base class of data."""
  def __init__(self, root='../data', num_workers=4):
    self.save_hyperparameters()

  def get_dataloader(self, train):
    """Return a data loader for the training (`train=True`) or validation split.

    Raises:
      NotImplementedError: always; subclasses must override.
    """
    raise NotImplementedError

  def train_dataloader(self):
    """Return the training data loader."""
    return self.get_dataloader(train=True)

  def val_dataloader(self):
    """Return the validation data loader."""
    return self.get_dataloader(train=False)


# Backward-compatible alias for the original, misspelled class name.
fataModule = DataModule

###3.2.4 Training

In [None]:
class Trainer(d2l.HyperParameters): #save
  """The base class for training model with data."""
  def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
    self.save_hyperparameters()
    assert num_gpus == 0, 'No GPU support yet'

  def prepare_data(self, data):
    """Fetch both data loaders from `data` and record their batch counts."""
    self.train_dataloader = data.train_dataloader()
    self.val_dataloader = data.val_dataloader()
    self.num_train_batches = len(self.train_dataloader)
    # A missing validation loader counts as zero validation batches.
    self.num_val_batches = (len(self.val_dataloader)
                            if self.val_dataloader is not None else 0)

  def prepare_model(self, model):
    """Link `model` and this trainer and set the plot's x-axis range."""
    model.trainer = self
    model.board.xlim = [0, self.max_epochs]
    self.model = model

  def fit(self, model, data):
    """Run `fit_epoch` once per epoch for `max_epochs` epochs."""
    self.prepare_data(data)
    self.prepare_model(model)
    self.optim = model.configure_optimizers()
    self.epoch = 0
    # Must be `train_batch_idx`: that is the counter the epoch loop
    # increments, so it has to be initialised under the same name.
    self.train_batch_idx = 0
    self.val_batch_idx = 0
    # The loop variable is the attribute itself, so self.epoch always holds
    # the current epoch index while fit_epoch runs.
    for self.epoch in range(self.max_epochs):
      self.fit_epoch()

  def fit_epoch(self):
    """Train (and optionally validate) for one epoch.

    Raises:
      NotImplementedError: always in this base sketch.
    """
    raise NotImplementedError

##Discussion and Review 3.2

###3.2.2

Key Features of the Module Class

- Inheritance from `nn.Module`: Ensures compatibility with PyTorch, leveraging its built-in functionalities for `forward()` calls and automatic parameter registration.
- Loss Computation and Visualization: The `training_step` and `validation_step` methods manage loss calculations and interactive plotting.
- Optimizer Configuration: The `configure_optimizers` method is designed to be implemented in specific models to specify the optimization algorithms.


###3.2.4

Key Features of the Trainer Class

- Data Preparation: The `prepare_data` method sets up data loaders for both training and validation.
- Model Preparation: The `prepare_model` method links the model to the trainer, configuring plotting parameters.
- Training Loop: The `fit` method orchestrates the training process over a specified number of epochs, coordinating data and model interactions.


##3.4 Linear Regression Implementation from Scratch

In [None]:
%matplotlib inline
import torch
from d2l import torch as d2l

###3.4.1 Defining the Model

In [None]:
class LinearRegressionScratch(d2l.Module): #save
  """Linear regression written from scratch.

  Holds a weight tensor `w` of shape (num_inputs, 1), drawn from a normal
  distribution with mean 0 and standard deviation `sigma`, and a bias `b`
  initialised to zero. Both tensors track gradients.
  """
  def __init__(self, num_inputs, lr, sigma=0.01):
    super().__init__()
    self.save_hyperparameters()
    weight_shape = (num_inputs, 1)
    self.w = torch.normal(0, sigma, weight_shape, requires_grad=True)
    self.b = torch.zeros(1, requires_grad=True)

In [None]:
@d2l.add_to_class(LinearRegressionScratch) #save
def forward(self, X):
  """Return the affine prediction: X times `w`, plus the bias `b`."""
  return X @ self.w + self.b

###3.4.2 Defining the Loss Function

In [None]:
@d2l.add_to_class(LinearRegressionScratch) #save
def loss(self, y_hat, y):
  """Return the mean over all elements of half the squared error."""
  return ((y_hat - y) ** 2 / 2).mean()

###3.4.3 Defining the Optimization Algorithm

In [None]:
class SGD(d2l.HyperParameters):
  """Minibatch stochastic gradient descent.

  `params` is the collection of tensors to update and `lr` the step size.
  """
  def __init__(self, params, lr):
    self.save_hyperparameters()

  def step(self):
    """Update every parameter in place by subtracting lr times its gradient."""
    for tensor in self.params:
      tensor.sub_(self.lr * tensor.grad)

  def zero_grad(self):
    """Zero, in place, the gradient of every parameter that has one."""
    for tensor in self.params:
      if tensor.grad is None:
        continue
      tensor.grad.zero_()

In [None]:
@d2l.add_to_class(LinearRegressionScratch) #save
def configure_optimizers(self):
  """Return an SGD optimizer over the weight and bias with rate `self.lr`."""
  trainable = [self.w, self.b]
  return SGD(trainable, self.lr)

###3.4.4 Training

In [None]:
@d2l.add_to_class(d2l.Trainer) #save
def prepare_batch(self, batch):
  """Return `batch` unchanged; `fit_epoch` passes every batch through this."""
  return batch

@d2l.add_to_class(d2l.Trainer) #save
def fit_epoch(self):
  """Run one pass over the training data, then one over validation data.

  Each training batch goes through: loss computation, gradient reset,
  backward pass, optional gradient clipping (when `gradient_clip_val` is
  positive) and an optimizer step. Validation is skipped when there is no
  validation loader.
  """
  model = self.model
  model.train()
  for train_batch in self.train_dataloader:
    step_loss = model.training_step(self.prepare_batch(train_batch))
    self.optim.zero_grad()
    # The clipping and the parameter update run with autograd recording
    # switched off.
    with torch.no_grad():
      step_loss.backward()
      if self.gradient_clip_val > 0:
        self.clip_gradients(self.gradient_clip_val, model)
      self.optim.step()
    self.train_batch_idx += 1
  if self.val_dataloader is not None:
    model.eval()
    for val_batch in self.val_dataloader:
      with torch.no_grad():
        model.validation_step(self.prepare_batch(val_batch))
      self.val_batch_idx += 1

In [None]:
# Fit the scratch model (2 inputs, learning rate 0.03) for 3 epochs on
# synthetic regression data constructed with w = [2, -3.4] and b = 4.2.
model = LinearRegressionScratch(2, lr=0.03)
data = d2l.SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
trainer = d2l.Trainer(max_epochs=3)
trainer.fit(model, data)

In [None]:
# Print the difference between the data's w and b and the learned ones.
# model.w is reshaped to data.w's shape so the subtraction is elementwise.
with torch.no_grad():
  print(f'error in estimating w: {data.w - model.w.reshape(data.w.shape)}')
  print(f'error in estimating b: {data.b - model.b}')

##Discussion & Review 3.4

###3.4.4

- With the model, loss function, and optimizer components defined, we can implement the main training loop. This loop orchestrates the training process by iterating over batches of data, calculating the loss, computing gradients, and updating the model’s parameters.

The Training Loop Steps

1. Initialize Parameters: Set the initial weights and biases.
2. Repeat for Each Epoch:
  - Compute the gradient of the loss with respect to the parameters.
  - Update the parameters using the optimizer.
  - Optionally, measure performance on a validation dataset.

- The training loop processes all training data divided into minibatches. This implementation is encapsulated within the Trainer class.

##4.1 Softmax Regression

Information Theory Basics

Information theory provides a framework for understanding how to encode, transmit, and manipulate data, which is essential for many deep learning applications.

####4.1.3.1 Entropy

- Definition: Entropy quantifies the amount of uncertainty or information present in a data distribution.
- Encoding Limit: According to Shannon’s source coding theorem, encoding data drawn from a distribution $P$ requires at least $H[P] = \sum_j -P(j) \log P(j)$ “nats”. A "nat" is the analogue of a "bit" when the logarithm is taken to base $e$ rather than base 2.

####4.1.3.2. Surprisal

- Concept: Surprisal measures the unexpectedness of an event based on its probability. Events with lower probabilities yield higher surprisal values.

####4.1.3.3. Cross-Entropy Revisited

- Definition: Cross-entropy measures the expected surprisal when an observer employs subjective probabilities to predict data that follows a true probability distribution.

- Relation to Entropy: The minimum cross-entropy occurs when subjective probabilities perfectly align with true probabilities.

- Objective: In classification tasks, cross-entropy serves dual purposes:

    (i) maximizing the likelihood of observed data

    (ii) minimizing the surprisal (and thereby the bits needed) for effectively communicating the labels.

##4.2 The Image Classification Dataset

In [None]:
%matplotlib inline
import time
import torch
import torchvision
from torchvision import transforms
from d2l import torch as d2l

d2l.use_svg_display()

###4.2.1 Loading the Dataset

In [None]:
class FashionMNIST(d2l.DataModule): #save
  """The Fashion-MNIST dataset.

  Creates the training and validation splits through torchvision. Every
  image is resized to `resize` and converted to a tensor.
  """
  def __init__(self, batch_size=64, resize=(28, 28)):
    super().__init__()
    self.save_hyperparameters()
    # Options shared by both splits; only the `train` flag differs.
    common = dict(
        root=self.root,
        transform=transforms.Compose([transforms.Resize(resize),
                                      transforms.ToTensor()]),
        download=True)
    self.train = torchvision.datasets.FashionMNIST(train=True, **common)
    self.val = torchvision.datasets.FashionMNIST(train=False, **common)

In [None]:
# Build the dataset with images resized to 32x32 and show the number of
# training and validation examples.
data = FashionMNIST(resize=(32, 32))
len(data.train), len(data.val)

In [None]:
# Shape of the first element of the first training sample
# (presumably the image tensor - TODO confirm).
data.train[0][0].shape

In [None]:
@d2l.add_to_class(FashionMNIST) #save
def text_labels(self, indices):
  """Map each numeric class index in `indices` to its text label."""
  names = ('t-shirt', 'trouser', 'pullover', 'dress', 'coat',
           'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot')
  return [names[int(idx)] for idx in indices]

###4.2.2 Reading a Minibatch

In [None]:
@d2l.add_to_class(FashionMNIST) #save
def get_dataloader(self, train):
  """Return a DataLoader over the training or the validation split.

  The loader shuffles exactly when `train` is passed as True.
  """
  if train:
    dataset = self.train
  else:
    dataset = self.val
  return torch.utils.data.DataLoader(
      dataset, self.batch_size, shuffle=train,
      num_workers=self.num_workers)

In [None]:
# Fetch one minibatch from the training loader and print the shape and
# dtype of its inputs and labels.
X, y = next(iter(data.train_dataloader()))
print(X.shape, X.dtype, y.shape, y.dtype)

In [None]:
# Time one full pass over the training loader (batches are read and dropped).
tic = time.time()
for X, y in data.train_dataloader():
  continue
f'{time.time() - tic:.2f} sec'

###4.2.3 Visualization

In [None]:
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5): #save
  """Plot a list of images (interface placeholder only).

  Raises:
    NotImplementedError: always.
  """
  raise NotImplementedError()

In [None]:
@d2l.add_to_class(FashionMNIST) #save
def visualize(self, batch, nrows=1, ncols=8, labels=[]):
  """Show the images of `batch` in a grid with a title for each.

  When `labels` is empty, the titles are the text labels of the batch's
  targets. The `labels` default is never mutated.
  """
  images, targets = batch
  titles = labels if labels else self.text_labels(targets)
  d2l.show_images(images.squeeze(1), nrows, ncols, titles=titles)
# Visualize the first minibatch produced by the validation loader.
batch = next(iter(data.val_dataloader()))
data.visualize(batch)

##4.3 The Base Classification Model

###4.3.1 The Classifier Class

In [None]:
class Classifier(d2l.Module):
  """The base class of classification models."""
  def validation_step(self, batch):
    """Plot loss and accuracy for one validation batch.

    The last element of `batch` is the label tensor; the rest are inputs.
    """
    Y_hat = self(*batch[:-1])
    # `train=False` belongs to plot(), not to loss(): loss takes only the
    # predictions and the labels.
    self.plot('loss', self.loss(Y_hat, batch[-1]), train=False)
    self.plot('acc', self.accuracy(Y_hat, batch[-1]), train=False)

In [None]:
@d2l.add_to_class(d2l.Module) #save
def configure_optimizers(self):
  """Return torch's SGD optimizer over `self.parameters()` with rate `self.lr`."""
  trainable = self.parameters()
  return torch.optim.SGD(trainable, lr=self.lr)

###4.3.2 Accuracy

In [None]:
@d2l.add_to_class(Classifier)
def accuracy(self, Y_hat, Y, averaged=True):
  """Compute the accuracy of the predictions `Y_hat` against labels `Y`.

  Args:
    Y_hat: class scores; reshaped to (-1, size of the last dimension).
    Y: labels; flattened before the comparison.
    averaged: if True return the mean accuracy, otherwise the
      per-example correctness values (1.0 correct, 0.0 wrong).

  Returns:
    A float32 tensor: a scalar when `averaged`, else one entry per example.
  """
  Y_hat = Y_hat.reshape((-1, Y_hat.shape[-1]))
  # Cast predictions to the label dtype so the equality test compares like
  # with like.
  preds = Y_hat.argmax(axis=1).type(Y.dtype)
  compare = (preds == Y.reshape(-1)).type(torch.float32)
  return compare.mean() if averaged else compare


##4.4 Softmax Regression Implementation from Scratch

###4.4.1 The Softmax

In [None]:
# Sum a 2x3 tensor along dim 0 and along dim 1, keeping the reduced dim.
X = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
X.sum(0, keepdims=True), X.sum(1, keepdims=True)

In [None]:
def softmax(X):
  """Return the row-wise softmax of `X`.

  Args:
    X: tensor with at least two dimensions; normalization runs along dim 1.

  Returns:
    Tensor of the same shape as `X` whose entries along dim 1 sum to 1.
  """
  # Subtract each row's maximum before exponentiating. Softmax is unchanged
  # by a per-row shift, and the shift keeps exp() from overflowing to inf
  # (which would turn the result into NaN) for large inputs.
  row_max = X.max(dim=1, keepdim=True).values
  X_exp = torch.exp(X - row_max)
  partition = X_exp.sum(1, keepdims=True)
  return X_exp / partition # The broadcasting mechanism is applied here

In [None]:
# Apply softmax to a random 2x5 input and also show the sum of each row.
X = torch.rand((2, 5))
X_prob = softmax(X)
X_prob, X_prob.sum(1)

###4.4.2 The Model

In [None]:
class SoftmaxRegressionScratch(d2l.Classifier):
  """Softmax regression with hand-managed parameter tensors.

  `W` has shape (num_inputs, num_outputs), drawn from a normal distribution
  with mean 0 and standard deviation `sigma`; `b` starts at zero. Both
  tensors track gradients.
  """
  def __init__(self, num_inputs, num_outputs, lr, sigma=0.01):
    super().__init__()
    self.save_hyperparameters()
    weight_shape = (num_inputs, num_outputs)
    self.W = torch.normal(0, sigma, size=weight_shape, requires_grad=True)
    self.b = torch.zeros(num_outputs, requires_grad=True)

  def parameters(self):
    """Return the list of trainable tensors: the weight, then the bias."""
    return [self.W, self.b]

In [None]:
@d2l.add_to_class(SoftmaxRegressionScratch)
def forward(self, X):
  """Flatten each example, apply the affine map, then softmax."""
  flat = X.reshape((-1, self.W.shape[0]))
  logits = flat @ self.W + self.b
  return softmax(logits)

###4.4.3 The Cross-Entropy Loss

In [None]:
# For rows 0 and 1 of y_hat, pick the entry in the column given by y.
y = torch.tensor([0, 2])
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2,0.5]])
y_hat[[0, 1], y]

In [None]:
def cross_entropy(y_hat, y):
  """Return the mean negative log of the entry `y_hat[i, y[i]]` over rows.

  Args:
    y_hat: 2-D tensor; row i is indexed at column `y[i]`.
    y: tensor of column indices, one per row of `y_hat`.
  """
  row_ids = list(range(len(y_hat)))
  picked = y_hat[row_ids, y]
  return -torch.log(picked).mean()

# Cross-entropy of the example values y_hat and y defined earlier.
cross_entropy(y_hat, y)

In [None]:
@d2l.add_to_class(SoftmaxRegressionScratch)
def loss(self, y_hat, y):
  """Return `cross_entropy(y_hat, y)`."""
  return cross_entropy(y_hat, y)

###4.4.4 Training

In [None]:
# Train softmax regression (784 inputs, 10 outputs, learning rate 0.1) on
# Fashion-MNIST with batch size 256 for 10 epochs.
data = d2l.FashionMNIST(batch_size=256)
model = SoftmaxRegressionScratch(num_inputs=784, num_outputs=10, lr=0.1)
trainer = d2l.Trainer(max_epochs=10)
trainer.fit(model, data)

###4.4.5 Prediction

In [None]:
# Take one validation minibatch; the predicted class of each example is the
# index of its largest model output.
X, y = next(iter(data.val_dataloader()))
preds = model(X).argmax(axis = 1)
preds.shape

In [None]:
# Keep only the misclassified examples and show them. Each title is the
# text label of y on the first line and that of the prediction on the second.
wrong = preds.type(y.dtype) != y
X, y, preds = X[wrong], y[wrong], preds[wrong]
labels = [a+'\n' + b for a, b in zip(
    data.text_labels(y), data.text_labels(preds))]
data.visualize([X, y], labels = labels)

##Discussion and Exercises 4.4

####4.4.1.

The softmax function converts logits (raw prediction scores) into probabilities through three main steps:

1. Exponentiation: Compute the exponential of each element.
2. Normalization: Calculate the sum of the exponentials for each row to determine the normalization constant.
3. Division: Divide each element by its row’s normalization constant, ensuring the results sum to 1.

##5.1 Multilayer Perceptrons

###5.1.2 Activation Functions

####5.1.2.1 ReLU Function

In [None]:
# Evaluate ReLU on a grid from -8 to 8 (step 0.1) and plot it. x tracks
# gradients so the derivative can be plotted afterwards.
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.relu(x)
d2l.plot(x.detach(), y.detach(), 'x', 'relu(x)', figsize = (5, 2.5))

In [None]:
# Backpropagate a tensor of ones through y, then plot x.grad against x.
y.backward(torch.ones_like(x), retain_graph = True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of relu', figsize = (5, 2.5))

####5.1.2.2 Sigmoid Function

In [None]:
# Evaluate the sigmoid on the same grid x and plot it.
y = torch.sigmoid(x)
d2l.plot(x.detach(), y.detach(), 'x', 'sigmoid(x)', figsize = (5, 2.5))

In [None]:
# Clear out previous gradients (backward() accumulates into x.grad),
# then plot the derivative of the sigmoid.
x.grad.data.zero_()
y.backward(torch.ones_like(x), retain_graph = True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of sigmoid', figsize = (5,2.5))

####5.1.2.3 Tanh Function

In [None]:
# Evaluate tanh on the same grid x and plot it.
y = torch.tanh(x)
d2l.plot(x.detach(), y.detach(), 'x', 'tanh(x)', figsize=(5, 2.5))

In [None]:
# Clear out previous gradients (backward() accumulates into x.grad),
# then plot the derivative of tanh.
x.grad.data.zero_()
y.backward(torch.ones_like(x), retain_graph = True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of tanh', figsize = (5, 2.5))

##Discussion and Exercises 5.1

####5.1.1.1.
Linear models, such as softmax regression, map inputs to outputs via a single affine transformation, which limits their ability to capture complex relationships.

- Monotonicity: Linear models assume that increases in input features correspond to consistent increases or decreases in output. This assumption can fail in many contexts:
  - Income Prediction: A higher income might increase the probability of loan repayment, but the relationship is not necessarily linear across all income ranges.
  - Health Prediction: For body temperature, higher temperatures can indicate greater health risks above a certain threshold, while lower temperatures can also indicate risks.
  - Image Classification: Relying solely on pixel brightness to distinguish between categories like cats and dogs ignores the complex interplay of pixel values.

####5.1.1.2
- MLP Structure: An MLP consists of:

  - An input layer (receives the features).
  - One or more hidden layers (process inputs and learn representations).
  - An output layer (produces final predictions).

In a simple MLP:

- The input layer might have 4 features.
- The hidden layer could have 5 neurons (hidden units).
- The output layer may have 3 output classes.

####5.1.1.3

For an MLP with one hidden layer:

- The hidden layer produces hidden representations, calculated as an affine transformation of the input followed by a nonlinear activation function $f$.
- The final layer (output layer) applies another affine transformation to the hidden representations.

1. Hidden Layer Output:

  $H = f(W_h X + b_h)$

2. Output Layer:
  
  $\hat{Y} = W_o H + b_o$

###5.1.2

A commonly used activation function is the ReLU (Rectified Linear Unit), defined as:

  $f(x) = \max(0, x)$

Key Characteristics:

- Nonlinear Transformation: ReLU retains only positive values and sets negative values to zero, effectively introducing nonlinearity into the model.

- Computational Efficiency: The piecewise linear nature of ReLU allows for efficient computation, as it requires only a simple comparison operation.

- Sparsity: By zeroing out negative values, ReLU can lead to sparsity in the network, which may improve efficiency and performance.


Its derivative:

- For $x > 0$: The output of the ReLU function is equal to $x$, so the derivative is:
  $f'(x)=1$
- For $x<0$: The output is 0, thus the derivative is:
  $f'(x)=0$
- For $x=0$: The function is not differentiable at this point. By convention, we typically set:
  $f'(0)=0$

####5.1.2.1
- The ReLU (Rectified Linear Unit) activation function is widely used in deep learning due to its well-behaved derivatives. It outputs either zero (for negative inputs) or the input itself (for positive inputs).

- The Parameterized ReLU (pReLU) addresses a limitation of the standard ReLU, which can lead to the "dying ReLU" problem where neurons become inactive. The pReLU introduces a small linear term for negative inputs:

  $f(x) =
\begin{cases}
x & \text{if } x > 0 \\
\alpha x & \text{if } x \leq 0
\end{cases}
$


####5.1.2.2
The Sigmoid function is defined as:
  $\sigma(x) = \frac{1}{1 + e^{-x}}$

This function transforms any input $x$ from the real line $(-\infty, +\infty)$ to the range $(0,1)$, making it especially useful for binary classification tasks. Key points about the sigmoid function include:

- Squashing Function: The sigmoid squashes inputs to the range (0, 1), interpretable as probabilities.
- Historical Context: The sigmoid function gained popularity as it approximated biological neurons that fired (outputting 1) or did not fire (outputting 0).




Derivative of the Sigmoid Function

The derivative of the sigmoid function can be expressed mathematically as:

$\sigma'(x) = \sigma(x) \cdot (1 - \sigma(x))$

$\sigma(x) = \frac{1}{1 + e^{-x}}$

$\sigma(0) = \frac{1}{1 + e^0} = \frac{1}{2}$

$\sigma'(0) = \frac{1}{2} \cdot \left(1 - \frac{1}{2}\right) = \frac{1}{2} \cdot \frac{1}{2} = 0.25$

Tanh Function

The hyperbolic tangent (tanh) function is a commonly used activation function in neural networks, transforming input values into a range between -1 and 1. It is mathematically defined as:

$\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$

Properties of the Tanh Function

1. Range: The output of the tanh function lies in the interval $(-1,1)$. This allows it to produce both positive and negative values, effectively centering the data around zero and often facilitating faster convergence during training.

2. Symmetry: The tanh function is odd, exhibiting point symmetry about the origin:

  $\tanh(-x) = -\tanh(x)$

3. Behavior Near Zero: Similar to the sigmoid function, the tanh function approaches a linear transformation near $x=0$:
  - For small values of $x$:
  $\tanh(x) \approx x$

4. Derivative: The derivative of the tanh function can be expressed as:

  $\tanh'(x) = 1 - \tanh^2(x)$


Derivative of the Tanh Function

The derivative of the hyperbolic tangent (tanh) function is essential for understanding its behavior during the training of neural networks. The mathematical expression for the derivative is:

  $\tanh'(x) = 1 - \tanh^2(x)$

Properties of the Derivative

Maximum at Zero: The derivative reaches its maximum value of 1 when $x=0$, indicating that the function is most sensitive to changes in input around this point.

Vanishing Gradients: As the input $x$ moves away from 0 in either direction (positive or negative), the derivative approaches 0. This phenomenon, known as the vanishing gradient problem, can impede the training of deep networks, as gradients become too small to effect significant updates during backpropagation.

##5.2 Implementation of Multilayer Perceptrons

###5.2.1 Implementation from Scratch

####5.2.1.1 Initializing Model Parameters

In [None]:
class MLPScratch(d2l.Classifier):
  """MLP with one hidden layer and explicitly managed parameters.

  Weights are standard-normal samples scaled by `sigma`; biases start at
  zero. All four tensors are registered as `nn.Parameter`.
  """
  def __init__(self, num_inputs, num_outputs, num_hiddens, lr, sigma=0.01):
    super().__init__()
    self.save_hyperparameters()
    # Hidden layer parameters.
    self.W1 = nn.Parameter(sigma * torch.randn(num_inputs, num_hiddens))
    self.b1 = nn.Parameter(torch.zeros(num_hiddens))
    # Output layer parameters.
    self.W2 = nn.Parameter(sigma * torch.randn(num_hiddens, num_outputs))
    self.b2 = nn.Parameter(torch.zeros(num_outputs))

####5.2.1.2 Model

In [None]:
def relu(X):
  """Return the elementwise maximum of `X` and zero."""
  return torch.maximum(X, torch.zeros_like(X))

In [None]:
@d2l.add_to_class(MLPScratch)
def forward(self, X):
  """Flatten the input, apply the ReLU hidden layer, return the output layer's values."""
  flat = X.reshape((-1, self.num_inputs))
  hidden = relu(flat @ self.W1 + self.b1)
  return hidden @ self.W2 + self.b2

####5.2.1.3 Training

In [None]:
# Train the scratch MLP (784 inputs, 256 hidden units, 10 outputs, learning
# rate 0.1) on Fashion-MNIST with batch size 256 for 10 epochs.
model = MLPScratch(num_inputs = 784, num_outputs = 10, num_hiddens = 256, lr = 0.1)
data = d2l.FashionMNIST(batch_size = 256)
trainer = d2l.Trainer(max_epochs = 10)
trainer.fit(model, data)

###5.2.2 Concise Implementation

####5.2.2.1 Model

In [None]:
class MLP(d2l.Classifier):
  """One-hidden-layer MLP assembled from torch.nn layers.

  The network flattens its input, applies a lazily-sized linear layer with
  `num_hiddens` outputs and a ReLU, then a lazily-sized linear layer with
  `num_outputs` outputs.
  """
  def __init__(self, num_outputs, num_hiddens, lr):
    super().__init__()
    self.save_hyperparameters()
    layers = [
        nn.Flatten(),
        nn.LazyLinear(num_hiddens),
        nn.ReLU(),
        nn.LazyLinear(num_outputs),
    ]
    self.net = nn.Sequential(*layers)

####5.2.2.2 Training

In [None]:
# Train the concise MLP, reusing the `trainer` and `data` objects created
# earlier in the file.
model = MLP(num_outputs = 10, num_hiddens = 256, lr = 0.1)
trainer.fit(model, data)

##Discussion & Review 5.3

###5.3.1. Forward Propagation

Forward propagation refers to the process of calculating and storing intermediate variables and outputs as data flows from the input layer through to the output layer. Here’s a breakdown of the forward propagation in a one-hidden-layer MLP:

1. Input and Hidden Layer Computations:
  - Given an input vector $x$ and the weight matrix $W_1$ for the hidden layer (excluding bias for simplicity), the intermediate variable for the hidden layer is computed as:

  $z = W_1^\top x$

  - After applying an activation function (e.g., ReLU), the hidden activation vector $h$ is:

  $h = ReLU(z)$

2. Output Layer Computation:

  Assuming the output layer has weight matrix $W_2$, the output variable $y$ can be computed as:

  $y = W_2^\top h$

3. Loss Calculation:

  If we denote the loss function as $L$ and the true label as $y_{true}$, the loss for a single data example is given by:

  $Loss=L(y, y_{true})$


4. Regularization:

- Incorporating weight decay regularization with a hyperparameter $\lambda$, the regularization term is:

  $\text{Regularization} = \frac{1}{2} \lambda \|W_1\|_F^2 + \frac{1}{2} \lambda \|W_2\|_F^2$

- The overall regularized loss for a given data example becomes:
  $Objective=Loss+Regularization$



####5.3.3. Backpropagation

Backpropagation is the algorithm used to compute gradients for all parameters in the network. It involves traversing the network in reverse order (from output back to input), applying the chain rule of calculus.

1. Calculate Gradients of the Loss:

  Start by computing the gradient of the objective function $J$ with respect to the loss and regularization terms:

  $\frac{\partial J}{\partial \text{Loss}} = 1 \quad \text{and} \quad \frac{\partial J}{\partial \text{Regularization}} = 1$

2. Gradient of Output Layer:

  Compute the gradient of the objective with respect to the output layer variable:

  $\frac{\partial J}{\partial y} = \frac{\partial J}{\partial \text{Loss}} \cdot \frac{\partial \text{Loss}}{\partial y}$

3. Gradient of Parameters:

- Compute the loss-term gradient with respect to the output-layer weights using the chain rule (recall $y = W_2^\top h$):

  $\frac{\partial \text{Loss}}{\partial W_2} = \mathbf{h} \left(\frac{\partial J}{\partial y}\right)^\top$

- For the hidden layer:
  
  $\frac{\partial J}{\partial \mathbf{h}} = W_2 \, \frac{\partial J}{\partial y}$

4. Hidden Layer Gradients:

  Compute the gradient with respect to the hidden pre-activation $z$, and then the loss-term gradient for the hidden-layer weights (recall $z = W_1^\top x$):

  $\frac{\partial J}{\partial z} = \frac{\partial J}{\partial h} \odot \text{ReLU}'(z)$

  $\frac{\partial \text{Loss}}{\partial W_1} = x \left(\frac{\partial J}{\partial z}\right)^\top$

5. Final Gradients:

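  Finally, add the gradient of the regularization term to obtain the full gradient of the objective with respect to each weight matrix:

  $\frac{\partial J}{\partial W_1} = \frac{\partial \text{Loss}}{\partial W_1} + \lambda W_1 \quad \text{and} \quad \frac{\partial J}{\partial W_2} = \frac{\partial \text{Loss}}{\partial W_2} + \lambda W_2$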


###5.3.4. Training Neural Networks

Interdependence of Forward and Backward Propagation

1. Forward Propagation:

  - During forward propagation, we traverse the computational graph from input to output, calculating all intermediate variables and the final output.
  - This phase includes the computations through each layer and the calculation of the regularization term, which relies on the current model parameters.

2. Backward Propagation:

  - In the backward pass, we calculate gradients using the chain rule. The gradients for model parameters are directly influenced by the outputs from the forward pass, particularly the hidden layer outputs and the computed loss.

The Training Process

1. Initialization: Begin with randomly initialized model parameters.

2. Forward Propagation:

  - Compute outputs based on the current parameters and input data.
  - Calculate the loss, incorporating any regularization terms.

3. Backward Propagation:

  - Compute gradients with respect to the loss using the stored intermediate values from the forward pass.
  - Update the model parameters using an optimization algorithm (e.g., stochastic gradient descent).

4. Iteration:

  - Repeat the forward and backward passes for multiple epochs until the model converges or achieves satisfactory performance.
