In [1]:
#Optimization algorithms play a crucial role in training artificial neural networks by:


#1. Minimizing the loss function (error between predictions and actual labels)
#2. Adjusting weights and biases to improve model performance
#3. Converging to optimal solutions


#Gradient Descent and Variants

#Gradient descent is an iterative optimization algorithm that updates weights and biases based on the negative gradient of the loss function.


#Gradient Descent Variants:

#1. Batch Gradient Descent (BGD): Uses the entire dataset for each update.
#2. Stochastic Gradient Descent (SGD): Uses a single data point for each update.
#3. Mini-Batch Gradient Descent (MBGD): Uses a small batch of data points for each update.


#Differences and Tradeoffs:

#| Variant | Convergence Speed | Memory Requirements |
#| --- | --- | --- |
#| BGD | Slow | High |
#| SGD | Fast | Low |
#| MBGD | Balanced | Balanced |


#Challenges with Traditional Gradient Descent

#1. Slow Convergence: Requires many iterations to converge.
#2. Local Minima: Can get stuck in suboptimal solutions (local minima or saddle points).


#Modern Optimizers

#Modern optimizers address these challenges by:


#1. Momentum: Adds inertia to weight updates, which helps escape shallow local minima.
#2. Learning Rate Scheduling: Adjusts learning rate during training.
#3. Adaptive Learning Rates: Adjusts learning rate per parameter.


#Popular Modern Optimizers:

#1. Momentum SGD
#2. Nesterov Accelerated Gradient (NAG)
#3. Adam
#4. RMSProp
#5. Adagrad


#Momentum and Learning Rate

#1. Momentum: Helps escape local minima by adding inertia to weight updates.
#2. Learning Rate: Controls step size during weight updates.


#Impact on Convergence and Model Performance

#1. High Learning Rate: Fast convergence, but may overshoot the optimal solution or diverge.
#2. Low Learning Rate: Slow convergence, but more stable.
#3. Optimal Momentum: Balances exploration and exploitation.


#By understanding optimization algorithms and their variants, you can choose the best approach for training your neural network models.


#Key concepts:


#- Optimization algorithms minimize loss functions.
#- Gradient descent variants differ in convergence speed and memory requirements.
#- Modern optimizers address slow convergence and local minima challenges.
#- Momentum and learning rate impact convergence and model performance.

In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist

# Load the MNIST train/test splits, then rescale the image arrays.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Divide by 255.0 (true division, so the results are floating point);
# presumably this maps 8-bit pixel intensities into [0, 1] -- confirm dtype.
x_train = x_train / 255.0
x_test = x_test / 255.0

# Build a simple model
def create_model(optimizer):
    """Build and compile a small dense classifier for 28x28 inputs.

    The network flattens each 28x28 input, applies one 128-unit ReLU hidden
    layer, and ends in a 10-unit softmax output layer.

    Args:
        optimizer: Optimizer identifier forwarded as-is to ``model.compile``
            (this notebook passes names such as ``'sgd'`` or ``'adam'``).

    Returns:
        The compiled ``Sequential`` model, using sparse categorical
        cross-entropy as the loss and reporting accuracy.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input layer: passing
        # `input_shape=` to the first layer emits a UserWarning in Keras 3.
        layers.Input(shape=(28, 28)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Train with different optimizers
SEED = 42  # fixed seed so every optimizer is compared under identical conditions
optimizers = ['sgd', 'adam', 'rmsprop']
test_accuracies = {}  # optimizer name -> test accuracy, kept for later comparison
for opt in optimizers:
    # Re-seed Python, NumPy and the backend before building each model so that
    # weight initialisation and batch shuffling are the same for every run;
    # otherwise optimizer differences are confounded by random initialisation
    # and the results change on each re-run.
    tf.keras.utils.set_random_seed(SEED)
    model = create_model(opt)
    model.fit(x_train, y_train, epochs=5, verbose=1)
    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
    test_accuracies[opt] = test_acc
    print(f"Optimizer: {opt}, Test accuracy: {test_acc:.4f}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


  super().__init__(**kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7531 - loss: 1.0030
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9035 - loss: 0.3442
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9186 - loss: 0.2892
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9271 - loss: 0.2589
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9342 - loss: 0.2330
313/313 - 1s - 2ms/step - accuracy: 0.9396 - loss: 0.2165
Optimizer: sgd, Test accuracy: 0.9396
Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8812 - loss: 0.4195
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9646 - loss: 0.1210
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m