# Model optimization examples

*Model optimization* is crucial for improving the performance, efficiency, and scalability of models. It shortens training times and makes better use of hardware resources.

Common optimization techniques include: 
- Weight initialization
- Learning rate scheduling
- Batch normalization

Proper initialization of weights affects both the convergence and the final performance of a neural network. Schemes such as Glorot (Xavier) and He initialization scale the initial weights to help avoid vanishing or exploding gradients.

## Technique 1: Weight initialization

In [3]:
import warnings

# Silence noisy urllib3 warnings and Keras' deprecation warning about passing
# `input_shape` to a layer, in case any cell in the notebook still triggers it.
warnings.filterwarnings('ignore', category=UserWarning, module='urllib3')
warnings.filterwarnings('ignore', message='Do not pass an `input_shape`')

from tensorflow.keras.initializers import HeNormal
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Input

# Define the model with He initialization on the hidden layer.
# He initialization is designed for ReLU activations and helps keep gradients
# from vanishing or exploding at the start of training.
model = Sequential([
    # Input layer: declares the 28x28 image shape explicitly. Passing
    # `input_shape` to the first layer instead is deprecated in Keras 3.
    Input(shape=(28, 28)),

    # Flatten layer: converts each 28x28 image to a 784-element vector
    Flatten(),

    # Hidden layer: 128 neurons with ReLU activation and He Normal initialization
    Dense(128, activation='relu', kernel_initializer=HeNormal()),

    # Output layer: 10 neurons with softmax for digit classification (0-9)
    Dense(10, activation='softmax'),
])

## Technique 2: Learning rate scheduling

In [4]:
# Load and preprocess the MNIST dataset so it is ready for training
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Fetch MNIST as (images, labels) pairs; the second pair is used as validation data
(x_train, y_train), (x_val, y_val) = mnist.load_data()

# For both image arrays: cast to float32, scale pixel values into [0, 1] for
# better training behaviour, then reshape to (num_samples, 28, 28) so the data
# is guaranteed to be in the layout the model expects.
x_train, x_val = (
    (images.astype('float32') / 255.0).reshape(-1, 28, 28)
    for images in (x_train, x_val)
)

**Implement a learning rate scheduler to adjust the learning rate dynamically during training**
*(This will help the model converge more efficiently)*

In [6]:
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

def scheduler(epoch, lr, *, hold_epochs=10, decay_rate=0.1):
    """Hold the learning rate constant, then decay it exponentially.

    Intended for use with ``LearningRateScheduler``, which calls it as
    ``scheduler(epoch, lr)`` at the start of every epoch.

    Args:
        epoch: Index of the epoch about to start (Keras counts from 0).
        lr: Learning rate currently in use.
        hold_epochs: Number of initial epochs during which ``lr`` is
            returned unchanged. Defaults to 10.
        decay_rate: After the hold period, each epoch multiplies the rate
            by ``exp(-decay_rate)``. The default of 0.1 shrinks the rate by
            roughly 9.5% per epoch.

    Returns:
        ``lr`` unchanged while ``epoch < hold_epochs``; otherwise the decayed
        rate as a plain Python ``float``.
    """
    # Standard-library math is enough for a scalar constant; there is no need
    # to build a TensorFlow op on every epoch. Imported locally so the
    # function does not depend on imports made elsewhere.
    import math

    if epoch < hold_epochs:
        return lr
    return float(lr * math.exp(-decay_rate))

# Wrap the schedule function in a Keras callback so it can be passed to
# model.fit(..., callbacks=[lr_scheduler]).
lr_scheduler = LearningRateScheduler(scheduler)

In [8]:
# Model training with the learning rate scheduler, validated after every epoch

from tensorflow.keras.utils import to_categorical

# Load the MNIST dataset
(x_train, y_train), (x_val, y_val) = tf.keras.datasets.mnist.load_data()

# Normalize pixel values to the [0, 1] range. Cast to float32 first: dividing
# the raw integer arrays directly would yield float64 arrays, which use twice
# the memory and are inconsistent with the earlier preprocessing cell.
x_train = x_train.astype('float32') / 255.0
x_val = x_val.astype('float32') / 255.0

# Convert labels to categorical (one-hot encoding), as required by the
# categorical_crossentropy loss
y_train = to_categorical(y_train, 10)
y_val = to_categorical(y_val, 10)

# Compile the model with the Adam optimizer and categorical crossentropy loss
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model for 20 epochs with the learning rate scheduler to improve
# convergence, and evaluate it on the unseen validation data after each epoch
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=20, callbacks=[lr_scheduler])

Epoch 1/20
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 676us/step - accuracy: 0.8782 - loss: 0.4371 - val_accuracy: 0.9592 - val_loss: 0.1361 - learning_rate: 0.0010
Epoch 2/20
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 656us/step - accuracy: 0.9648 - loss: 0.1209 - val_accuracy: 0.9702 - val_loss: 0.0945 - learning_rate: 0.0010
Epoch 3/20
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 640us/step - accuracy: 0.9773 - loss: 0.0766 - val_accuracy: 0.9739 - val_loss: 0.0880 - learning_rate: 0.0010
Epoch 4/20
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 657us/step - accuracy: 0.9834 - loss: 0.0562 - val_accuracy: 0.9694 - val_loss: 0.0929 - learning_rate: 0.0010
Epoch 5/20
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 648us/step - accuracy: 0.9870 - loss: 0.0442 - val_accuracy: 0.9751 - val_loss: 0.0783 - learning_rate: 0.0010
Epoch 6/20
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━

## Technique 3: Batch normalization

Additional optimization techniques include batch normalization, mixed precision training, model pruning, and quantization. The first two are demonstrated below.

1. Batch normalization: Normalizes the inputs to a layer by re-centering and re-scaling the activations over each mini-batch
2. Mixed precision training: Uses 16-bit and 32-bit floating point to speed up training on modern GPUs


In [10]:
# Example code for batch normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, Dense, Flatten, Input

# Define a simple model with Batch Normalization for improved training stability
model = Sequential([
    # Input layer: declares the 28x28 image shape explicitly. Passing
    # `input_shape` to the first layer instead is deprecated in Keras 3.
    Input(shape=(28, 28)),
    Flatten(),  # Flatten 28x28 images to 784-element vectors
    Dense(128, activation='relu'),  # Hidden layer: 128 neurons with ReLU activation
    BatchNormalization(),  # Normalizes the hidden activations fed to the next layer
    Dense(10, activation='softmax'),  # Output layer: 10 neurons for digit classification
])

In [32]:
# Example for mixed precision training to improve performance and reduce memory usage
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras import mixed_precision

# Enable the mixed precision policy: layers compute in float16 while keeping
# their variables in float32. NOTE: this sets a global policy, so it also
# applies to any model built after this cell runs.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Load and preprocess the MNIST dataset
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Normalize the data to [0, 1] range
x_train, x_test = x_train.astype('float32') / 255.0, x_test.astype('float32') / 255.0

# Define a simple model with mixed precision
model = models.Sequential([
    layers.Input(shape=(28, 28)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    # Produce raw logits here and apply softmax in a separate float32 layer:
    # under a float16 policy the model's final softmax/output should be kept
    # in float32 for numeric stability.
    layers.Dense(10),
    layers.Activation('softmax', dtype='float32'),
])

# Compile the model with optimizer and loss function.
# Labels stay as integer class ids, hence the sparse variant of the loss.
optimizer = optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 855us/step - accuracy: 0.8814 - loss: 0.4288 - val_accuracy: 0.9601 - val_loss: 0.1391
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 807us/step - accuracy: 0.9657 - loss: 0.1157 - val_accuracy: 0.9685 - val_loss: 0.1037
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 816us/step - accuracy: 0.9769 - loss: 0.0802 - val_accuracy: 0.9755 - val_loss: 0.0825
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 861us/step - accuracy: 0.9822 - loss: 0.0584 - val_accuracy: 0.9764 - val_loss: 0.0778
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 855us/step - accuracy: 0.9875 - loss: 0.0427 - val_accuracy: 0.9762 - val_loss: 0.0762


<keras.src.callbacks.history.History at 0x30fb25e20>