# CHAPTER 11 - Training Deep Neural Networks

# The Vanishing/Exploding Gradients Problems

## Glorot and He Initialization

In [1]:
import tensorflow as tf

dense = tf.keras.layers.Dense(50, activation='relu', kernel_initializer='he_normal')

2025-12-11 10:26:51.888269: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-11 10:26:52.168639: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-11 10:26:53.671127: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
he_avg_init = tf.keras.initializers.VarianceScaling(scale=2, mode='fan_avg', distribution='uniform')
dense = tf.keras.layers.Dense(50, activation='sigmoid', kernel_initializer=he_avg_init)

## Better Activation Functions

### Leaky ReLU

In [3]:
leaky_relu = tf.keras.layers.LeakyReLU(negative_slope=0.2)
dense = tf.keras.layers.Dense(50, activation=leaky_relu, kernel_initializer="he_normal")

### ELU and SELU

### GELU, Swish, and Mish

## Batch Normalization

### Implementing batch normalization with Keras

In [4]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape=[28, 28]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(10, activation='softmax')
])

W0000 00:00:1765459616.878921    4813 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
model.summary()

In [6]:
[(var.name, var.trainable) for var in model.layers[1].variables]

[('gamma', True),
 ('beta', True),
 ('moving_mean', False),
 ('moving_variance', False)]

In [7]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape=[28, 28]),
    tf.keras.layers.Flatten(),    
    tf.keras.layers.Dense(300, kernel_initializer='he_normal', use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(300, kernel_initializer='he_normal', use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [8]:
model.summary()

## Gradient Clipping

# Reusing Pretrained Layers

## Transfer Learning with Keras

In [10]:
model_A = tf.keras.models.load_model("../models/mnist.keras")
model_A.summary()

In [11]:
model_B_on_A = tf.keras.Sequential(model_A.layers[:-1])
model_B_on_A.summary()

In [12]:
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))
model_B_on_A.summary()

In [13]:
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [14]:
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [15]:
model_B_on_A.summary()

In [36]:
from tensorflow.keras.datasets.fashion_mnist import load_data

def prepare_data(X, y):
    mask = (y == 0) | (y == 5)
    y_filtered = y[mask]
    y_filtered[y_filtered == 0] = 1
    y_filtered[y_filtered == 5] = 0
    return X[mask], y_filtered

fashion_mnist = load_data()
(X_train_full, y_train_full), (X_test_full, y_test_full) = fashion_mnist

X_train_filtered, y_train_filtered = prepare_data(X_train_full, y_train_full)
X_train, y_train = X_train_filtered[:-100], y_train_filtered[:-100]
X_valid, y_valid = X_train_filtered[-100:], y_train_filtered[-100:]

X_test, y_test = prepare_data(X_test_full, y_test_full)
X_train, X_valid, X_test = X_train / 255.0, X_valid / 255.9, X_test / 255.0

y_test
# class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]

array([0, 0, 1, ..., 0, 0, 0], shape=(2000,), dtype=uint8)

In [37]:
history = model_B_on_A.fit(X_train, y_train, epochs=4, validation_data=(X_valid, y_valid))

Epoch 1/4
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9804 - loss: 0.2268 - val_accuracy: 0.9900 - val_loss: 0.1296
Epoch 2/4
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 788us/step - accuracy: 0.9982 - loss: 0.0888 - val_accuracy: 0.9900 - val_loss: 0.0816
Epoch 3/4
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 782us/step - accuracy: 0.9984 - loss: 0.0604 - val_accuracy: 0.9900 - val_loss: 0.0630
Epoch 4/4
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 745us/step - accuracy: 0.9983 - loss: 0.0472 - val_accuracy: 0.9900 - val_loss: 0.0530


In [38]:
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

history = model_B_on_A.fit(X_train, y_train, epochs=16, validation_data=(X_valid, y_valid))

Epoch 1/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9984 - loss: 0.0355 - val_accuracy: 0.9900 - val_loss: 0.0398
Epoch 2/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9986 - loss: 0.0267 - val_accuracy: 0.9900 - val_loss: 0.0334
Epoch 3/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9985 - loss: 0.0220 - val_accuracy: 0.9900 - val_loss: 0.0296
Epoch 4/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0189 - val_accuracy: 0.9900 - val_loss: 0.0271
Epoch 5/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0168 - val_accuracy: 0.9900 - val_loss: 0.0253
Epoch 6/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9987 - loss: 0.0152 - val_accuracy: 0.9900 - val_loss: 0.0240
Epoch 7/16
[1m372/372[0m 

In [39]:
model_B_on_A.evaluate(X_test, y_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9985 - loss: 0.0097  


[0.009722077287733555, 0.9984999895095825]

## Unsupervised Pretraining

## Pretraining on an Auxiliary Task

# Learning Rate Scheduling

In [45]:
def exponencial_decay(lr0, s):
    def exponencial_decay_fn(epoch):
        print("********************")
        return lr0 * 0.1 ** (epoch / s)
    return exponencial_decay_fn

exponencial_decay_fn = exponencial_decay(lr0=0.01, s=20)

In [46]:
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponencial_decay_fn)

history = model_B_on_A.fit(X_train, y_train, epochs=16, validation_data=(X_valid, y_valid), callbacks=[lr_scheduler])

********************
Epoch 1/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9992 - loss: 0.0037 - val_accuracy: 0.9900 - val_loss: 0.0172 - learning_rate: 0.0100
********************
Epoch 2/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9992 - loss: 0.0035 - val_accuracy: 0.9900 - val_loss: 0.0167 - learning_rate: 0.0089
********************
Epoch 3/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9992 - loss: 0.0033 - val_accuracy: 0.9900 - val_loss: 0.0170 - learning_rate: 0.0079
********************
Epoch 4/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9993 - loss: 0.0032 - val_accuracy: 0.9900 - val_loss: 0.0166 - learning_rate: 0.0071
********************
Epoch 5/16
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9993 - loss: 0.0031 - val_accuracy: 0.9900 - val_lo

In [49]:
history.history['learning_rate']

[0.009999999776482582,
 0.008912509307265282,
 0.007943281903862953,
 0.007079457864165306,
 0.0063095735386013985,
 0.005623413249850273,
 0.005011872388422489,
 0.004466835875064135,
 0.003981071524322033,
 0.003548133885487914,
 0.003162277629598975,
 0.0028183830436319113,
 0.002511886414140463,
 0.0022387211211025715,
 0.00199526222422719,
 0.0017782794311642647]

# Avoiding Overfitting Through Regularization

## $\ell_{1}$ and $\ell_{2}$ Regularization

In [50]:
layer = tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal", kernel_regularizer=tf.keras.regularizers.l2(0.01))

In [57]:
from functools import partial

RegularizedDense = partial(tf.keras.layers.Dense, activation="relu", kernel_initializer="he_normal", kernel_regularizer=tf.keras.regularizers.l2(0.01))

model = tf.keras.Sequential([
    tf.keras.layers.InputLayer([28, 28]),
    tf.keras.layers.Flatten(),
    RegularizedDense(100),
    RegularizedDense(100),
    RegularizedDense(10, activation="softmax")
])

## Dropout

In [58]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer([28, 28]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(10, activation="softmax")
])

model.summary()