8. Deep Learning.
   
a. Build a DNN with five hidden layers of 100 neurons each, He initialization,
and the ELU activation function.

b. Using Adam optimization and early stopping, try training it on MNIST but
only on digits 0 to 4, as we will use transfer learning for digits 5 to 9 in the
next exercise. You will need a softmax output layer with five neurons, and as
always make sure to save checkpoints at regular intervals and save the final
model so you can reuse it later.

c. Tune the hyperparameters using cross-validation and see what precision you
can achieve.

d. Now try adding Batch Normalization and compare the learning curves: is it
converging faster than before? Does it produce a better model?

e. Is the model overfitting the training set? Try adding dropout to every layer
and try again. Does it help?


In [1]:
# Import the libraries
import tensorflow as tf
from tensorflow import keras
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Let's get the MNIST DATA
mnist = keras.datasets.mnist
(X_train_full, y_train_full), (X_test_full, y_test_full) = mnist.load_data()

In [3]:
# That has the whole datset with features 0 to 9, now we have to split the dataset by
# the features from 0 to 4, 5 to 9  so let's see how to do that

# Check what your're working with
print(type(X_train_full)) 
print(type(y_train_full))
print(y_train_full.shape)
print(X_train_full.shape)
#print(X_train_full[0, :])
#print(y_train_full[0])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(60000,)
(60000, 28, 28)


In [4]:
# Okay, its a numpy array so find how to split it 

def split_data_by_labels(x_data, y_data):
    # create an empty list for the new datas
    X_split_04 = []
    X_split_59 = []
    y_split_04 = []
    y_split_59 = []

    # Split by the labels
    for i, j in zip(x_data, y_data):
        if j in [0, 1, 2, 3, 4]:
            X_split_04.append(i)
            y_split_04.append(j)
        else:
            X_split_59.append(i)
            y_split_59.append(j)

    # change back to numpy array
    X_split_04 = np.array(X_split_04)
    X_split_59 = np.array(X_split_59)
    y_split_04 = np.array(y_split_04)
    y_split_59 = np.array(y_split_59)

    return X_split_04, X_split_59, y_split_04, y_split_59

# Split Test and train data
X_train_04, X_train_59, y_train_04, y_train_59 = split_data_by_labels(X_train_full, y_train_full)
X_test_04, X_test_59, y_test_04, y_test_59 = split_data_by_labels(X_test_full, y_test_full)

# Check
print(X_train_04.shape)
print(X_train_59.shape)
print(y_train_04.shape)
print(y_train_59.shape)

(30596, 28, 28)
(29404, 28, 28)
(30596,)
(29404,)


In [5]:
# Split train set into training and validation sets
X_train_04, X_val_04, y_train_04, y_val_04 = train_test_split(X_train_04, y_train_04, test_size=0.1)
X_train_59, X_val_59, y_train_59, y_val_59 = train_test_split(X_train_59, y_train_59, test_size=0.1)

# Check
print(X_train_04.shape)
print(X_val_04.shape)
print(y_train_59.shape)
print(y_val_59.shape)

(27536, 28, 28)
(3060, 28, 28)
(26463,)
(2941,)


In [6]:
# Preprocessing step
# Normalize the images to be between 0 to 1
X_train_04 = X_train_04 / 255.0
X_val_04 = X_val_04 / 255.0
X_test_04 = X_test_04 / 255.0
X_train_59 = X_train_59 / 255.0
X_val_59 = X_val_59 / 255.0
X_test_59 = X_test_59 / 255.0

In [7]:
# Okay so starting with 0 to 4 to build the base model

# Let's build a sequential model cause...yeah
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]), # Input layer, flatten because MNIST
    # 5 hidden layers
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
    keras.layers.Dense(5, activation="softmax") # Since we are training from 0 to 4 of the MNIST data, we are using 5 output neurons, then softmax for activation good for classification
])
    

  super().__init__(**kwargs)


In [8]:
# Check the details of the model
model.summary()

In [9]:
# Compile
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [10]:
# Set up the callback that you will be using to fit
# Here we are using EarlyStopping and ModelCheckpoint
my_callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), # so that when it stops improving after 10 epochs, it would stop
    keras.callbacks.ModelCheckpoint("mnist_04_bm.keras", save_best_only=True), # this saves the model at regular intervals but at the epochs with best results
]
    

# Now fit the model
history = model.fit(x=X_train_04, y=y_train_04, epochs=100, validation_data=(X_val_04, y_val_04), callbacks=my_callbacks)

Epoch 1/100
[1m861/861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 9ms/step - accuracy: 0.9349 - loss: 0.1912 - val_accuracy: 0.9797 - val_loss: 0.0659
Epoch 2/100
[1m861/861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9829 - loss: 0.0530 - val_accuracy: 0.9791 - val_loss: 0.0673
Epoch 3/100
[1m861/861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9894 - loss: 0.0328 - val_accuracy: 0.9843 - val_loss: 0.0469
Epoch 4/100
[1m861/861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.9900 - loss: 0.0323 - val_accuracy: 0.9879 - val_loss: 0.0426
Epoch 5/100
[1m861/861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - accuracy: 0.9925 - loss: 0.0235 - val_accuracy: 0.9837 - val_loss: 0.0583
Epoch 6/100
[1m861/861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.9914 - loss: 0.0251 - val_accuracy: 0.9892 - val_loss: 0.0360
Epoch 7/100
[1m

In [11]:
# save the model
model.save('mnist_04_model.keras')

Tune the hyperparameters using cross-validation and see what precision you
can achieve.

In [12]:
# This is giving me a bit of a pause.
# a lot of things have beem deprecated from keras so idk what to use
# I'm going to try randomized search cv and hope it works

# First define a function to build a model, idk i already have a model so wont it be like creating a new model?
def build_model(learning_rate=1e-2):
    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=[28,28]), # Input layer, flatten because MNIST
        # 5 hidden layers
        keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
        keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
        keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
        keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"), 
        keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
        keras.layers.Dense(5, activation="softmax") # Since we are training from 0 to 4 of the MNIST data, we are using 5 output neurons, then softmax for activation good for classification
    ])

    # Compile the model with a specified learning rate
    optimizer = keras.optimizers.Adam(learning_rate)
    model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    return model
    

In [13]:
model=build_model()
model.summary()

In [14]:
from sklearn.model_selection import KFold

my_callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), # so that when it stops improving after 10 epochs, it would stop
    keras.callbacks.ModelCheckpoint("mnist_04_tm.keras", save_best_only=True), # this saves the model at regular intervals but at the epochs with best results
]

# Define the hyperparameter grid
param_grid = {
    "learning_rate": np.logspace(-4, -1, 4), # Learning rate between 3e-4 and 3e-2
    "batch_size": np.arange(16, 65, 16)
}

# Initialize KFold for cross-validation
kf=KFold(n_splits=3)

# Store the results of hyperparameter tuning
results = []

In [15]:
#print(set(y_train_04))

In [16]:
#print(X_train_04.shape)

In [17]:
#print(y_train_04.shape)

**Best Hyperparameters: {'learning_rate': np.float64(0.1), 'batch_size': np.int64(16), 'avg_val_score': np.float64(0.7426687677701315)}**

In [20]:
print(results)

[{'learning_rate': np.float64(0.0001), 'batch_size': np.int64(16), 'avg_val_score': np.float64(0.6724885640045007)}, {'learning_rate': np.float64(0.0001), 'batch_size': np.int64(32), 'avg_val_score': np.float64(0.6722778417170048)}, {'learning_rate': np.float64(0.0001), 'batch_size': np.int64(48), 'avg_val_score': np.float64(0.6721367090940475)}, {'learning_rate': np.float64(0.0001), 'batch_size': np.int64(64), 'avg_val_score': np.float64(0.6721557701627413)}, {'learning_rate': np.float64(0.001), 'batch_size': np.int64(16), 'avg_val_score': np.float64(0.6743382774293423)}, {'learning_rate': np.float64(0.001), 'batch_size': np.int64(32), 'avg_val_score': np.float64(0.6721340281267961)}, {'learning_rate': np.float64(0.001), 'batch_size': np.int64(48), 'avg_val_score': np.float64(0.6737084810932478)}, {'learning_rate': np.float64(0.001), 'batch_size': np.int64(64), 'avg_val_score': np.float64(0.6732814150551955)}, {'learning_rate': np.float64(0.01), 'batch_size': np.int64(16), 'avg_val_sc

Now try adding Batch Normalization and compare the learning curves: is it converging faster than before? Does it produce a better model?


In [24]:
# I didnt add the Tensorboard in the model before so idk how to check the learning curve now
# and i am not going back it took too long to run
# where does time go :(

# On with BatchNormalization
model = keras.models.Sequential([
    # Flatten input images
    keras.layers.Flatten(input_shape=[28, 28]),

    # BN before activation
    keras.layers.BatchNormalization(), 

    # Dense layer with no bias term (because BN includes bias)
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),

    # Activation function before BN
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # Final BN and output layer
    keras.layers.BatchNormalization(),
    keras.layers.Dense(5, activation="softmax")

])
                  

In [25]:
# Compile the model with a specified learning rate from CV
optimizer = keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [26]:
my_callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), # so that when it stops improving after 10 epochs, it would stop
    keras.callbacks.ModelCheckpoint("mnist_04_BN.keras", save_best_only=True), # this saves the model at regular intervals but at the epochs with best results
    keras.callbacks.TensorBoard("logs")
]
    

# Now fit the model
history = model.fit(x=X_train_04, y=y_train_04, epochs=100, validation_data=(X_val_04, y_val_04), callbacks=my_callbacks, batch_size=16)

Epoch 1/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.8394 - loss: 0.6040 - val_accuracy: 0.9350 - val_loss: 0.2582
Epoch 2/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.9186 - loss: 0.3688 - val_accuracy: 0.9042 - val_loss: 355294.5000
Epoch 3/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.9271 - loss: 0.3191 - val_accuracy: 0.9304 - val_loss: 913260352.0000
Epoch 4/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9521 - loss: 0.1840 - val_accuracy: 0.9654 - val_loss: 270036512.0000
Epoch 5/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9431 - loss: 0.2967 - val_accuracy: 0.9683 - val_loss: 306993760.0000
Epoch 6/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.9547 - loss: 0.2027 - val_accuracy: 0

In [29]:
%load_ext tensorboard

In [30]:
%tensorboard --logdir=logs

e. Is the model overfitting the training set? Try adding dropout to every layer
and try again. Does it help?

In [33]:
# Adding Dropout to the model
model = keras.models.Sequential([
    # Flatten input images
    keras.layers.Flatten(input_shape=[28, 28]),

    # BN before activation
    keras.layers.BatchNormalization(), 

    # Apply a dropout rate of 20%
    keras.layers.Dropout(rate=0.2),

    # Dense layer with no bias term (because BN includes bias)
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),

    # Activation function before BN
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # BN again before Dense layer
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.Activation("elu"),

    # Final BN and output layer
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(5, activation="softmax")

])
   

In [34]:
# Compile the model with a specified learning rate from CV
optimizer = keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [35]:
my_callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), # so that when it stops improving after 10 epochs, it would stop
    keras.callbacks.ModelCheckpoint("mnist_04_DO.keras", save_best_only=True), # this saves the model at regular intervals but at the epochs with best results
    keras.callbacks.TensorBoard("logs/fit/do")
]
    

# Now fit the model
history = model.fit(x=X_train_04, y=y_train_04, epochs=100, validation_data=(X_val_04, y_val_04), callbacks=my_callbacks, batch_size=16)

Epoch 1/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 7ms/step - accuracy: 0.7840 - loss: 0.8519 - val_accuracy: 0.9529 - val_loss: 0.3533
Epoch 2/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.8536 - loss: 0.6266 - val_accuracy: 0.9637 - val_loss: 0.2482
Epoch 3/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.8831 - loss: 0.5155 - val_accuracy: 0.9634 - val_loss: 5.8047
Epoch 4/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9054 - loss: 0.3858 - val_accuracy: 0.9585 - val_loss: 15891.5703
Epoch 5/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.8937 - loss: 0.4785 - val_accuracy: 0.9670 - val_loss: 6.7902
Epoch 6/100
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.9059 - loss: 0.4513 - val_accuracy: 0.9686 - val_loss: 14501.1

In [36]:
%tensorboard --logdir=logs/fit/do

In [37]:
# will be using this as the base model
# save the model
model.save('mnist_04_model.keras')

9. Transfer learning.
a. Create a new DNN that reuses all the pretrained hidden layers of the previous
model, freezes them, and replaces the softmax output layer with a new one.
b. Train this new DNN on digits 5 to 9, using only 100 images per digit, and time
how long it takes. Despite this small number of examples, can you achieve
high precision?
c. Try caching the frozen layers, and train the model again: how much faster is it
now?
d. Try again reusing just four hidden layers instead of five. Can you achieve a
higher precision?
e. Now unfreeze the top two hidden layers and continue training: can you get
the model to perform even better?


In [84]:
# let the base model be the current model
base_model = model
# Clone the base model
cloned_base = keras.models.clone_model(base_model)
# Copy its weigts as well, since clone_model does not clone the weights
cloned_base.set_weights(base_model.get_weights())

In [85]:
# Create a new Sequential model
new_model = keras.models.Sequential()

# Add the layers from the cloned base model (excluding the final output layer)
for layer in cloned_base.layers[:-1]: # Loop through all layers except the last one
    new_model.add(layer)
    layer.trainable = False # Freeze the layers

# Add a new outplut layer
new_model.add(keras.layers.Dense(5, activation="softmax"))

In [86]:
# Compile the model: You must always compile after you freeze or unfreeze layers
new_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

b. Train this new DNN on digits 5 to 9, using only 100 images per digit, and time how long it takes. Despite this small number of examples, can you achieve high precision? 

In [87]:
# We only need to get 100 of images per digit, let's get the data we are using
print(len(X_train_59))
print(len(X_val_59))
print(len(X_test_59))
print(len(y_train_59))
print(len(y_val_59))
print(len(y_test_59))


26463
2941
4861
26463
2941
4861


In [88]:
# So I am trying to save 100 images per digit in a new list
X_train_100 = []
y_train_100 = []

# Create a dictionary to track the number of images per digit
digit_counts = {5:0, 6:0, 7:0, 8:0, 9:0}

for i, j in zip(X_train_59, y_train_59):
    if digit_counts[j] < 100:
        X_train_100.append(i)
        y_train_100.append(j)
        digit_counts[j] += 1

# Change back to numpy array
X_train_100 = np.array(X_train_100)
y_train_100 = np.array(y_train_100)

# Remap the y labels to be 0 to 4 cause of sparse
y_train_59r = y_train_59 - 5
y_test_59r = y_test_59 - 5
y_val_59r = y_val_59 - 5
y_train_100r = y_train_100 - 5



# Check
print(len(X_train_100))
print(len(y_train_100)) 
print(type(X_train_100))

500
500
<class 'numpy.ndarray'>


In [89]:
print(digit_counts)

{5: 100, 6: 100, 7: 100, 8: 100, 9: 100}


In [90]:
# Now we are training with this and time how long it takes to train
import time

# Callbacks
my_callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), # so that when it stops improving after 10 epochs, it would stop
    keras.callbacks.ModelCheckpoint("mnist_59.keras", save_best_only=True), # this saves the model at regular intervals but at the epochs with best results
    keras.callbacks.TensorBoard("logs/fit/59")
]
   

start_time = time.time()
# Fit
history = new_model.fit(x=X_train_100, y=y_train_100r, epochs=100, validation_data=(X_val_59, y_val_59r), callbacks=my_callbacks)
end_time = time.time()

print(f"Time it used to train: {end_time - start_time}")

Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 141ms/step - accuracy: 0.2052 - loss: 1.7402 - val_accuracy: 0.2781 - val_loss: 1.5794
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 86ms/step - accuracy: 0.2720 - loss: 1.6301 - val_accuracy: 0.3937 - val_loss: 1.5110
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 92ms/step - accuracy: 0.3586 - loss: 1.7440 - val_accuracy: 0.4539 - val_loss: 1.4524
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 109ms/step - accuracy: 0.3754 - loss: 1.5819 - val_accuracy: 0.4726 - val_loss: 1.4083
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 91ms/step - accuracy: 0.4331 - loss: 1.3876 - val_accuracy: 0.4855 - val_loss: 1.3717
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 102ms/step - accuracy: 0.4684 - loss: 1.4279 - val_accuracy: 0.4947 - val_loss: 1.3382
Epoch 7/100
[1m16/16[0m

In [91]:
%tensorboard --logdir=logs/fit/59

Reusing TensorBoard on port 6008 (pid 26852), started 2:23:15 ago. (Use '!kill 26852' to kill it.)

c. Try caching the frozen layers, and train the model again: how much faster is it
now?
Tensorflow already does this automatically so idk


d. Try again reusing just four hidden layers instead of five. Can you achieve a
higher precision?


In [92]:
new_model.summary()

In [93]:
print(len(new_model.layers))

24


In [94]:
new_model.layers

[<Flatten name=flatten_51, built=True>,
 <BatchNormalization name=batch_normalization_6, built=True>,
 <Dropout name=dropout, built=True>,
 <Dense name=dense_306, built=True>,
 <Activation name=activation_5, built=True>,
 <BatchNormalization name=batch_normalization_7, built=True>,
 <Dropout name=dropout_1, built=True>,
 <Dense name=dense_307, built=True>,
 <Activation name=activation_6, built=True>,
 <BatchNormalization name=batch_normalization_8, built=True>,
 <Dropout name=dropout_2, built=True>,
 <Dense name=dense_308, built=True>,
 <Activation name=activation_7, built=True>,
 <BatchNormalization name=batch_normalization_9, built=True>,
 <Dropout name=dropout_3, built=True>,
 <Dense name=dense_309, built=True>,
 <Activation name=activation_8, built=True>,
 <BatchNormalization name=batch_normalization_10, built=True>,
 <Dropout name=dropout_4, built=True>,
 <Dense name=dense_310, built=True>,
 <Activation name=activation_9, built=True>,
 <BatchNormalization name=batch_normalization_

In [99]:
# take out one hidden layer from the model,
# i will also take out the corresponding BN, Dropout and activation layers for that "one layer"
# Create a new model
old_model = new_model
new_model = keras.models.Sequential()
#add layers you want to keep
for index in range(len(old_model.layers)):
    if index not in {17, 18, 19, 20}:
        new_model.add(old_model.layers[index])

new_model.summary()

In [101]:
print(len(new_model.layers))

20


In [102]:
# Now compile and train again
new_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [104]:
#Now we are training with this and time how long it takes to train
import time

# Callbacks
my_callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), # so that when it stops improving after 10 epochs, it would stop
    keras.callbacks.ModelCheckpoint("mnist_59.keras", save_best_only=True), # this saves the model at regular intervals but at the epochs with best results
    keras.callbacks.TensorBoard("logs/fit/59")
]
   

start_time = time.time()
# Fit
history = new_model.fit(x=X_train_100, y=y_train_100r, epochs=100, validation_data=(X_val_59, y_val_59r), callbacks=my_callbacks)
end_time = time.time()

print(f"Time it used to train: {end_time - start_time}")

Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 112ms/step - accuracy: 0.2170 - loss: 488.9446 - val_accuracy: 0.2520 - val_loss: 395.9422
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step - accuracy: 0.2672 - loss: 430.2499 - val_accuracy: 0.2737 - val_loss: 345.8837
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - accuracy: 0.2796 - loss: 388.3739 - val_accuracy: 0.2815 - val_loss: 295.4886
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - accuracy: 0.2411 - loss: 335.0635 - val_accuracy: 0.2894 - val_loss: 249.5870
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 85ms/step - accuracy: 0.2813 - loss: 259.3115 - val_accuracy: 0.2914 - val_loss: 208.3234
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - accuracy: 0.2631 - loss: 231.1711 - val_accuracy: 0.2975 - val_loss: 169.2914
Epo

In [105]:
%tensorboard --logdir=logs/fit/59

Reusing TensorBoard on port 6008 (pid 26852), started 2:41:23 ago. (Use '!kill 26852' to kill it.)

e. Now unfreeze the top two hidden layers and continue training: can you get
the model to perform even better?


In [106]:
# Unfreeze te top 2 layers
for layer in new_model.layers[:5]:
    layer.trainable = True

new_model.summary()

In [107]:
# Now compile and train again
new_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [108]:
# Callbacks
my_callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), # so that when it stops improving after 10 epochs, it would stop
    keras.callbacks.ModelCheckpoint("mnist_59.keras", save_best_only=True), # this saves the model at regular intervals but at the epochs with best results
    keras.callbacks.TensorBoard("logs/fit/59")
]
   

start_time = time.time()
# Fit
history = new_model.fit(x=X_train_100, y=y_train_100r, epochs=100, validation_data=(X_val_59, y_val_59r), callbacks=my_callbacks)
end_time = time.time()

Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 121ms/step - accuracy: 0.4431 - loss: 1.9344 - val_accuracy: 0.5406 - val_loss: 1.8473
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step - accuracy: 0.4847 - loss: 1.7634 - val_accuracy: 0.5209 - val_loss: 1.9847
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 94ms/step - accuracy: 0.4445 - loss: 1.6019 - val_accuracy: 0.4128 - val_loss: 1.7073
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 81ms/step - accuracy: 0.4289 - loss: 1.6673 - val_accuracy: 0.5400 - val_loss: 1.2428
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 74ms/step - accuracy: 0.4382 - loss: 1.5554 - val_accuracy: 0.5342 - val_loss: 1.3785
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 81ms/step - accuracy: 0.4775 - loss: 1.3549 - val_accuracy: 0.6012 - val_loss: 1.1158
Epoch 7/100
[1m16/16[0m 

10. Pretraining on an auxiliary task.
a. In this exercise you will build a DNN that compares two MNIST digit images
and predicts whether they represent the same digit or not. Then you will reuse
the lower layers of this network to train an MNIST classifier using very little
training data. Start by building two DNNs (let’s call them DNN A and B), both
similar to the one you built earlier but without the output layer: each DNN
should have five hidden layers of 100 neurons each, He initialization, and ELU
activation. Next, add one more hidden layer with 10 units on top of both
DNNs. To do this, you should use a keras.layers.Concatenate layer to con‐
catenate the outputs of both DNNs for each instance, then feed the result to
the hidden layer. Finally, add an output layer with a single neuron using the
logistic activation function.
b. Split the MNIST training set in two sets: split #1 should containing 55,000
images, and split #2 should contain contain 5,000 images. Create a function
that generates a training batch where each instance is a pair of MNIST images
picked from split #1. Half of the training instances should be pairs of images
that belong to the same class, while the other half should be images from dif‐
ferent classes. For each pair, the training label should be 0 if the images are
from the same class, or 1 if they are from different classes.
c. Train the DNN on this training set. For each image pair, you can simultane‐
ously feed the first image to DNN A and the second image to DNN B. The
whole network will gradually learn to tell whether two images belong to the
same class or not.
d. Now create a new DNN by reusing and freezing the hidden layers of DNN A
and adding a softmax output layer on top with 10 neurons. Train this network
on split #2 and see if you can achieve high performance despite having only
500 images per class.


In [116]:
# Start by building the models with 5 hidden layers, 100 neurons, he ini and elu activation
# Use functional API
# DNN-A
input_a = keras.layers.Input(shape=[28, 28])
flatten_a = keras.layers.Flatten()(input_a)
hidden1a = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(flatten_a)
hidden2a = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden1a)
hidden3a = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden2a)
hidden4a = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden3a)
hidden5a = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden4a)

# DNN-B
input_b = keras.layers.Input(shape=[28, 28])
flatten_b = keras.layers.Flatten()(input_b)
hidden1b = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(flatten_a)
hidden2b = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden1b)
hidden3b = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden2b)
hidden4b = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden3b)
hidden5b = keras.layers.Dense(100, kernel_initializer="he_normal", activation="elu")(hidden4b)

# Concatenate DNN A and B
concat = keras.layers.Concatenate()([hidden5a, hidden5b])

# Add a new hidden layer
hidden6 = keras.layers.Dense(10)(concat)

# Output layer
output = keras.layers.Dense(1, activation="sigmoid")(hidden6)

# Build the model
f_model = keras.models.Model(inputs=[input_a, input_b], outputs=[output])

b. Split the MNIST training set in two sets: split #1 should containing 55,000 images, and split #2 should contain contain 5,000 images. Create a function that generates a training batch where each instance is a pair of MNIST images picked from split #1. Half of the training instances should be pairs of images that belong to the same class, while the other half should be images from dif‐ ferent classes. For each pair, the training label should be 0 if the images are from the same class, or 1 if they are from different classes. 

In [117]:
print(len(X_train_full))
print(len(y_train_full))

60000
60000


In [120]:
X_train_1 = X_train_full[:55000]
X_train_2 = X_train_full[55000:]
y_train_1 = y_train_full[:55000]
y_train_2 = y_train_full[55000:]
print(len(X_train_1), len(X_train_2))

55000 5000


In [121]:
print(y_train_1[:5])

[5 0 4 1 9]


In [None]:
# use the sorted indices
sorted_indices = np.argsort(y_train)

# then i wil use it to split
sorted_right = sorted_indices[1::2]
sorted_left = sorted_indices[2::2]
# i wanna add the first index to the second one
sorted_left = np.append(sorted_left, sorted_indices[0])


# now create the pair
pair_indices = []
for pair in zip (sorted_right, sorted_left):
    pair_indices.append(pair)

# This way because its sorted the pairs would be half the same and half different

# change it back to a np.array
pair_indices = np.array(pair_indices)

# now get the corresponding data of the pair
y_train1_pair = y_train_1[[pair_indices]
x_train1_pair = X_train_1[pair_indices]

# Now get the new labels for x_train1_pair. if the same number = 0, different numbers = 1
new_ytrain1 = []
for i, j in y_train1_pair:
    if i == j:
        new_ytrain1.append(0)
    else:
        new_ytrain1.append(1)

    