# 3 classic evaluation methods
- simple holdout validation
- k-fold validation
- iterated k-fold validation with shuffling

# holdout validation

In [None]:
# @title
# Simple holdout validation: reserve the first `num_validation_samples`
# entries of the shuffled data for validation and train on the rest.
num_validation_samples = 10000
np.random.shuffle(data)

training_data = data[num_validation_samples:]
validation_data = data[:num_validation_samples]

# Fit on the training part, score on the held-out part.
model = get_model()
model.fit(training_data, ...)
validation_score = model.evaluate(validation_data, ...)
# At this point: tune the model, retrain, evaluate, tune it again.

# Once tuned, train a fresh final model on all non-test data available.
model = get_model()
model.fit(np.concatenate((training_data, validation_data)), ...)
test_score = model.evaluate(test_data, ...)

# k-fold validation

In [None]:
# K-fold validation: split the shuffled data into k folds of equal size,
# validate on each fold in turn while training on everything else, and
# report the average of the k validation scores.
k = 3
num_validation_samples = len(data) // k
np.random.shuffle(data)
validation_scores = []

for fold in range(k):
    fold_start = num_validation_samples * fold
    fold_end = num_validation_samples * (fold + 1)
    validation_data = data[fold_start:fold_end]
    # np.concatenate expects ONE sequence of arrays; passing the two slices
    # as separate positional arguments would treat the second one as `axis`.
    training_data = np.concatenate([data[:fold_start], data[fold_end:]])
    # A fresh, untrained model is created for every fold.
    model = get_model()
    model.fit(training_data, ...)
    validation_score = model.evaluate(validation_data, ...)
    validation_scores.append(validation_score)

# Final validation score: mean over the k folds.
validation_score = np.average(validation_scores)

# Train the final model on all non-test data, then score it on the test set.
model = get_model()
model.fit(data, ...)
test_score = model.evaluate(test_data, ...)

# iterated k-fold validation with shuffling
- repeat k-fold validation several times, reshuffling the data before each split into k folds
- the final score is the average of the scores obtained across all runs


---

# improving model fit

# training an MNIST model with an incorrectly high learning rate

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist

# Load MNIST; only the first returned split is used, the second is discarded.
(train_images, train_labels), _ = mnist.load_data()
# Flatten every sample to a 28*28-long float32 vector and divide by 255.
train_images = train_images.reshape((60000, 28 * 28)).astype("float32") / 255

model = keras.Sequential()
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))

# The learning rate of 1.0 is deliberately too high for this demonstration.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.RMSprop(1.0),
    metrics=["accuracy"],
)

model.fit(
    train_images,
    train_labels,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.4606 - loss: 4220.6597 - val_accuracy: 0.2277 - val_loss: 2.1103
Epoch 2/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2322 - loss: 2.6278 - val_accuracy: 0.2079 - val_loss: 2.4779
Epoch 3/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1926 - loss: 2.7890 - val_accuracy: 0.1682 - val_loss: 2.2713
Epoch 4/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1794 - loss: 2.5972 - val_accuracy: 0.2011 - val_loss: 2.3246
Epoch 5/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2061 - loss: 2.2848 - val_accuracy: 0.2047 - val_loss: 2.2141
Epoch 6/10

<keras.src.callbacks.history.History at 0x781082878d10>

# same model with an appropriate learning rate

In [None]:
# Same two-layer architecture as before, rebuilt from scratch.
model = keras.Sequential()
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))

# Only the RMSprop learning rate differs: 1e-2 here.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.RMSprop(1e-2),
    metrics=["accuracy"],
)

model.fit(
    train_images,
    train_labels,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8369 - loss: 0.7959 - val_accuracy: 0.9523 - val_loss: 0.1651
Epoch 2/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9628 - loss: 0.1306 - val_accuracy: 0.9499 - val_loss: 0.1947
Epoch 3/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9729 - loss: 0.0985 - val_accuracy: 0.9603 - val_loss: 0.1859
Epoch 4/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9801 - loss: 0.0769 - val_accuracy: 0.9676 - val_loss: 0.1609
Epoch 5/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9846 - loss: 0.0565 - val_accuracy: 0.9715 - val_loss: 0.1523
Epoch 6/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9864 - loss: 0.0487 - val_accuracy: 0.9722 - val_loss: 0.1691
Epoch 7/10
[1m375/375[0m 

<keras.src.callbacks.history.History at 0x78108287ab40>

#

# a model that does not overfit

In [None]:
# A single softmax layer: the smallest model used on MNIST in these notes.
model = keras.Sequential()
model.add(layers.Dense(10, activation="softmax"))

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

# 20 epochs, with 20% of the training data held out for validation.
model.fit(
    train_images,
    train_labels,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7442 - loss: 1.0124 - val_accuracy: 0.9035 - val_loss: 0.3582
Epoch 2/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8998 - loss: 0.3664 - val_accuracy: 0.9162 - val_loss: 0.3096
Epoch 3/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9116 - loss: 0.3205 - val_accuracy: 0.9195 - val_loss: 0.2929
Epoch 4/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9142 - loss: 0.3054 - val_accuracy: 0.9211 - val_loss: 0.2853
Epoch 5/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9183 - loss: 0.2959 - val_accuracy: 0.9222 - val_loss: 0.2786
Epoch 6/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9205 - loss: 0.2833 - val_accuracy: 0.9245 - val_loss: 0.2741
Epoch 7/20
[1m375/375[0m 

<keras.src.callbacks.history.History at 0x7810675f40b0>

# try a bigger model

In [None]:
# Two hidden layers of 96 units in front of the 10-way softmax output.
model = keras.Sequential()
model.add(layers.Dense(96, activation="relu"))
model.add(layers.Dense(96, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

# Same training settings as the single-layer model, for a fair comparison.
model.fit(
    train_images,
    train_labels,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8203 - loss: 0.6185 - val_accuracy: 0.9377 - val_loss: 0.2142
Epoch 2/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9483 - loss: 0.1730 - val_accuracy: 0.9600 - val_loss: 0.1350
Epoch 3/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9649 - loss: 0.1173 - val_accuracy: 0.9665 - val_loss: 0.1151
Epoch 4/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9741 - loss: 0.0873 - val_accuracy: 0.9682 - val_loss: 0.1110
Epoch 5/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9796 - loss: 0.0712 - val_accuracy: 0.9708 - val_loss: 0.0998
Epoch 6/20
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9830 - loss: 0.0582 - val_accuracy: 0.9724 - val_loss: 0.0954
Epoch 7/20
[1m375/375[0m 

<keras.src.callbacks.history.History at 0x781060153b30>

observe that overfitting began after the 7th epoch: the validation loss started to increase

# regularizing the model
- make the model more general, so that it is not specific to the training data

# reduce the network size to prevent overfitting

# original model

In [None]:
import numpy as np
from tensorflow.keras.datasets import imdb
# Load the IMDB data; only the first returned split is kept, the second is
# discarded. num_words=10000 matches the default `dimension` of
# vectorize_sequences below.
# NOTE: this rebinds train_labels, which previously held the MNIST labels.
(train_data, train_labels), _ = imdb.load_data(num_words=10000)

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode a collection of integer index sequences.

    Args:
        sequences: Sized collection whose items are collections of integer
            indices; each index is used as a column position in the output.
        dimension: Number of columns in the output matrix.

    Returns:
        A float array of shape (len(sequences), dimension) holding 1.0 at
        every (row, index) position named by the matching sequence and 0.0
        everywhere else.
    """
    encoded = np.zeros((len(sequences), dimension))
    for row, word_indices in enumerate(sequences):
        # Fancy indexing sets all listed columns of this row at once.
        encoded[row, word_indices] = 1.0
    return encoded

# Replace the raw index sequences with their multi-hot encoded matrix.
train_data = vectorize_sequences(train_data)

# Baseline: two 16-unit hidden layers and a single sigmoid output.
model = keras.Sequential()
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

# 40% of the training data is held out for validation.
model.fit(
    train_data,
    train_labels,
    batch_size=512,
    epochs=20,
    validation_split=0.4,
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 113ms/step - accuracy: 0.6732 - loss: 0.6195 - val_accuracy: 0.8585 - val_loss: 0.4165
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.8892 - loss: 0.3586 - val_accuracy: 0.8818 - val_loss: 0.3222
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9215 - loss: 0.2570 - val_accuracy: 0.8813 - val_loss: 0.3053
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9379 - loss: 0.1988 - val_accuracy: 0.8855 - val_loss: 0.2866
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9486 - loss: 0.1617 - val_accuracy: 0.8802 - val_loss: 0.3010
Epoch 6/20
[1m30/

<keras.src.callbacks.history.History at 0x781082645550>

observe that overfitting began after the 5th epoch

# replace this model with a smaller one

In [None]:
# Lower-capacity variant: 4 units per hidden layer instead of 16.
model = keras.Sequential()
model.add(layers.Dense(4, activation="relu"))
model.add(layers.Dense(4, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

# Training settings are identical to the 16-unit model.
model.fit(
    train_data,
    train_labels,
    batch_size=512,
    epochs=20,
    validation_split=0.4,
)

Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 102ms/step - accuracy: 0.6077 - loss: 0.6651 - val_accuracy: 0.8271 - val_loss: 0.5508
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8590 - loss: 0.5119 - val_accuracy: 0.8436 - val_loss: 0.4612
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8890 - loss: 0.4133 - val_accuracy: 0.8658 - val_loss: 0.3937
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9064 - loss: 0.3445 - val_accuracy: 0.8838 - val_loss: 0.3497
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9141 - loss: 0.2939 - val_accuracy: 0.8851 - val_loss: 0.3208
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9273 - loss: 0.2521 - val_accuracy: 0.8893 - val_loss: 0.2988
Epoch 7/20
[1m30/30[0m [32m━━━

<keras.src.callbacks.history.History at 0x7810827da810>

observe that overfitting began after the 7th epoch

# version of the model with higher capacity

In [None]:
# Higher-capacity variant: 512 units per hidden layer instead of 16.
model = keras.Sequential()
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

# Training settings are identical to the 16-unit model.
model.fit(
    train_data,
    train_labels,
    batch_size=512,
    epochs=20,
    validation_split=0.4,
)

Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 97ms/step - accuracy: 0.6671 - loss: 0.6464 - val_accuracy: 0.8199 - val_loss: 0.4007
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8438 - loss: 0.3597 - val_accuracy: 0.8749 - val_loss: 0.2972
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9034 - loss: 0.2384 - val_accuracy: 0.8874 - val_loss: 0.2792
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.9256 - loss: 0.1911 - val_accuracy: 0.8310 - val_loss: 0.3911
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.9500 - loss: 0.1346 - val_accuracy: 0.8559 - val_loss: 0.4084
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9471 - loss: 0.1307 - val_accuracy: 0.8889 - val_loss: 0.3195
Epoch 7/20
[1m30/30[0m [32m━━━━

<keras.src.callbacks.history.History at 0x78107a37f3b0>

observe that overfitting began after just one epoch

# adding L2 weight regularization to the initial movie model
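- add a cost for large weights to the loss (L2: proportional to the squared weight values), which pushes the weights toward smaller, more regular values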

In [None]:
from tensorflow.keras import regularizers
# The 16-unit model with an L2 penalty (factor 0.002) on the kernel of each
# hidden layer; the output layer is left unregularized.
model = keras.Sequential()
model.add(
    layers.Dense(
        16,
        activation="relu",
        kernel_regularizer=regularizers.l2(0.002),
    )
)
model.add(
    layers.Dense(
        16,
        activation="relu",
        kernel_regularizer=regularizers.l2(0.002),
    )
)
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

model.fit(
    train_data,
    train_labels,
    batch_size=512,
    epochs=20,
    validation_split=0.4,
)

Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 94ms/step - accuracy: 0.6854 - loss: 0.7053 - val_accuracy: 0.8438 - val_loss: 0.5079
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8805 - loss: 0.4514 - val_accuracy: 0.8773 - val_loss: 0.4032
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9059 - loss: 0.3567 - val_accuracy: 0.8858 - val_loss: 0.3689
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9251 - loss: 0.3046 - val_accuracy: 0.8878 - val_loss: 0.3571
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9332 - loss: 0.2817 - val_accuracy: 0.8881 - val_loss: 0.3531
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.9407 - loss: 0.2611 - val_accuracy: 0.8845 - val_loss: 0.3559
Epoch 7/20
[1m30/30[0m [32m━━━━

<keras.src.callbacks.history.History at 0x780fbd4737d0>

# dropout

# add dropout to the model

In [None]:
# The 16-unit model with a Dropout(0.5) layer after each hidden layer.
model = keras.Sequential()
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

model.fit(
    train_data,
    train_labels,
    batch_size=512,
    epochs=20,
    validation_split=0.4,
)

Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 107ms/step - accuracy: 0.5778 - loss: 0.6645 - val_accuracy: 0.8422 - val_loss: 0.5608
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7499 - loss: 0.5524 - val_accuracy: 0.8644 - val_loss: 0.4574
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.8126 - loss: 0.4755 - val_accuracy: 0.8789 - val_loss: 0.3869
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.8502 - loss: 0.4167 - val_accuracy: 0.8798 - val_loss: 0.3460
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8784 - loss: 0.3619 - val_accuracy: 0.8772 - val_loss: 0.3263
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8938 - loss: 0.3238 - val_accuracy: 0.8840 - val_loss: 0.3029
Epoch 7/20
[1m30/30[0m [32m━━━

<keras.src.callbacks.history.History at 0x780fbd42ab10>