## Training neural networks
In this notebook we explore various Keras functions that are helpful for DNN training. We also note some interesting things about them.

In [3]:
from tensorflow import keras
import tensorflow as tf
import numpy as np

In [4]:
# Overly large model on purpose: flattened 32x32x3 inputs, then 20 hidden
# Dense layers (100 ELU units, He-normal init) and a 10-way softmax output.
dnn = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[32, 32, 3]),
    *[
        keras.layers.Dense(100, activation='elu', kernel_initializer='he_normal')
        for _ in range(20)
    ],
    keras.layers.Dense(10, activation='softmax'),
])

In [5]:
# load_data() returns a pair of (inputs, labels) tuples: the first is the
# training split, the second the test split.
dataset = keras.datasets.cifar10.load_data()
X_train_full, y_train_full = dataset[0]
X_test, y_test = dataset[1]

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [6]:
# The first 40,000 training samples are used for fitting; everything after
# them is held out as the validation set.
X_train, X_val = X_train_full[:40000], X_train_full[40000:]
y_train, y_val = y_train_full[:40000], y_train_full[40000:]
X_val.shape

(10000, 32, 32, 3)

### Early stopping
- Probably outdated.

In [111]:
# Reset Keras' global state and pin the TensorFlow / NumPy seeds for this run.
# NOTE(review): `dnn` is constructed in an earlier cell, so these seeds do not
# influence its initial weights -- confirm whether the model should be rebuilt
# here for a reproducible run.
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = keras.optimizers.Nadam(learning_rate=5e-4)
dnn.compile(optimizer=optimizer, 
            loss=keras.losses.sparse_categorical_crossentropy, 
            metrics=['accuracy'], 
            )

# Stop training once the monitored quantity (val_loss by default) has not
# improved for 10 consecutive epochs; epochs=100 is only an upper bound.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10)
dnn.fit(X_train, y_train,  epochs=100,
        validation_data=(X_val, y_val),
        callbacks = [early_stopping_cb]
       )

# Reports [loss, accuracy] on the validation split using the weights from the
# final epoch (restore_best_weights is not enabled on the callback).
dnn.evaluate(X_val, y_val)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100


[1.5680395364761353, 0.43470001220703125]

### Batch Normalization

- Note: this is essentially standardizing the inputs of *every layer* (per mini-batch), which enormously helps gradient descent. Therefore, normalization layers are included in almost every architecture now (e.g. Transformers, which use the closely related LayerNorm).

In [81]:
# Same layout as the plain network, but every hidden Dense layer is preceded
# by a BatchNormalization layer.
dnn2 = keras.models.Sequential()
dnn2.add(keras.layers.Flatten(input_shape=[32, 32, 3]))

for i in range(20):
    hidden_pair = (
        keras.layers.BatchNormalization(),
        keras.layers.Dense(100, activation='elu', kernel_initializer='he_normal'),
    )
    for layer in hidden_pair:
        dnn2.add(layer)

# 10-way softmax output head.
dnn2.add(keras.layers.Dense(10, activation='softmax'))

In [82]:
# Nadam is selected by name, so it runs with its default hyperparameters.
dnn2.compile(
    optimizer="nadam",
    loss=keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy'],
)

# Train for at most 30 epochs, stopping early after 10 epochs without
# improvement. The returned History object is the cell's displayed value.
dnn2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    callbacks=[keras.callbacks.EarlyStopping(patience=10)],
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1cb42d59130>

In [6]:
# Standardization statistics are computed along axis 0 of the training split
# only, then applied unchanged to the validation and test splits.
X_means = X_train.mean(axis=0)
X_stds = X_train.std(axis=0)
X_train_scaled, X_val_scaled, X_test_scaled = (
    (split - X_means) / X_stds for split in (X_train, X_val, X_test)
)

### Different nonlinearities 

In [7]:
# SELU
# Same 20x100 layout as the earlier models, but with SELU activations and
# LeCun-normal initialisation, trained on the standardized inputs.
dnn3 = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[32,32,3])
])

for i in range(20):
    dnn3.add(keras.layers.Dense(100, activation='selu', kernel_initializer='lecun_normal'))
    
dnn3.add(keras.layers.Dense(10, activation='softmax'))

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = keras.optimizers.Nadam(learning_rate=8e-4)
dnn3.compile(optimizer=optimizer, 
            loss=keras.losses.sparse_categorical_crossentropy, 
            metrics=['accuracy'], 
            )

# Stop once the monitored quantity (val_loss by default) has not improved for
# 10 consecutive epochs.
cb = keras.callbacks.EarlyStopping(patience=10)

# Fit and validate on the standardized splits; the returned History object is
# the cell's displayed value.
dnn3.fit(X_train_scaled, y_train,  epochs=100,
        validation_data=(X_val_scaled, y_val),
        callbacks = [cb]
       )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


<tensorflow.python.keras.callbacks.History at 0x266cc672610>

In [None]:
# Reset Keras' global state and pin the TensorFlow / NumPy seeds for this run.
# NOTE(review): `dnn` is built (and fitted) in earlier cells, so this cell
# continues from whatever weights it currently holds -- rebuild the model
# first if a from-scratch comparison is intended.
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# Higher learning rate than the earlier run (1e-2 vs 5e-4). No learning-rate
# decay is configured here; to add one, attach a scheduling callback such as
# keras.callbacks.LearningRateScheduler or keras.callbacks.ReduceLROnPlateau.
# `learning_rate` is the supported keyword (the `lr` alias is deprecated).
optimizer = keras.optimizers.Nadam(learning_rate=1e-2)
dnn.compile(optimizer=optimizer, 
            loss=keras.losses.sparse_categorical_crossentropy, 
            metrics=['accuracy'], 
            )

# Stop once the monitored quantity (val_loss by default) has not improved for
# 10 consecutive epochs; epochs=100 is only an upper bound.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10)
dnn.fit(X_train, y_train,  epochs=100,
        validation_data=(X_val, y_val),
        callbacks = [early_stopping_cb]
       )

# Reports [loss, accuracy] on the validation split.
dnn.evaluate(X_val, y_val)