The main objective of this activity is to provide a first hands-on experience with different optimization strategies. We'll train a neural network to recognize the handwritten digits of the MNIST dataset. Students will assess the convergence of the network using different optimizers and optimizer parameters.

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Fetch the MNIST train/test splits, then flatten every sample into a single
# row of 784 values so it can be fed to the Dense layers.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = (images.reshape(-1, 784) for images in (x_train, x_test))

In [None]:
num_classes = 10

# Dataset normalization: cast to float32 and divide by 255 so the inputs are
# small floating-point values instead of raw integer intensities.
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.

# One-hot encoding: row i gets a single 1 in the column given by label i.
# Integer-array indexing sets all rows at once instead of looping over every
# sample in Python.
y_train_h = np.zeros((np.size(y_train, 0), num_classes))
y_train_h[np.arange(np.size(y_train, 0)), y_train] = 1

y_test_h = np.zeros((np.size(y_test, 0), num_classes))
y_test_h[np.arange(np.size(y_test, 0)), y_test] = 1


Model creation

In [None]:
# Fully connected classifier: 256 (sigmoid) -> 512 (relu) -> 256 (relu) -> 10.
# No input shape is declared here, so the model is built on first use.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(256, activation='sigmoid'))
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(256, activation='relu'))
# Softmax (not sigmoid) on the output: the model is trained with
# CategoricalCrossentropy, which expects the 10 outputs to form a probability
# distribution over the classes (non-negative, summing to 1).
model.add(tf.keras.layers.Dense(10, activation='softmax'))

In [None]:
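# Alternative model definition using the Keras Functional API
# (uncomment this cell to use it instead of the Sequential model above).
#inputs = tf.keras.Input(shape=(784,))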
#x = tf.keras.layers.Dense(256, activation="relu")(inputs)
#x = tf.keras.layers.Dense(512, activation="relu")(x)
#x = tf.keras.layers.Dense(256, activation="relu")(x)
#outputs = tf.keras.layers.Dense(10, activation="sigmoid")(x)
#model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Training schedule.
batch_size = 128
epochs = 50

# Optimizer hyper-parameters (momentum and nest are only used by SGD below).
lr = 0.01
momentum = 0.01
nest = False  # set to True to enable Nesterov momentum

# Active optimizer. To try another one, comment this out and uncomment one of
# the alternatives underneath.
optimizer = tf.keras.optimizers.SGD(
    learning_rate=lr,
    momentum=momentum,
    nesterov=nest,
)
#optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
#optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
#optimizer = tf.keras.optimizers.Adagrad(learning_rate=lr)

# Loss for one-hot encoded targets, and the metric reported during training.
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = ['accuracy']

In [None]:
# Attach the chosen optimizer, loss and metrics to the model.
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Early stopping on validation accuracy (higher is better): give up after 10
# epochs without improvement and restore the best weights seen so far.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy", mode='max',
    patience=10, restore_best_weights=True,
    verbose=1,
)
callbacks = [early_stop]

In [None]:
# Train on the one-hot targets, holding out 20% of the training data for
# validation; the returned history is used by the plots further down.
history = model.fit(
    x_train, y_train_h,
    validation_split=0.2,
    epochs=epochs, batch_size=batch_size,
    callbacks=callbacks,
)

In [None]:
# Evaluate on the test split; left as a bare expression so the notebook
# displays the returned values.
model.evaluate(x_test, y_test_h)

In [None]:
# Accuracy per epoch: training curve first, validation curve second
# (the legend labels below rely on that order).
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('Model Accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()

In [None]:
# Loss per epoch: training curve first, validation curve second
# (the legend labels below rely on that order).
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

## Exercise:

Try other training parameters, following this table.

---

Student | Optimizer | Learning Rate
--------|------------|---------
1            |SGD                | 0.01
2    |SGD+Momentum       | 0.01
3             |Nesterov Momentum  | 0.01
4       |Adagrad            | 0.01
5                        |RMSProp            | 0.01
6                        |RMSProp            | 0.1
7               |Adam               | 0.01
8                           |Adam               | 0.0001
9             |Adam               | 0.1
10             |Adam               | 0.3
11             |SGD               | 0.3
12             |Adagrad               | 0.3
13             |RMSProp               | 0.3

---

Register your results in [this shared document](https://docs.google.com/document/d/1fYb2Rv36RDRX-K3OXkb3-YVZ0vGv5kb2DG6iahKB-_w/edit?usp=sharing).