In [1]:
import warnings
warnings.filterwarnings("ignore")

from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.optimizers import SGD

#### In this module, you learned the basics of deep learning and the fundamental architecture of artificial neural networks. During the examples in the checkpoints, you used a MNIST dataset. In this challenge, you'll work with another dataset: fashion MNIST. Using this dataset, do the following:

* Preprocess your data so that you can feed it into ANN models.

* Split your data into training and test sets.

* Try different ANN models and train them on your training set. You can play with the following:

    * Number of layers
    * Activation functions of the layers
    * Number of neurons in the layers
    * Different batch sizes during training

* Compare your models' training scores and interpret your results.

* Evaluate how your models perform on your test set. Compare the results of your models.



##### 1. Preprocess your data so that you can feed it into ANN models.

In [2]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()
input_dim = 784  # 28*28
output_dim = nb_classes = 10
batch_size = 128
nb_epoch = 20

##### 2. Split your data into training and test sets.

In [3]:
X_train = X_train.reshape(60000, input_dim)
X_test = X_test.reshape(10000, input_dim)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255

Y_train = to_categorical(y_train, nb_classes)
Y_test = to_categorical(y_test, nb_classes)

In [4]:
X_train[0].shape

(784,)

##### 3. Try different ANN models and train them on your training set. You can play with the following:

##### * Change number of layers

In [5]:
# Create the model with 3 layers
model_a1 = Sequential()

model_a1.add(Dense(128, input_shape=X_train[0].shape, activation="relu"))
model_a1.add(Dense(64, activation="relu"))
model_a1.add(Dense(output_dim, activation="softmax"))
model_a1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               100480    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                650       
Total params: 109,386
Trainable params: 109,386
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Compile the model
model_a1.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fit the model
model_a1.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epoch, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc3a0255cd0>

In [7]:
# Create the model with 5 layers
model_a2 = Sequential()

model_a2.add(Dense(128, input_shape=X_train[0].shape, activation="relu"))
model_a2.add(Dense(128, activation="relu"))
model_a2.add(Dense(64, activation="relu"))
model_a2.add(Dense(64, activation="relu"))
model_a2.add(Dense(output_dim, activation="softmax"))
model_a2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 128)               100480    
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_7 (Dense)              (None, 10)                650       
Total params: 130,058
Trainable params: 130,058
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Compile the model
model_a2.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model
model_a2.fit(X_train, Y_train, batch_size=batch_size, epochs=20, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc3a030de90>

##### * Change the activation funtion of the layers

In [9]:
# Create the model with the tanh activation function
model_b1 = Sequential()

model_b1.add(Dense(128, input_shape=X_train[0].shape, activation="tanh"))
model_b1.add(Dense(64, activation="tanh"))
model_b1.add(Dense(10, activation="softmax"))
model_b1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 128)               100480    
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_10 (Dense)             (None, 10)                650       
Total params: 109,386
Trainable params: 109,386
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Compile the model
model_b1.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model
model_b1.fit(X_train, Y_train, batch_size=batch_size, epochs=20, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc3807eeb90>

In [11]:
# Create the model with the sigmoid activation function
model_b2 = Sequential()

model_b2.add(Dense(128, input_shape=X_train[0].shape, activation="sigmoid"))
model_b2.add(Dense(64, activation="sigmoid"))
model_b2.add(Dense(10, activation="softmax"))
model_b2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 128)               100480    
_________________________________________________________________
dense_12 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_13 (Dense)             (None, 10)                650       
Total params: 109,386
Trainable params: 109,386
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Compile the model
model_b2.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model
model_b2.fit(X_train, Y_train, batch_size=batch_size, epochs=20, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc3808eead0>

##### * Change the number of neurons in the layers

In [13]:
# Create the model with more neurons
model_c1 = Sequential()

model_c1.add(Dense(256, input_shape=X_train[0].shape, activation="relu"))
model_c1.add(Dense(128, activation="relu"))
model_c1.add(Dense(10, activation="softmax"))
model_c1.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 256)               200960    
_________________________________________________________________
dense_15 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_16 (Dense)             (None, 10)                1290      
Total params: 235,146
Trainable params: 235,146
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Compile the model
model_c1.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model
model_c1.fit(X_train, Y_train, batch_size=batch_size, epochs=20, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc390c65990>

In [15]:
# Create the model with lots more neurons
model_c2 = Sequential()

model_c2.add(Dense(1024, input_shape=X_train[0].shape, activation="relu"))
model_c2.add(Dense(256, activation="relu"))
model_c2.add(Dense(10, activation="softmax"))
model_c2.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 1024)              803840    
_________________________________________________________________
dense_18 (Dense)             (None, 256)               262400    
_________________________________________________________________
dense_19 (Dense)             (None, 10)                2570      
Total params: 1,068,810
Trainable params: 1,068,810
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Compile the model
model_c2.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model
model_c2.fit(X_train, Y_train, batch_size=batch_size, epochs=20, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc3b5f60fd0>

##### * Change the training batch sizes

In [17]:
# Create the model with batch size of 64
model_d1 = Sequential()

model_d1.add(Dense(128, input_shape=X_train[0].shape, activation="relu"))
model_d1.add(Dense(64, activation="relu"))
model_d1.add(Dense(10, activation="softmax"))
model_d1.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_20 (Dense)             (None, 128)               100480    
_________________________________________________________________
dense_21 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_22 (Dense)             (None, 10)                650       
Total params: 109,386
Trainable params: 109,386
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Compile the model
model_d1.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model
model_d1.fit(X_train, Y_train, batch_size=64, epochs=20, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc391789e90>

In [19]:
# Create the model with batch size of 32
model_d2 = Sequential()

model_d2.add(Dense(128, input_shape=X_train[0].shape, activation="relu"))
model_d2.add(Dense(64, activation="relu"))
model_d2.add(Dense(10, activation="softmax"))
model_d2.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_23 (Dense)             (None, 128)               100480    
_________________________________________________________________
dense_24 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_25 (Dense)             (None, 10)                650       
Total params: 109,386
Trainable params: 109,386
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Compile the model
model_d2.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model
model_d2.fit(X_train, Y_train, batch_size=32, epochs=20, verbose=1)

Train on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc39172cad0>

##### 4. Compare your models' training scores and interpret your results.


In [21]:
# Training score of model with 3 layers
score = model_a1.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

# Score of model with 5 layers
score = model_a2.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

Train score: 0.16479001728643974
Train accuracy: 0.9532667
Train score: 0.1020082001067698
Train accuracy: 0.96996665


* The model with more layers performed better

In [22]:
# Training score of model with tanh activation function
score = model_b1.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

# Training score of model with sigmoid activation function
score = model_b2.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

Train score: 0.20993710252741973
Train accuracy: 0.94045
Train score: 0.5694485246419907
Train accuracy: 0.8576


* Both of these models performed worse than when using the relu activation function (above)

In [24]:
# Training score of model with more neurons (256, 128, 10)
score = model_c1.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

# Training score of model with lots more neurons (1024, 256, 10)
score = model_c2.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

Train score: 0.1511257340138157
Train accuracy: 0.95675
Train score: 0.13326519663259387
Train accuracy: 0.96323335


* The models with more neurons performed better, though the processing time was significantly longer

In [25]:
# Training score of model with a smaller batch size (64 vs 128)
score = model_d1.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

# Training score of model with an even smaller batch size (32)
score = model_d2.evaluate(X_train, Y_train, verbose=0)
print('Train score:', score[0])
print('Train accuracy:', score[1])

Train score: 0.10363129419299463
Train accuracy: 0.97
Train score: 0.05451198863511284
Train accuracy: 0.9856833


* The models with the smaller batch sizes performed much better
* The batch size seems to have a much larger effect on the performance than the other variables

##### 5. Evaluate how your models perform on your test set. Compare the results of your models.

In [26]:
# Test score of model with 3 layers
score = model_a1.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Test score of model with 5 layers
score = model_a2.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.16851898211017252
Test accuracy: 0.9512
Test score: 0.12009563096947969
Test accuracy: 0.9646


* Same as the training data, the model with more layers performed better

In [27]:
# Training score of model with tanh activation function
score = model_b1.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Training score of model with sigmoid activation function
score = model_b2.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.21214508360922338
Test accuracy: 0.9392
Test score: 0.5539725347518921
Test accuracy: 0.8622


* Same as the training data, the model with the relu activation function performed better than the other activation functions

In [29]:
# Training score of model with more neurons (256, 128, 10)
score = model_c1.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Training score of model with lots more neurons (1024, 256, 10)
score = model_c2.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.1562782148063183
Test accuracy: 0.9544
Test score: 0.13889799484536053
Test accuracy: 0.9588


* Same as the training data, the model with more neurons performed a bit better however, it did not have a significantly larger difference in performance

In [31]:
# Training score of model with a smaller batch size (64 vs 128)
score = model_d1.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Training score of model with an even smaller batch size (32)
score = model_d2.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.11894182193242013
Test accuracy: 0.9637
Test score: 0.08545311895292253
Test accuracy: 0.9742


* As with the training data, the models with the smaller batch sizes performed better
* The batch size definitely seems to have a much larger effect on the performance than the other variables