# Initial Run - No Changes

In [1]:
from __future__ import print_function 
import numpy as np 
from keras.datasets import mnist 
from keras.models import Sequential 
from keras.layers.core import Dense, Activation 
from keras.optimizers import SGD 
from keras.utils import np_utils 
# Baseline experiment: train a two-hidden-layer dense network on MNIST with
# SGD and report loss/accuracy on the held-out test set.
np.random.seed(1671)  # for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; the Keras backend may keep
# its own random state -- confirm if run-to-run identical results are required.

# --- network and training configuration ---
NB_EPOCH = 20
BATCH_SIZE = 128
VERBOSE = 1
NB_CLASSES = 10  # number of outputs = number of digits
OPTIMIZER = SGD()  # optimizer instance handed to model.compile() below
N_HIDDEN = 128  # units in each of the two hidden Dense layers
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for validation
RESHAPED = 784  # 28 * 28 values per image once flattened

# --- data: split between train and test by the loader ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a row of RESHAPED values. Passing -1 lets
# NumPy infer the row count from the array itself rather than hard-coding
# 60000 / 10000, so the cell keeps working if the dataset is subsampled.
X_train = X_train.reshape(-1, RESHAPED).astype('float32')
X_test = X_test.reshape(-1, RESHAPED).astype('float32')

# normalize (the arrays are float32 by now, so in-place division is safe)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# two hidden layers of N_HIDDEN units each, with ReLU activations
# NB_CLASSES outputs
# final stage is softmax
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train, then evaluate on the test set ---
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)
# score[0] is the loss; score[1] is the 'accuracy' metric requested in compile()
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
60000 train samples
10000 test samples
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               100480    
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
_________________________________________________________________
activation_3 (Activation)    (None, 10)                0         
Total param

# Change 1 - Reducing nodes to a quarter of original (32 nodes)

In [1]:
from __future__ import print_function 
import numpy as np 
from keras.datasets import mnist 
from keras.models import Sequential 
from keras.layers.core import Dense, Activation 
from keras.optimizers import SGD 
from keras.utils import np_utils 
# Change 1 experiment: same network and training setup, but with 32 units in
# each hidden layer; reports loss/accuracy on the held-out test set.
np.random.seed(1671)  # for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; the Keras backend may keep
# its own random state -- confirm if run-to-run identical results are required.

# --- network and training configuration ---
NB_EPOCH = 20
BATCH_SIZE = 128
VERBOSE = 1
NB_CLASSES = 10  # number of outputs = number of digits
OPTIMIZER = SGD()  # optimizer instance handed to model.compile() below
N_HIDDEN = 32  # units in each of the two hidden Dense layers
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for validation
RESHAPED = 784  # 28 * 28 values per image once flattened

# --- data: split between train and test by the loader ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a row of RESHAPED values. Passing -1 lets
# NumPy infer the row count from the array itself rather than hard-coding
# 60000 / 10000, so the cell keeps working if the dataset is subsampled.
X_train = X_train.reshape(-1, RESHAPED).astype('float32')
X_test = X_test.reshape(-1, RESHAPED).astype('float32')

# normalize (the arrays are float32 by now, so in-place division is safe)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# two hidden layers of N_HIDDEN units each, with ReLU activations
# NB_CLASSES outputs
# final stage is softmax
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train, then evaluate on the test set ---
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)
# score[0] is the loss; score[1] is the 'accuracy' metric requested in compile()
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
60000 train samples
10000 test samples
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                25120     
_________________________________________________________________
activation_1 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
activation_2 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                330       
_________________________________________________________________
activation_3 (Activation)    (None, 10)                0         
Total param

# Change 2 - reducing nodes to a quarter again (8 nodes)

In [2]:
from __future__ import print_function 
import numpy as np 
from keras.datasets import mnist 
from keras.models import Sequential 
from keras.layers.core import Dense, Activation 
from keras.optimizers import SGD 
from keras.utils import np_utils 
# Change 2 experiment: same network and training setup, but with 8 units in
# each hidden layer; reports loss/accuracy on the held-out test set.
np.random.seed(1671)  # for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; the Keras backend may keep
# its own random state -- confirm if run-to-run identical results are required.

# --- network and training configuration ---
NB_EPOCH = 20
BATCH_SIZE = 128
VERBOSE = 1
NB_CLASSES = 10  # number of outputs = number of digits
OPTIMIZER = SGD()  # optimizer instance handed to model.compile() below
N_HIDDEN = 8  # units in each of the two hidden Dense layers
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for validation
RESHAPED = 784  # 28 * 28 values per image once flattened

# --- data: split between train and test by the loader ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a row of RESHAPED values. Passing -1 lets
# NumPy infer the row count from the array itself rather than hard-coding
# 60000 / 10000, so the cell keeps working if the dataset is subsampled.
X_train = X_train.reshape(-1, RESHAPED).astype('float32')
X_test = X_test.reshape(-1, RESHAPED).astype('float32')

# normalize (the arrays are float32 by now, so in-place division is safe)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# two hidden layers of N_HIDDEN units each, with ReLU activations
# NB_CLASSES outputs
# final stage is softmax
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train, then evaluate on the test set ---
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)
# score[0] is the loss; score[1] is the 'accuracy' metric requested in compile()
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

60000 train samples
10000 test samples
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 8)                 6280      
_________________________________________________________________
activation_4 (Activation)    (None, 8)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 72        
_________________________________________________________________
activation_5 (Activation)    (None, 8)                 0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                90        
_________________________________________________________________
activation_6 (Activation)    (None, 10)                0         
Total params: 6,442
Trainable params: 6,442
Non-trainable params: 0
_____________



Epoch 20/20
Test score: 0.34400188852548597
Test accuracy: 0.8988999724388123


# Change 3 - 4x the original number of nodes (512 nodes)

In [3]:
from __future__ import print_function 
import numpy as np 
from keras.datasets import mnist 
from keras.models import Sequential 
from keras.layers.core import Dense, Activation 
from keras.optimizers import SGD 
from keras.utils import np_utils 
# Change 3 experiment: same network and training setup, but with 512 units in
# each hidden layer; reports loss/accuracy on the held-out test set.
np.random.seed(1671)  # for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; the Keras backend may keep
# its own random state -- confirm if run-to-run identical results are required.

# --- network and training configuration ---
NB_EPOCH = 20
BATCH_SIZE = 128
VERBOSE = 1
NB_CLASSES = 10  # number of outputs = number of digits
OPTIMIZER = SGD()  # optimizer instance handed to model.compile() below
N_HIDDEN = 512  # units in each of the two hidden Dense layers
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for validation
RESHAPED = 784  # 28 * 28 values per image once flattened

# --- data: split between train and test by the loader ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a row of RESHAPED values. Passing -1 lets
# NumPy infer the row count from the array itself rather than hard-coding
# 60000 / 10000, so the cell keeps working if the dataset is subsampled.
X_train = X_train.reshape(-1, RESHAPED).astype('float32')
X_test = X_test.reshape(-1, RESHAPED).astype('float32')

# normalize (the arrays are float32 by now, so in-place division is safe)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# two hidden layers of N_HIDDEN units each, with ReLU activations
# NB_CLASSES outputs
# final stage is softmax
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train, then evaluate on the test set ---
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)
# score[0] is the loss; score[1] is the 'accuracy' metric requested in compile()
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

60000 train samples
10000 test samples
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 512)               401920    
_________________________________________________________________
activation_7 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_8 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                5130      
_________________________________________________________________
activation_9 (Activation)    (None, 10)                0         
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________



Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 0.17235374244600535
Test accuracy: 0.9509000182151794


## Summary

### Original Accuracy: 94.63%

### Change 1 Accuracy: 93.36%

### Change 2 Accuracy: 89.89%

### Change 3 Accuracy: 95.09%


As the accuracy rates show, lowering the number of nodes (hidden neurons) decreases the accuracy, whereas increasing them increases the accuracy on the test set. However, the accuracy wasn't impacted significantly until the number of neurons was drastically reduced, which suggests that the dataset isn't very complex and that the network likely didn't need as many neurons as it initially had. One thing I did notice was that the training time went up hugely when the number of nodes was raised to 4 times the original amount: where the original took about 2 seconds per epoch, the 512-node network took about 12-30 seconds per epoch. It took ~10 times as long, but accuracy increased by less than half a percentage point. Was it worth it?

One thing I've read that can significantly increase the accuracy is changing the optimizer. One of the optimizers I was reading about is called Adam (the runs above used SGD). I ran a test with it; the results are below.

# Change 4 - Optimizer to Adam

In [2]:
from __future__ import print_function 
import numpy as np 
from keras.datasets import mnist 
from keras.models import Sequential 
from keras.layers.core import Dense, Activation 
from keras.optimizers import Adam 
from keras.utils import np_utils 
# Change 4 experiment: the 128-unit network again, but compiled with the Adam
# optimizer instead of SGD; reports loss/accuracy on the held-out test set.
np.random.seed(1671)  # for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; the Keras backend may keep
# its own random state -- confirm if run-to-run identical results are required.

# --- network and training configuration ---
NB_EPOCH = 20
BATCH_SIZE = 128
VERBOSE = 1
NB_CLASSES = 10  # number of outputs = number of digits
OPTIMIZER = Adam()  # optimizer instance handed to model.compile() below
N_HIDDEN = 128  # units in each of the two hidden Dense layers
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for validation
RESHAPED = 784  # 28 * 28 values per image once flattened

# --- data: split between train and test by the loader ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a row of RESHAPED values. Passing -1 lets
# NumPy infer the row count from the array itself rather than hard-coding
# 60000 / 10000, so the cell keeps working if the dataset is subsampled.
X_train = X_train.reshape(-1, RESHAPED).astype('float32')
X_test = X_test.reshape(-1, RESHAPED).astype('float32')

# normalize (the arrays are float32 by now, so in-place division is safe)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# two hidden layers of N_HIDDEN units each, with ReLU activations
# NB_CLASSES outputs
# final stage is softmax
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train, then evaluate on the test set ---
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)
# score[0] is the loss; score[1] is the 'accuracy' metric requested in compile()
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
60000 train samples
10000 test samples
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               100480    
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
_________________________________________________________________
activation_3 (Activation)    (None, 10)                0         
Total param

## Change 4 Results
With the optimizer switched from SGD to Adam as the only change from the original, we got a score more than 3% higher than the original run. Training did take a bit longer, though, so that should be taken into account.