## Keras and Neural Networks

## Preliminaries

### Imports

In [2]:
import pickle
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

%matplotlib inline

import sys
sys.path.append("../..")
import E4525_ML.mnist as mnist
from E4525_ML.notebook_utils import get_logger,LoggingCallback

In [3]:
logger=get_logger("Homework7")

In [4]:
raw_data_dir="data"

In [5]:
images_filename=raw_data_dir+"/train-images-idx3-ubyte.gz"
labels_filename=raw_data_dir+"/train-labels-idx1-ubyte.gz"

test_images_filename=raw_data_dir+"/t10k-images-idx3-ubyte.gz"
test_labels_filename=raw_data_dir+"/t10k-labels-idx1-ubyte.gz"

images=mnist.read_images(images_filename)
labels=mnist.read_labels(labels_filename)

test_images=mnist.read_images(test_images_filename)
test_labels=mnist.read_labels(test_labels_filename)
    
print(images.shape,labels.shape,test_images.shape,test_labels.shape)

(60000, 28, 28) (60000,) (10000, 28, 28) (10000,)


In [30]:
X_train,X_val,Y_train,Y_val=train_test_split(images,labels,test_size=0.2,)

print(X_train.shape,X_val.shape,Y_train.shape,Y_val.shape)

(48000, 28, 28) (12000, 28, 28) (48000,) (12000,)


In [31]:
def build_model(hidden,nodes):
    model=keras.models.Sequential()
    model.add(keras.layers.Flatten(input_shape=(28,28)))
    for i in range (0,hidden):
        model.add(keras.layers.Dense(nodes,activation='relu'))
    model.add(keras.layers.Dense(10,activation='softmax'))
    return model

In [32]:
model=build_model(2,32)
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [33]:
model.summary()
# In each layer, there are respectively 25120, 1056 and 1056 trainable parameters.
# There are 26 506 parameters in total 

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_13 (Flatten)         (None, 784)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 32)                25120     
_________________________________________________________________
dense_23 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_24 (Dense)             (None, 10)                330       
Total params: 26,506
Trainable params: 26,506
Non-trainable params: 0
_________________________________________________________________


In [34]:
result=model.fit(X_train,Y_train,batch_size=128,validation_data=(X_val,Y_val),
                 epochs=150,verbose=0,callbacks=[LoggingCallback(10,logger)])

2019-12-09 10:23:56|	0: TRAIN loss 0.7301,  acc 0.7454  ||  VAL loss 0.4929, acc 0.8294
2019-12-09 10:24:02|	10: TRAIN loss 0.3078,  acc 0.8882  ||  VAL loss 0.3705, acc 0.8693
2019-12-09 10:24:07|	20: TRAIN loss 0.2625,  acc 0.9038  ||  VAL loss 0.3489, acc 0.8772
2019-12-09 10:24:13|	30: TRAIN loss 0.2340,  acc 0.9142  ||  VAL loss 0.3499, acc 0.8788
2019-12-09 10:24:19|	40: TRAIN loss 0.2119,  acc 0.9219  ||  VAL loss 0.3635, acc 0.8808
2019-12-09 10:24:25|	50: TRAIN loss 0.1946,  acc 0.9285  ||  VAL loss 0.3804, acc 0.8759
2019-12-09 10:24:31|	60: TRAIN loss 0.1780,  acc 0.9346  ||  VAL loss 0.4034, acc 0.8782
2019-12-09 10:24:37|	70: TRAIN loss 0.1674,  acc 0.9390  ||  VAL loss 0.4605, acc 0.8717
2019-12-09 10:24:42|	80: TRAIN loss 0.1573,  acc 0.9419  ||  VAL loss 0.4664, acc 0.8683
2019-12-09 10:24:48|	90: TRAIN loss 0.1456,  acc 0.9464  ||  VAL loss 0.4918, acc 0.8689
2019-12-09 10:24:54|	100: TRAIN loss 0.1439,  acc 0.9470  ||  VAL loss 0.4822, acc 0.8726
2019-12-09 10:25:00|	

In [35]:
# The accuracy of the model on the valuation set is 0.8695

In [36]:
results=[]
layers_array =[0,1,2,3,4,5]
nodes_array = [16,32,64,128]
for layers in layers_array:
    for nodes in nodes_array:
        model=build_model(layers,nodes)
        model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
        model.fit(X_train,Y_train,batch_size=128,validation_data=(X_val,Y_val),
                         epochs=150,verbose=0,callbacks=[LoggingCallback(10,logger)])
        Y_pred=np.argmax(model.predict(X_val),axis=1)
        acc=np.mean(Y_pred==Y_val)
        results.append((layers,nodes,acc))
        print(">>>>>",layers,nodes,acc,"<<<<<<<<<")

2019-12-09 10:25:39|	0: TRAIN loss 0.7863,  acc 0.7405  ||  VAL loss 0.5873, acc 0.8067
2019-12-09 10:25:43|	10: TRAIN loss 0.4048,  acc 0.8617  ||  VAL loss 0.4365, acc 0.8487
2019-12-09 10:25:48|	20: TRAIN loss 0.3813,  acc 0.8679  ||  VAL loss 0.4209, acc 0.8551
2019-12-09 10:25:52|	30: TRAIN loss 0.3689,  acc 0.8712  ||  VAL loss 0.4226, acc 0.8543
2019-12-09 10:25:57|	40: TRAIN loss 0.3629,  acc 0.8725  ||  VAL loss 0.4293, acc 0.8524
2019-12-09 10:26:01|	50: TRAIN loss 0.3581,  acc 0.8736  ||  VAL loss 0.4273, acc 0.8518
2019-12-09 10:26:06|	60: TRAIN loss 0.3525,  acc 0.8758  ||  VAL loss 0.4222, acc 0.8544
2019-12-09 10:26:10|	70: TRAIN loss 0.3502,  acc 0.8763  ||  VAL loss 0.4264, acc 0.8547
2019-12-09 10:26:15|	80: TRAIN loss 0.3472,  acc 0.8766  ||  VAL loss 0.4301, acc 0.8534
2019-12-09 10:26:19|	90: TRAIN loss 0.3454,  acc 0.8779  ||  VAL loss 0.4304, acc 0.8537
2019-12-09 10:26:24|	100: TRAIN loss 0.3424,  acc 0.8794  ||  VAL loss 0.4363, acc 0.8522
2019-12-09 10:26:28|	

In [43]:
results_1=np.array(results)
df_results=pd.DataFrame(results, columns=['layers','nodes','val_accuracy']) 


# Optimal Network
results[np.argmax(results_1[:,2])]
best_layers=results[np.argmax(results_1[:,2])][0]
best_nodes=results[np.argmax(results_1[:,2])][1]
print('Best layers: ',best_layers)
print('Best nodes: ',best_nodes)

df_results

Best layers:  2
Best nodes:  128


Unnamed: 0,layers,nodes,val_accuracy
0,0,16,0.848583
1,0,32,0.8475
2,0,64,0.8485
3,0,128,0.85025
4,1,16,0.85975
5,1,32,0.86875
6,1,64,0.8745
7,1,128,0.88025
8,2,16,0.865333
9,2,32,0.867167


The width of each layer seems to impact more the accuracy of the model. For a given width of 128, the model has nearly the same 
accuracy for 2 and 5 layers

In [49]:
model=build_model(best_layers,best_nodes)
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(images,labels,batch_size=128,validation_data=(test_images,test_labels),
                         epochs=150,verbose=0,callbacks=[LoggingCallback(10,logger)])

2019-12-09 11:53:44|	0: TRAIN loss 0.5437,  acc 0.8110  ||  VAL loss 0.4345, acc 0.8463
2019-12-09 11:53:57|	10: TRAIN loss 0.2339,  acc 0.9126  ||  VAL loss 0.3334, acc 0.8824
2019-12-09 11:54:09|	20: TRAIN loss 0.1691,  acc 0.9362  ||  VAL loss 0.3572, acc 0.8909
2019-12-09 11:54:21|	30: TRAIN loss 0.1254,  acc 0.9524  ||  VAL loss 0.3921, acc 0.8934
2019-12-09 11:54:34|	40: TRAIN loss 0.0948,  acc 0.9645  ||  VAL loss 0.4600, acc 0.8871
2019-12-09 11:54:47|	50: TRAIN loss 0.0738,  acc 0.9727  ||  VAL loss 0.5116, acc 0.8936
2019-12-09 11:54:59|	60: TRAIN loss 0.0675,  acc 0.9749  ||  VAL loss 0.6197, acc 0.8886
2019-12-09 11:55:12|	70: TRAIN loss 0.0567,  acc 0.9786  ||  VAL loss 0.6733, acc 0.8854
2019-12-09 11:55:25|	80: TRAIN loss 0.0435,  acc 0.9837  ||  VAL loss 0.7141, acc 0.8905
2019-12-09 11:55:38|	90: TRAIN loss 0.0392,  acc 0.9858  ||  VAL loss 0.8014, acc 0.8897
2019-12-09 11:55:50|	100: TRAIN loss 0.0278,  acc 0.9899  ||  VAL loss 0.8295, acc 0.8897
2019-12-09 11:56:03|	

<tensorflow.python.keras.callbacks.History at 0x1c4ad42be0>

The accuracy on the MNIST fashion test data is 0.8900

In [51]:
model.summary()


Model: "sequential_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_39 (Flatten)         (None, 784)               0         
_________________________________________________________________
dense_112 (Dense)            (None, 128)               100480    
_________________________________________________________________
dense_113 (Dense)            (None, 128)               16512     
_________________________________________________________________
dense_114 (Dense)            (None, 10)                1290      
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0
_________________________________________________________________


The number of parameters on the optimal network is 118,282