## Load data

In [1]:
from keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Scale the data to between 0 & 1

In [2]:
X_train = X_train/ 255
X_test = X_test/ 255
print(X_train.min(),X_train.max())

0.0 1.0


In [3]:
y_test[0:10]

array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9], dtype=uint8)

In [4]:
X_train.shape

(60000, 28, 28)

Need to Flatten the Arrays

In [5]:
X_train = X_train.reshape(60000,784)
X_test = X_test.reshape(10000,784)

Convert the y's to used with softmax function

In [6]:
from keras.utils import np_utils
y_train = np_utils.to_categorical(y_train, 10)
y_test = np_utils.to_categorical(y_test, 10)

In [7]:
y_train.shape

(60000, 10)

In [8]:
input_shape= X_train[0].shape
print(input_shape)

(784,)


## Checking our result against a baseline
A simple 2 layer neural net with hardly any hyperparameter tuning


In [9]:
# the usual imports for a vanilla nueral net
import keras
from keras.models import Sequential
from keras.layers import Dense

In [10]:
model =Sequential()
model.add(Dense(16, input_shape=input_shape, activation='relu',name = 'input_layer'))
model.add(Dense(16, activation='relu', name="hidden_layer"))

model.add(Dense(10,activation='softmax',name="output_layer"))
model.compile(optimizer = 'adam', loss='categorical_crossentropy', metrics=["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (Dense)          (None, 16)                12560     
_________________________________________________________________
hidden_layer (Dense)         (None, 16)                272       
_________________________________________________________________
output_layer (Dense)         (None, 10)                170       
Total params: 13,002
Trainable params: 13,002
Non-trainable params: 0
_________________________________________________________________


In [11]:
blackbox = model.fit(X_train, y_train, batch_size=128, epochs =3, validation_split=.15)

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
accuracy = model.evaluate(X_test,y_test)[1]
print(accuracy)

0.9283


## Using Skopt (scikit-optimize)

In [9]:
#imports we know we'll need
import skopt
# !pip install scikit-optimize if  necessary
from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer

Creating our search parameters.
"dim_" short for dimension. Its' just a way to label our parameters.

We can search across nearly every param in a keras model. 
This code focuses on: 
* Number of Layers
* Number of Nodes per layer
* Learning Rate & Weight Decay for the Adam Optimizer
* activation functions
* batch size

The name feature allows us to use the `@use_named_args` decorator.
We must also establish default parameters. 

In [10]:
dim_learning_rate = Real(low=1e-4, high=1e-2, prior='log-uniform',
                         name='learning_rate')
dim_num_dense_layers = Integer(low=1, high=5, name='num_dense_layers')
dim_num_input_nodes = Integer(low=1, high=512, name='num_input_nodes')
dim_num_dense_nodes = Integer(low=1, high=28, name='num_dense_nodes')
dim_activation = Categorical(categories=['relu', 'sigmoid'],
                             name='activation')
dim_batch_size = Integer(low=1, high=128, name='batch_size')
dim_adam_decay = Real(low=1e-6,high=1e-2,name="adam_decay")

dimensions = [dim_learning_rate,
              dim_num_dense_layers,
              dim_num_input_nodes,
              dim_num_dense_nodes,
              dim_activation,
              dim_batch_size,
              dim_adam_decay
             ]
default_parameters = [1e-3, 1,512, 13, 'relu',64, 1e-3]

In [11]:
# the usual imports for a vanilla nueral net
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow
from tensorflow.python.keras import backend as K

Here we create our model which creates other models.
We import the Adam optimizer to get the ability to adjust its learning rate and decay.


In [12]:
from keras.optimizers import Adam
def create_model(learning_rate, num_dense_layers,num_input_nodes,
                 num_dense_nodes, activation, adam_decay):
    #start the model making process and create our first layer
    model = Sequential()
    model.add(Dense(num_input_nodes, input_shape= input_shape, activation=activation
                   ))
    #create a loop making a new dense layer for the amount passed to this model.
    #naming the layers helps avoid tensorflow error deep in the stack trace.
    for i in range(num_dense_layers):
        name = 'layer_dense_{0}'.format(i+1)
        model.add(Dense(num_dense_nodes,
                 activation=activation,
                        name=name
                 ))
    #add our classification layer.
    model.add(Dense(10,activation='softmax'))
    
    #setup our optimizer and compile
    adam = Adam(lr=learning_rate, decay= adam_decay)
    model.compile(optimizer=adam, loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

In [33]:
@use_named_args(dimensions=dimensions)
def fitness(learning_rate, num_dense_layers, num_input_nodes, 
            num_dense_nodes,activation, batch_size,adam_decay):

    model = create_model(learning_rate=learning_rate,
                         num_dense_layers=num_dense_layers,
                         num_input_nodes=num_input_nodes,
                         num_dense_nodes=num_dense_nodes,
                         activation=activation,
                         adam_decay=adam_decay
                        )
    

    #named blackbox becuase it represents the structure
    blackbox = model.fit(x=X_train,
                        y=y_train,
                        epochs=3,
                        batch_size=batch_size,
                        validation_split=0.15,
                        )
    #return the validation accuracy for the last epoch.
    accuracy = blackbox.history['val_acc'][-1]

    # Print the classification accuracy.
    print()
    print("Accuracy: {0:.2%}".format(accuracy))
    print()


    # Delete the Keras model with these hyper-parameters from memory.
    del model
    
    # Clear the Keras session, otherwise it will keep adding new
    # models to the same TensorFlow graph each time we create
    # a model with a different set of hyper-parameters.
    K.clear_session()
    tensorflow.reset_default_graph()
    
    return -accuracy

## It is always a good idea to run the code below everytime you restart the search function or the notebook

The two lines of code below allows us to restart using tensorflow. Otherwise some stuff under the hood will cause problems. 

In [34]:
K.clear_session()
tensorflow.reset_default_graph()

### Hyper parameters for our hyper parameter search
For the Gaussian Project search, I have added a few extra parameters to try to improve the search.

In [35]:
gp_result = gp_minimize(func=fitness,
                            dimensions=dimensions,
                            n_calls=12,
                            noise= 0.01,
                            n_jobs=-1,
                            kappa = 5,
                            x0=default_parameters)

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.19%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.17%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.18%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.20%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.19%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.23%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.22%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.22%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.21%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.20%

Train on 51000 sampl

## Find our best accuracy

In [42]:
print("best accuracy was " + str(round(gp_result.fun *-100,2))+"%.")

best accuracy was 98.23%.


### returning the parameters for the best function

In [43]:
gp_result.x

[0.007141340424839902, 4, 7, 4, 'sigmoid', 2, 0.00537784574467934]

In [44]:
gp_result.func_vals

array([-0.98188889, -0.98166667, -0.98177778, -0.98199999, -0.98188889,
       -0.98233333, -0.98222223, -0.98222223, -0.9821111 , -0.982     ,
       -0.98211111, -0.982     ])

### Let's see all the models our search function tried

In [66]:
import pandas as pd
pd.concat([pd.DataFrame(gp_result.x_iters, columns = ["learning rate","hidden layers","input layer nodes","hidden layer nodes",
                                           "activation function","batch size","adam learning rate decay"]),
(pd.Series(gp_result.func_vals*-100, name="accuracy"))], axis=1)

Unnamed: 0,learning rate,hidden layers,input layer nodes,hidden layer nodes,activation function,batch size,adam learning rate decay,accuracy
0,0.001,1,512,13,relu,64,0.001,98.188889
1,0.000218,2,23,8,sigmoid,63,0.000509,98.166667
2,0.001863,2,216,27,relu,92,0.00164,98.177778
3,0.001322,2,197,16,relu,112,0.001616,98.199999
4,0.000237,3,148,22,relu,38,0.001929,98.188889
5,0.007141,4,7,4,sigmoid,2,0.005378,98.233333
6,0.002599,3,115,3,relu,122,0.002137,98.222223
7,0.009535,2,85,2,sigmoid,109,0.005409,98.222223
8,0.001055,4,403,4,sigmoid,70,0.008651,98.21111
9,0.000165,2,218,26,relu,28,0.000883,98.2


In [68]:
gp_result.x

[0.007141340424839902, 4, 7, 4, 'sigmoid', 2, 0.00537784574467934]

In [67]:
gp_model = create_model(gp_result.x[0],gp_result.x[1],gp_result.x[2],gp_result.x[3],gp_result.x[4],gp_result.x[5])
gp_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 7)                 5495      
_________________________________________________________________
layer_dense_1 (Dense)        (None, 4)                 32        
_________________________________________________________________
layer_dense_2 (Dense)        (None, 4)                 20        
_________________________________________________________________
layer_dense_3 (Dense)        (None, 4)                 20        
_________________________________________________________________
layer_dense_4 (Dense)        (None, 4)                 20        
_________________________________________________________________
dense_4 (Dense)              (None, 10)                50        
Total params: 5,637
Trainable params: 5,637
Non-trainable params: 0
_________________________________________________________________


In [61]:
#retrain our best model architecture
gp_model.fit(X_train,y_train, epochs=3)
gp_model.evaluate(X_test,y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.07845962716488976, 0.9809]

## Trying a gradient boosted search with a simpler model

In [63]:
K.clear_session()
tensorflow.reset_default_graph()

In [48]:
gbrt_result = gbrt_minimize(func=fitness,
                            dimensions=dimensions,
                            n_calls=12,
                            n_jobs=-1,
                            x0=default_parameters)

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.20%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.23%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.18%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.20%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.19%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.22%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.19%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.18%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.18%

Train on 51000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Accuracy: 98.18%

Train on 51000 sampl

In [49]:
print("best accuracy was " + str(round(gbrt_result.fun *100,2))+"%.")

best accuracy was -98.23%.


In [65]:
pd.concat([pd.DataFrame(gbrt_result.x_iters, columns = ["learning rate","hidden layers","input layer nodes","hidden layer nodes",
                                           "activation function","batch size","adam learning rate decay"]),
(pd.Series(gbrt_result.func_vals*-100, name="accuracy"))], axis=1)

Unnamed: 0,learning rate,hidden layers,input layer nodes,hidden layer nodes,activation function,batch size,adam learning rate decay,accuracy
0,0.001,1,512,13,relu,64,0.001,98.2
1,0.007515,1,395,16,relu,50,0.006402,98.233334
2,0.002703,2,67,18,sigmoid,52,0.005823,98.177777
3,0.002827,4,228,6,sigmoid,13,0.001937,98.2
4,0.000681,2,164,22,sigmoid,15,0.007204,98.188889
5,0.00074,3,224,8,sigmoid,8,0.009376,98.222222
6,0.002231,3,306,2,relu,82,0.00102,98.188889
7,0.000307,1,262,1,sigmoid,71,0.002139,98.177777
8,0.001296,2,460,6,sigmoid,26,0.009184,98.177777
9,0.005741,5,509,3,sigmoid,88,0.003993,98.177778


In [51]:
gbrt_result.x

[0.007514932641849276, 1, 395, 16, 'relu', 50, 0.0064019416793896135]

In [70]:
K.clear_session()
tensorflow.reset_default_graph()

In [64]:
#call our best model 
gbrt_model = create_model(gbrt_result.x[0],gbrt_result.x[1],gbrt_result.x[2],gbrt_result.x[3],gbrt_result.x[4],gbrt_result.x[5])
gbrt_model.summary()
#retrain our best model architecture
model.fit(X_train,y_train, epochs=3)
model.evaluate(X_test,y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.07832179739567255, 0.9813]

## While the two models had very different parameters, their accuracies were basically the same.

much thanks to this link for providing a framework for this code.
https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/19_Hyper-Parameters.ipynb
