In [1]:
# code for loading the format for the notebook
import os

# path : store the current path to convert back to it later
path = os.getcwd()
os.chdir('../notebook_format')
from formats import load_style
load_style()

In [2]:
os.chdir(path)

import numpy as np
from keras.utils import np_utils
from keras.optimizers import SGD
from keras.regularizers import l2
from keras.models import Sequential
from sklearn.grid_search import RandomizedSearchCV
from keras.layers.advanced_activations import PReLU
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization

Using Theano backend.


# Keras Hyperparameter Tuning

We'll use MNIST dataset. The downloaded data is split into three parts, 55,000 data points of training data (mnist.train), 10,000 points of test data (mnist.test), and 5,000 points of validation data (mnist.validation).

Every part of the dataset contains the data and label and we can access them via .images and .labels. e.g. the training images are mnist.train.images and the train labels are mnist.train.labels (one-hot encoded).

In [3]:
# convenient one-liner to load the dataset
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets( "/tmp/data/", one_hot = True )

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [5]:
X_train = mnist.train.images
y_train = mnist.train.labels
print(X_train.shape)
print(y_train.shape)

n_input = X_train.shape[1]
n_class = y_train.shape[1]
print( 'feature num: ', n_input )
print( 'class num: ', n_class )

(55000, 784)
(55000, 10)
feature num:  784
class num:  10


Keras provides a wrapper class `KerasClassifier` that allows us to use our deep learning models with scikit-learn, this is especially useful when you want to tune hyperparameters using scikit-learn's `RandomizedSearchCV` or `GridSearchCV`. 

To use it, we first define a function that takes the arguments that we wish to tune, inside the function, you define the network's structure as usual and compile it. Then the function is passed to `KerasClassifier`'s `build_fn` parameter.

In [9]:
def create_model( hidden_layers = [ 64, 64, 64 ], dropout_rate = 0, 
                  l2_penalty = 0.1, optimizer = 'adam' ):
    """
    Fixed parameters in include the activation function and
    it will always uses batch normalization after the activation.
    note that n_input and n_class are global variables that
    are not defined inside the function
    
    Parameters
    ----------
    Tunable parameters are (commonly tuned)
    
    hidden_layers: list
        the number of hidden layers, and the size of each hidden layer
    
    dropout_rate: float 0 ~ 1
        if bigger than 0, there will be a dropout layer
    
    l2_penalty: float
        or so called l2 regularization
    
    optimizer: string or keras optimizer
        method to train the network
    """   
    model = Sequential()
    
    for index, layers in enumerate(hidden_layers):       
        if not index:
            # specify the input_dim to be the number of features for the first layer
            model.add( Dense( layers, input_dim = n_input, W_regularizer = l2(l2_penalty) ) )
        else:
            model.add( Dense( layers, W_regularizer = l2(l2_penalty) ) )
        
        # insert BatchNorm layer immediately after fully connected layers
        # and before activation layer
        model.add( BatchNormalization() )
        model.add( PReLU() )        
        if dropout_rate:
            model.add( Dropout( p = dropout_rate ) )
    
    model.add( Dense(n_class) )
    model.add( Activation('softmax') )
    
    # the loss for binary and muti-class classification is different 
    loss = 'binary_crossentropy'
    if n_class > 2:
        loss = 'categorical_crossentropy'
    
    model.compile( loss = loss, optimizer = optimizer, metrics = ['accuracy'] )   
    return model

In [10]:
# create model, note that verbose is turned off here
model = KerasClassifier( 
    build_fn = create_model, 
    nb_epoch = 20, 
    batch_size = 1024, 
    verbose = 0
)

# specify the options and store them inside the dictionary
sgd = SGD( lr = 0.1, decay = 1e-6, momentum = 0.9, nesterov = True )
optimizer_opts = [ 'adam', sgd ]
dropout_rate_opts  = [ 0, 0.2, 0.5 ]
hidden_layers_opts = [ [ 64, 64, 64 ], [ 256, 256 ] ]
l2_penalty_opts = [ 0.01, 0.1, 0.5 ]

param_dict = {
    'hidden_layers': hidden_layers_opts,
    'dropout_rate': dropout_rate_opts,  
    'l2_penalty': l2_penalty_opts,
    'optimizer': optimizer_opts
}

In [11]:
# 1. note that for randomized search, the parameter to pass the the dictionary that
# holds the possible parameter value is `param_distributions`
# 2. `verbose` 2 will print the class info for every cross validation, kind
# of too much
keras_cv = RandomizedSearchCV( 
    estimator = model, 
    param_distributions = param_dict, 
    n_iter = 4, 
    cv = 5,
    verbose = 1 
)
keras_cv.fit( X_train, y_train )

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  9.0min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x12afe0d30>,
          fit_params={}, iid=True, n_iter=4, n_jobs=1,
          param_distributions={'optimizer': ['adam', <keras.optimizers.SGD object at 0x12afe0cf8>], 'l2_penalty': [0.01, 0.1, 0.5], 'hidden_layers': [[64, 64, 64], [256, 256]], 'dropout_rate': [0, 0.2, 0.5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=1)

In [12]:
keras_cv.grid_scores_

[mean: 0.44036, std: 0.14376, params: {'optimizer': <keras.optimizers.SGD object at 0x12afe0cf8>, 'l2_penalty': 0.5, 'hidden_layers': [256, 256], 'dropout_rate': 0},
 mean: 0.94838, std: 0.00670, params: {'optimizer': <keras.optimizers.SGD object at 0x12afe0cf8>, 'l2_penalty': 0.01, 'hidden_layers': [64, 64, 64], 'dropout_rate': 0},
 mean: 0.84842, std: 0.03632, params: {'optimizer': <keras.optimizers.SGD object at 0x12afe0cf8>, 'l2_penalty': 0.1, 'hidden_layers': [256, 256], 'dropout_rate': 0},
 mean: 0.91502, std: 0.01026, params: {'optimizer': 'adam', 'l2_penalty': 0.5, 'hidden_layers': [64, 64, 64], 'dropout_rate': 0.5}]

## Reference

- [Kaggle Otto Keras script](https://www.kaggle.com/daretoshare/otto-group-product-classification-challenge/keras)
- [Use Keras Deep Learning Models with Scikit-Learn in Python](http://machinelearningmastery.com/use-keras-deep-learning-models-scikit-learn-python/)