In [1]:
# load the custom styling used to render this notebook
import os

# path : remember the current working directory so that a later
# cell can switch back to it with os.chdir(path)
path = os.getcwd()
# move into the sibling folder before importing, presumably because
# that is where the `formats` module lives
os.chdir('../notebook_format')
from formats import load_style
load_style()

In [2]:
# switch back to the working directory that was saved in `path`
os.chdir(path)

import numpy as np
from keras.utils import np_utils
from keras.optimizers import SGD
from keras.regularizers import l2
from keras.models import Sequential
from sklearn.grid_search import RandomizedSearchCV
from keras.layers.advanced_activations import PReLU
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization

Using Theano backend.


# Keras Hyperparameter Tuning

We'll use the [Pima Indians diabetes dataset](http://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes). This is a small dataset with all numerical attributes that is easy to work with. The output column is the last column, which holds a binary label of 0 or 1. There'll be zero feature engineering or interpretation of the features, as the focus here is on getting familiar with Keras's API.

In [3]:
# example dataset
dataset = np.loadtxt( 'pima-indians-diabetes.csv', delimiter = ',' )
print(dataset.shape)
print()
print(dataset)

# split into input (X) and output (y) variables: the label is the last
# column and every column before it is a feature, slicing relative to
# the end avoids hard-coding the number of columns of the file
X = dataset[ :, :-1 ]
y = dataset[ :, -1 ]

# make sure you get the number of features and classes correct
# and also one-hot encode the class
feature_num = X.shape[1]
class_num = np.unique(y).shape[0]
y_encode = np_utils.to_categorical( y, class_num )

(768, 9)

[[   6.     148.      72.    ...,    0.627   50.       1.   ]
 [   1.      85.      66.    ...,    0.351   31.       0.   ]
 [   8.     183.      64.    ...,    0.672   32.       1.   ]
 ..., 
 [   5.     121.      72.    ...,    0.245   30.       0.   ]
 [   1.     126.      60.    ...,    0.349   47.       1.   ]
 [   1.      93.      70.    ...,    0.315   23.       0.   ]]


Keras provides a wrapper class `KerasClassifier` that allows us to use our deep learning models with scikit-learn. This is especially useful when you want to tune hyperparameters using scikit-learn's `RandomizedSearchCV` or `GridSearchCV`.

To use it, we first define a function that takes the arguments that we wish to tune. Inside the function, we define the network's structure as usual and compile it. The function is then passed to `KerasClassifier`'s `build_fn` parameter.

In [4]:
def create_model( hidden_layers = [ 64, 64, 64 ], dropout_rate = 0,
                  l2_penalty = 0.1, optimizer = 'adam' ):
    """
    Build and compile a fully connected network for classification.
    
    Fixed parts of the network include the activation function (PReLU)
    and it will always use batch normalization after the activation.
    The output layer is a Dense layer of size class_num followed by a
    softmax. Note that feature_num and class_num are global variables
    that are not defined inside the function
    
    Parameters
    ----------
    Tunable parameters are (commonly tuned)
    
    hidden_layers: list
        the size of each hidden layer, the length of the list is the
        number of hidden layers. An empty list gives a network that
        only consists of the softmax output layer
    
    dropout_rate: float 0 ~ 1
        if bigger than 0, there will be a dropout layer after
        every hidden layer
    
    l2_penalty: float
        or so called l2 regularization, applied to the weights
        of every hidden layer
    
    optimizer: string or keras optimizer
        method to train the network
    
    Returns
    -------
    model: keras Sequential model
        the compiled network
    """
    model = Sequential()
    
    # only the very first layer added to the model is told the number of
    # features; keeping it in a dict that is emptied after its first use
    # also covers the case where there are no hidden layers at all, then
    # the output layer is the first layer and receives the input_dim
    first_layer_args = { 'input_dim': feature_num }
    
    for layer_size in hidden_layers:
        model.add( Dense( layer_size, W_regularizer = l2(l2_penalty), **first_layer_args ) )
        first_layer_args = {}
        
        model.add( PReLU() )
        model.add( BatchNormalization() )
        if dropout_rate:
            model.add( Dropout( p = dropout_rate ) )
    
    model.add( Dense( class_num, **first_layer_args ) )
    model.add( Activation('softmax') )
    
    # the loss for binary and multi-class classification is different:
    # binary_crossentropy unless there are more than two classes
    loss = 'binary_crossentropy'
    if class_num > 2:
        loss = 'categorical_crossentropy'
    
    model.compile( loss = loss, optimizer = optimizer, metrics = ['accuracy'] )
    return model

In [6]:
# wrap the model building function so that scikit-learn can drive it,
# verbose = 0 turns the training log off
model = KerasClassifier( build_fn = create_model, nb_epoch = 15, batch_size = 64, verbose = 0 )

# candidate values for every argument of create_model that we wish to tune
hidden_layers_opts = [ [ 64, 64, 64 ], [ 128, 32, 32, 32, 32 ] ]
dropout_rate_opts  = [ 0, 0.2, 0.5 ]
l2_penalty_opts = [ 0.01, 0.1, 0.5 ]
sgd = SGD( lr = 0.1, momentum = 0.9, decay = 1e-6, nesterov = True )
optimizer_opts = [ 'adam', sgd ]

# the search expects a dictionary that maps each argument name
# to the list of values it can take
param_dict = dict(
    hidden_layers = hidden_layers_opts,
    dropout_rate = dropout_rate_opts,
    l2_penalty = l2_penalty_opts,
    optimizer = optimizer_opts
)

In [7]:
# for randomized search the dictionary that holds the possible parameter
# values is passed through `param_distributions`; `verbose` is kept at 1
# since 2 will print the class info for every cross validation, which is
# kind of too much
keras_cv = RandomizedSearchCV(
    param_distributions = param_dict,
    estimator = model,
    cv = 5,
    n_iter = 4,
    verbose = 1
)

# fit is left as the last expression so the fitted search object is displayed
keras_cv.fit( X, y_encode )

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  2.6min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x10f58a4a8>,
          fit_params={}, iid=True, n_iter=4, n_jobs=1,
          param_distributions={'optimizer': ['adam', <keras.optimizers.SGD object at 0x10f58a400>], 'l2_penalty': [0.01, 0.1, 0.5], 'hidden_layers': [[64, 64, 64], [128, 32, 32, 32, 32]], 'dropout_rate': [0, 0.2, 0.5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=1)

In [8]:
# mean and standard deviation of the cross validation score
# for every parameter combination that was tried
keras_cv.grid_scores_

[mean: 0.69010, std: 0.02811, params: {'l2_penalty': 0.01, 'optimizer': <keras.optimizers.SGD object at 0x10f58a400>, 'hidden_layers': [128, 32, 32, 32, 32], 'dropout_rate': 0.5},
 mean: 0.71094, std: 0.03656, params: {'l2_penalty': 0.01, 'optimizer': <keras.optimizers.SGD object at 0x10f58a400>, 'hidden_layers': [64, 64, 64], 'dropout_rate': 0.2},
 mean: 0.63932, std: 0.06504, params: {'l2_penalty': 0.1, 'optimizer': <keras.optimizers.SGD object at 0x10f58a400>, 'hidden_layers': [128, 32, 32, 32, 32], 'dropout_rate': 0},
 mean: 0.65625, std: 0.05126, params: {'l2_penalty': 0.1, 'optimizer': <keras.optimizers.SGD object at 0x10f58a400>, 'hidden_layers': [128, 32, 32, 32, 32], 'dropout_rate': 0.2}]

## Reference

- [Kaggle Otto Keras script](https://www.kaggle.com/daretoshare/otto-group-product-classification-challenge/keras)
- [Use Keras Deep Learning Models with Scikit-Learn in Python](http://machinelearningmastery.com/use-keras-deep-learning-models-scikit-learn-python/)