# Tuning NN parameters

Data from the Kaggle Digit Recognizer competition:  
https://www.kaggle.com/c/digit-recognizer/data

In [1]:
%matplotlib inline
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras import optimizers
from keras.callbacks import Callback, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import GridSearchCV, ParameterGrid

Using TensorFlow backend.


## Load data

In [2]:
# Print the working directory: the data paths used in this notebook are
# relative, so it has to be the project root.
!pwd

/Users/zubo/Dropbox/projects/kaggle/digit-recognizer


In [3]:
# Locations of the Kaggle CSV files, relative to the working directory.
train_file, test_file = 'data/train.csv', 'data/test.csv'

# Read both files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

In [4]:
# Peek at the first training rows: a `label` column followed by the pixel columns.
train_df.head() # labels and pixels

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Peek at the first test rows: pixel columns only.
test_df.head() # no labels here

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


So each row of the dataset holds the ink intensities of the 784 pixels that make up a square 28x28 image (the training set additionally has the label in its first column).

In [6]:
# Separate the target from the features: the pixel values are every column
# after the first one; the test frame contains pixels only.
train_images = train_df.iloc[:, 1:]
train_labels = train_df['label']
test_images = test_df

## Preprocess data for NN

In [7]:
# Scale the pixel intensities with ONE constant taken from the training data.
# Dividing each frame by its own per-column maxima would map the same raw
# intensity to different feature values in the train and the test set, so the
# model would be evaluated on inputs scaled differently from its training data.
pixel_max = train_images.values.max()
train_images = (train_images / pixel_max).fillna(0)  # fillna guards against 0/0
test_images = (test_images / pixel_max).fillna(0)
train_labels = pd.get_dummies(train_labels)  # one-hot encoding of the label

In [8]:
# Preview the encoded labels: one indicator column per digit class.
train_labels.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0


## Tuning hyperparameters for 1-hidden layer dense NN
Some more information about Keras' Sequential models  
https://keras.io/getting-started/sequential-model-guide/

There are different ways to go through a selected set of hyperparameters.  
One of them is to wrap the Keras model with the Scikit-learn wrapper and then use Scikit-learn's model-selection tools.  

Pros: there are convenient ready-made functions, for example a grid search that includes cross-validation.  
Cons: there is no way (at the moment) to get the history objects (to plot loss/accuracy vs. epoch) for the tuned models.

For this tutorial, let's tune the number of nodes in the hidden layer and the learning rate of the RMSprop optimizer.

In [9]:
# Training settings shared by every candidate model.
batch_size = 64
epochs = 15

# Search space of the grid search.
lrs = [0.001, 0.002, 0.003]  # learning rate, default = 0.001
nodes = [32, 64, 128, 256, 512]  # number of nodes in the hidden layer

In order to use Scikit-learn's grid search, we need to define a function that builds the model. The function should take the tuning parameters as input and return the model as output.

In [10]:
def build_model(nodes=10, lr=0.001):
    """Create and compile a dense network with one hidden layer.

    Parameters
    ----------
    nodes : int
        Number of units in the hidden layer.
    lr : float
        Learning rate handed to the RMSprop optimizer.

    Returns
    -------
    The compiled Sequential model: 784 inputs -> `nodes` ReLU units ->
    10-way softmax, trained with categorical cross-entropy.
    """
    network = Sequential([
        Dense(nodes, kernel_initializer='uniform', input_dim=784),
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.RMSprop(lr=lr),
        metrics=['accuracy'],
    )
    return network

In [11]:
# Plain NumPy arrays of the features and the one-hot targets.
X, Y = train_images.values, train_labels.values

# Scikit-learn compatible wrapper around build_model; the training settings
# are fixed here, the tuned parameters are supplied by the grid search.
model = KerasClassifier(
    build_fn=build_model,
    epochs=epochs,
    batch_size=batch_size,
    verbose=0,
)

In [12]:
# Candidate values for each argument of build_model.
param_grid = {'nodes': nodes, 'lr': lrs}
param_grid

{'lr': [0.001, 0.002, 0.003], 'nodes': [32, 64, 128, 256, 512]}

_refit_=True retrains the best model on the whole training dataset once the search has finished.  
Also, I'm using _verbose_=2 to see how the process goes and to estimate the time needed.

In [13]:
# Exhaustive search over param_grid with 3-fold cross-validation, run in a
# single process and reporting progress while it works.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    refit=True,
    n_jobs=1,
    verbose=2,
)

In [14]:
# Run the whole grid search (the recorded run below took about 17 minutes).
grid_result = grid.fit(X, Y)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] lr=0.001, nodes=32 ..............................................
[CV] ............................... lr=0.001, nodes=32, total=  10.9s
[CV] lr=0.001, nodes=32 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.4s remaining:    0.0s


[CV] ............................... lr=0.001, nodes=32, total=  11.2s
[CV] lr=0.001, nodes=32 ..............................................
[CV] ............................... lr=0.001, nodes=32, total=  12.3s
[CV] lr=0.001, nodes=64 ..............................................
[CV] ............................... lr=0.001, nodes=64, total=  16.4s
[CV] lr=0.001, nodes=64 ..............................................
[CV] ............................... lr=0.001, nodes=64, total=  14.1s
[CV] lr=0.001, nodes=64 ..............................................
[CV] ............................... lr=0.001, nodes=64, total=  15.7s
[CV] lr=0.001, nodes=128 .............................................
[CV] .............................. lr=0.001, nodes=128, total=  16.4s
[CV] lr=0.001, nodes=128 .............................................
[CV] .............................. lr=0.001, nodes=128, total=  16.8s
[CV] lr=0.001, nodes=128 .............................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 16.9min finished


In [15]:
# Report the winning configuration, then the mean and the standard deviation
# of the cross-validated score of every candidate.
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.975048 using {'lr': 0.002, 'nodes': 256}
0.955571 (0.001869) with: {'lr': 0.001, 'nodes': 32}
0.965476 (0.001058) with: {'lr': 0.001, 'nodes': 64}
0.970357 (0.001303) with: {'lr': 0.001, 'nodes': 128}
0.972357 (0.000964) with: {'lr': 0.001, 'nodes': 256}
0.974714 (0.000802) with: {'lr': 0.001, 'nodes': 512}
0.956571 (0.003364) with: {'lr': 0.002, 'nodes': 32}
0.964595 (0.000642) with: {'lr': 0.002, 'nodes': 64}
0.970762 (0.000844) with: {'lr': 0.002, 'nodes': 128}
0.975048 (0.000994) with: {'lr': 0.002, 'nodes': 256}
0.974738 (0.000496) with: {'lr': 0.002, 'nodes': 512}
0.958071 (0.000745) with: {'lr': 0.003, 'nodes': 32}
0.967024 (0.002577) with: {'lr': 0.003, 'nodes': 64}
0.970881 (0.001058) with: {'lr': 0.003, 'nodes': 128}
0.973357 (0.001325) with: {'lr': 0.003, 'nodes': 256}
0.974810 (0.001687) with: {'lr': 0.003, 'nodes': 512}


In [16]:
# Keep a shorter handle on the full cross-validation results.
acc = grid_result.cv_results_

In [18]:
# pd.DataFrame({'lr': [x['lr'] for x in list(acc['params'])],
#              'nodes': [x['nodes'] for x in list(acc['params'])]})

## Predict test labels

In [19]:
# Predict the digits of the test images with the fitted grid search
# (it was created with refit=True).
pred_classes = grid.predict(test_images.values)

In [20]:
# Inspect the predicted labels.
pred_classes

array([2, 0, 9, ..., 3, 9, 2])

In [21]:
# Submission table: 1-based image ids next to the predicted digits.
image_ids = range(1, len(pred_classes) + 1)
pred = pd.DataFrame({'ImageId': image_ids, 'Label': pred_classes})

In [22]:
# First rows of the submission table.
pred.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [23]:
# Last rows of the submission table: the final ImageId should equal the
# number of test images.
pred.tail()

Unnamed: 0,ImageId,Label
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9
27999,28000,2


In [24]:
# Write the submission file. to_csv does not create missing directories, so
# make sure the output folder exists first (e.g. on a fresh checkout).
submission_path = 'data/output/subm10-256-lr002.csv'
output_dir = os.path.dirname(submission_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
pred.to_csv(submission_path, index=False)

## Experimental code to save history of every try in the grid search and cross-validation

In [84]:
class LossHistory(Callback):
    """Experimental callback that records the loss of every epoch of every fit.

    The same instance can be passed to several successive fits: ``cv_run``
    counts the fits started so far and ``losses`` accumulates one record
    (a dict) per finished epoch, so ``pd.DataFrame(history.losses)`` gives a
    table of the whole history.
    """

    # Class-level defaults so the attributes exist before the first fit.
    cv_run = None  # number of fits started so far (None until the first one)
    losses = None  # list of per-epoch records, created lazily
    nodes = None   # hidden-layer width of the model currently being fitted
    lr = None      # learning rate of the model currently being fitted

    def on_train_begin(self, logs=None):
        """Advance the run counter and note the settings of the current model."""
        # detect what run
        if self.cv_run is None:
            self.cv_run = 1
        else:
            self.cv_run += 1
        if self.losses is None:
            self.losses = []
        print('on_train_begin')
        print(self.params)
        # Find the hidden layer by its `units` attribute instead of a fixed
        # index, so layers without units (e.g. Activation) are skipped
        # regardless of how the model numbers its layers.
        self.nodes = next(
            (layer.units for layer in self.model.layers if hasattr(layer, 'units')),
            None)
        self.lr = self.model.optimizer.get_config()['lr']
        print('nodes: {}, lr: {}'.format(self.nodes, self.lr))

    def on_epoch_end(self, epoch, logs=None):
        """Store the loss reported for `epoch` together with the run settings."""
        logs = logs or {}
        if self.losses is None:
            self.losses = []
        self.losses.append({
            'cv_run': self.cv_run,
            'nodes': self.nodes,
            'lr': self.lr,
            'epoch': epoch,
            'loss': logs.get('loss'),
        })
        print('on_epoch_end')

history = LossHistory()

