# Techniques for trianing deep neural nets

In [1]:
import os
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf

## Creating our own DNNClassifier class

We want compatability with Scikit-Learn's `RandomizedSerchCV` class so we can perform hyperparameter tuning.  It must include the following:
* `__init__()` constructor
* `fit()` method that creates the graph, starts a session, and trains the model:
    * calls `_build_graph()` to build graph and save important operations
    * `_dnn()` method builds hidden layers with support for batch_normalization and dropout
    * support for early stopping if given validation sets.
    * Keeps the session open so the predictions can be made without saving and loading the model to disk. Can clost the session with `close_session()` method
* `predict_proba()` method to predict class probabilities
* `predict()` method calls `predict_proba()` and returns most probable class

In [44]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError

class DNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_hidden_layers=5, n_neurons=100, optimizer_class=tf.train.AdamOptimizer,
                 learning_rate=0.01, batch_size=20, activation=tf.nn.elu, 
                 initializer=tf.contrib.layers.variance_scaling_initializer(),
                 batch_norm_momentum=None, dropout_rate=None, random_state=None):
        """Initialize the DNNClassifier by storing all hyperparameters"""
        self.n_hidden_layers = n_hidden_layers
        self.n_neurons = n_neurons
        self.optimizer_class = optimizer_class
        self.learning_rate = learning_rate
        self.batch_size = batch_size 
        self.activation = activation
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.random_state = random_state
        self._session = None
        
    def _dnn(self, inputs):
        """Build the hidden layers with support for batch normalization and dropout"""
        for layer in range(self.n_hidden_layers):
            if self.dropout_rate:
                inputs = tf.layers.dropout(inputs, self.dropout_rate, training=self._training)
            inputs = tf.layers.dense(inputs, self.n_neurons, 
                                     kernel_initializer=self.initializer,
                                     name="hidden%d" % (layer + 1))
            if self.batch_norm_momentum:
                inputs = tf.layers.batch_normalization(inputs, momentum=self.batch_norm_momentum,
                                                      training=self._training)
            inputs = self.activation(inputs, name="hidden%d" % (layer + 1))
        return inputs
    
    def _build_graph(self, n_inputs, n_outputs):
        """Build the model"""
        if self.random_state is not None:
            tf.set_random_seed(self.random_state)
            np.random.seed(self.random_state)
            
        X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
        y = tf.placeholder(tf.int32, shape=(None), name="y")
        
        if self.batch_norm_momentum or self.dropout_rate:
            self._training = tf.placeholder_with_default(False, shape=(), name='training')
        else:
            self._training = None
        
        dnn_outputs = self._dnn(X)
        
        logits = tf.layers.dense(dnn_outputs, n_outputs, 
                                 kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                                 name="logits")
        Y_proba = tf.nn.softmax(logits, name="Y_proba")
        
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")
        
        optimizer = self.optimizer_class(learning_rate=self.learning_rate)
        training_op = optimizer.minimize(loss)
        
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
        
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        
        # make important operations available through instance variables
        self._X, self._y = X, y
        self._training_op, self._Y_proba = training_op, Y_proba
        self._loss, self._accuracy =  loss, accuracy
        self._init, self._saver = init, saver
            
    def close_session(self):
        if self._session:
            self._session.close()
    
    def _get_model_params(self):
        """Get all variable values (used for early stopping, faster than saving to disk)"""
        with self._graph.as_default():
            gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        return {gvar.op.name: value for gvar, value in zip(gvars, self._session.run(gvars))}
    
    def _restore_model_params(self, model_params):
        """Set all variables to given values (again for early stopping and faster than saving to disk)"""
        gvar_names = list(model_params.keys())
        assign_ops = {gvar_name: self._graph.get_operation_by_name(gvar_name + "/Assign")
                     for gvar_name in gvar_names}
        init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
        feed_dict = {init_values[gvar_name]: model_params[gvar_name] for gvar_name in gvar_names}
        self._session.run(assign_ops, feed_dict=feed_dict)
    
    def fit(self, X, y, n_epochs=100, X_valid=None, y_valid=None):
        """Fit the mode to the training set.  If validation set provided, implement early stopping"""
        self.close_session()
        
        n_inputs = X.shape[1]
        self.classes_ = np.unique(y)
        n_outputs = len(self.classes_)
        
        # Translate the labels vector to a vector of sorted class indices containing integers from 0 to n_outputs - 1
        self.class_to_index_ = {label: index for index, label in enumerate(self.classes_)}
        y = np.array([self.class_to_index_[label] for label in y], dtype=np.int32)
        
        self._graph = tf.Graph()
        with self._graph.as_default():
            self._build_graph(n_inputs, n_outputs)
            # extra ops for batch normalization
            extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        
        # early stopping params
        max_checks_without_progress = 20
        checks_without_progress = 0
        best_loss = np.infty
        best_params = None
        
        # training the model
        self._session = tf.Session(graph=self._graph)
        with self._session.as_default() as sess:
            self._init.run()
            for epoch in range(n_epochs):
                rnd_idx = np.random.permutation(len(X))
                for rnd_indices in np.array_split(rnd_idx, len(X) //self.batch_size):
                    X_batch, y_batch = X[rnd_indices], y[rnd_indices]
                    feed_dict={self._X: X_batch, self._y:y_batch}
                    if self._training is not None:
                        feed_dict[self._training] = True
                    sess.run(self._training_op, feed_dict=feed_dict)
                    if extra_update_ops:
                        sess.run(extra_update_ops, feed_dict=feed_dict)
                # implementing early stopping if validation data provided
                if X_valid is not None and y_valid is not None:
                    loss_val, acc_val = sess.run([self._loss, self._accuracy], 
                                                 feed_dict={self._X: X_valid, self._y: y_valid})
                    if loss_val < best_loss:
                        best_params = self._get_model_params()
                        best_loss = loss_val
                        checks_without_progress = 0
                    else:
                        checks_without_progress += 1
                    print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.3f}%".format(
                        epoch, loss_val, best_loss, acc_val * 100))
                    if checks_without_progress > max_checks_without_progress:
                        print("Early Stopping!")
                        break
                else:
                    loss_train, acc_train = sess.run([self._loss, self._accuracy],
                                                    feed_dict={self._X: X_batch, self._y: y_batch})
                    print("{}\tLast training batch loss: {:.6f}\tAccuracy: {:.3f}%".format(
                        epoch, loss_val, acc_train * 100))
            # If early stopping, rollback to best model found  
            if best_params:
                self._restore_model_params(best_params)
            return self
        
    def predict_proba(self, X):
        if not self._session:
            raise NotFittedError("This %s instance is not fitted yet" % self.__class__.__name__)
        with self._session.as_default() as sess:
            return self._Y_proba.eval(feed_dict={self._X: X})
        
    def predict(self, X):
        class_indices = np.argmax(self.predict_proba(X), axis=1)
        return np.array([[self.classes_[class_index]] for class_index in class_indices],np.int32)
    
    def save(self, path):
        self._saver.save(self._session, path)

In [6]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets(os.getcwd() + "/tmp/data/")

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting /Users/dhensle/Documents/dabbles-in-ML/oreilly/tmp/data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting /Users/dhensle/Documents/dabbles-in-ML/oreilly/tmp/data/train-labels-idx1-ubyte.gz
Extracting /Users/dhensle/Documents/dabbles-in-ML/oreilly/tmp/data/t10k-images-idx3-ubyte.gz
Extracting /Users/dhensle/Documents/dabbles-in-ML/oreilly/tmp/data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


We are going to train this model on only digits 0 to 4 and then use transfer learning later for digits 5 to 9

In [8]:
X_train1 = mnist.train.images[mnist.train.labels < 5]
y_train1 = mnist.train.labels[mnist.train.labels < 5]
X_valid1 = mnist.validation.images[mnist.validation.labels < 5]
y_valid1 = mnist.validation.labels[mnist.validation.labels < 5]
X_test1 = mnist.test.images[mnist.test.labels < 5]
y_test1 = mnist.test.labels[mnist.test.labels < 5]

In [29]:
dnn_clf = DNNClassifier()
dnn_clf.fit(X_train1, y_train1, n_epochs=200, X_valid= X_valid1, y_valid=y_valid1)

0	Validation loss: 0.181448	Best loss: 0.181448	Accuracy: 96.403%
1	Validation loss: 0.150555	Best loss: 0.150555	Accuracy: 95.113%
2	Validation loss: 0.105407	Best loss: 0.105407	Accuracy: 97.733%
3	Validation loss: 0.383666	Best loss: 0.105407	Accuracy: 94.410%
4	Validation loss: 0.183491	Best loss: 0.105407	Accuracy: 96.951%
5	Validation loss: 1.178350	Best loss: 0.105407	Accuracy: 39.601%
6	Validation loss: 1.326445	Best loss: 0.105407	Accuracy: 39.758%
7	Validation loss: 1.193772	Best loss: 0.105407	Accuracy: 41.908%
8	Validation loss: 1.197022	Best loss: 0.105407	Accuracy: 40.266%
9	Validation loss: 1.169166	Best loss: 0.105407	Accuracy: 40.227%
10	Validation loss: 1.141830	Best loss: 0.105407	Accuracy: 40.500%
11	Validation loss: 1.421717	Best loss: 0.105407	Accuracy: 40.188%
12	Validation loss: 1.185472	Best loss: 0.105407	Accuracy: 40.422%
13	Validation loss: 1.309775	Best loss: 0.105407	Accuracy: 40.344%
14	Validation loss: 1.214567	Best loss: 0.105407	Accuracy: 40.618%
15	Va

DNNClassifier(activation=<function elu at 0x122915730>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<function variance_scaling_initializer.<locals>._initializer at 0x1c2b10b950>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=None)

In [30]:
from sklearn.metrics import accuracy_score

y_pred = dnn_clf.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9801517805020432

Achieved 98% accuracy on the test set with the default parameters coded above (again, still digits 0 to 4). Can we do better with different hyperparameters?

In [35]:
from sklearn.model_selection import RandomizedSearchCV

param_distribs = {
    "n_neurons": [10,50,100,150],
    "batch_size": [10,20,50,100],
    "n_hidden_layers": [2,3,4,5,6],
    "learning_rate": [0.01,0.02]
}

rnd_search = RandomizedSearchCV(DNNClassifier(), param_distribs, n_iter=10,
                                fit_params={"X_valid": X_valid1, "y_valid": y_valid1, "n_epochs": 200}, 
                                verbose=2)
rnd_search.fit(X_train1, y_train1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_neurons=10, n_hidden_layers=2, learning_rate=0.02, batch_size=10 
0	Validation loss: 0.126747	Best loss: 0.126747	Accuracy: 96.599%
1	Validation loss: 0.111963	Best loss: 0.111963	Accuracy: 96.873%
2	Validation loss: 0.141261	Best loss: 0.111963	Accuracy: 96.169%
3	Validation loss: 0.109532	Best loss: 0.109532	Accuracy: 97.068%
4	Validation loss: 0.166700	Best loss: 0.109532	Accuracy: 96.052%
5	Validation loss: 0.156626	Best loss: 0.109532	Accuracy: 96.833%
6	Validation loss: 0.112775	Best loss: 0.109532	Accuracy: 96.560%
7	Validation loss: 0.152289	Best loss: 0.109532	Accuracy: 97.029%
8	Validation loss: 0.135275	Best loss: 0.109532	Accuracy: 96.951%
9	Validation loss: 0.188457	Best loss: 0.109532	Accuracy: 95.309%
10	Validation loss: 0.168280	Best loss: 0.109532	Accuracy: 97.068%
11	Validation loss: 0.120843	Best loss: 0.109532	Accuracy: 97.459%
12	Validation loss: 0.123251	Best loss: 0.109532	Accuracy: 97.263%
13	Va

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.9s remaining:    0.0s


0	Validation loss: 0.165866	Best loss: 0.165866	Accuracy: 95.739%
1	Validation loss: 0.119879	Best loss: 0.119879	Accuracy: 97.342%
2	Validation loss: 0.133423	Best loss: 0.119879	Accuracy: 96.599%
3	Validation loss: 0.160040	Best loss: 0.119879	Accuracy: 95.504%
4	Validation loss: 0.149861	Best loss: 0.119879	Accuracy: 96.833%
5	Validation loss: 0.188397	Best loss: 0.119879	Accuracy: 95.817%
6	Validation loss: 0.127417	Best loss: 0.119879	Accuracy: 96.833%
7	Validation loss: 0.137712	Best loss: 0.119879	Accuracy: 96.482%
8	Validation loss: 0.134159	Best loss: 0.119879	Accuracy: 96.873%
9	Validation loss: 0.133186	Best loss: 0.119879	Accuracy: 95.778%
10	Validation loss: 0.185027	Best loss: 0.119879	Accuracy: 95.387%
11	Validation loss: 0.135414	Best loss: 0.119879	Accuracy: 96.482%
12	Validation loss: 0.188442	Best loss: 0.119879	Accuracy: 96.443%
13	Validation loss: 0.189533	Best loss: 0.119879	Accuracy: 96.521%
14	Validation loss: 0.139474	Best loss: 0.119879	Accuracy: 96.794%
15	Va

20	Validation loss: 0.605653	Best loss: 0.181907	Accuracy: 97.224%
21	Validation loss: 0.674504	Best loss: 0.181907	Accuracy: 95.661%
22	Validation loss: 1.009035	Best loss: 0.181907	Accuracy: 95.231%
23	Validation loss: 0.764647	Best loss: 0.181907	Accuracy: 96.873%
24	Validation loss: 0.956566	Best loss: 0.181907	Accuracy: 96.247%
Early Stopping!
[CV]  n_neurons=150, n_hidden_layers=2, learning_rate=0.02, batch_size=20, total=  30.8s
[CV] n_neurons=10, n_hidden_layers=5, learning_rate=0.01, batch_size=100 
0	Validation loss: 0.098141	Best loss: 0.098141	Accuracy: 97.342%
1	Validation loss: 0.090924	Best loss: 0.090924	Accuracy: 97.303%
2	Validation loss: 0.092820	Best loss: 0.090924	Accuracy: 97.459%
3	Validation loss: 0.083688	Best loss: 0.083688	Accuracy: 97.654%
4	Validation loss: 0.082736	Best loss: 0.082736	Accuracy: 97.850%
5	Validation loss: 0.086560	Best loss: 0.082736	Accuracy: 97.537%
6	Validation loss: 0.076479	Best loss: 0.076479	Accuracy: 97.850%
7	Validation loss: 0.083

13	Validation loss: 1.797066	Best loss: 0.346742	Accuracy: 19.273%
14	Validation loss: 1.700886	Best loss: 0.346742	Accuracy: 20.915%
15	Validation loss: 1.647481	Best loss: 0.346742	Accuracy: 19.273%
16	Validation loss: 1.652160	Best loss: 0.346742	Accuracy: 19.077%
17	Validation loss: 1.714665	Best loss: 0.346742	Accuracy: 22.009%
18	Validation loss: 1.677096	Best loss: 0.346742	Accuracy: 22.009%
19	Validation loss: 1.668743	Best loss: 0.346742	Accuracy: 22.009%
20	Validation loss: 1.709270	Best loss: 0.346742	Accuracy: 19.273%
21	Validation loss: 1.660771	Best loss: 0.346742	Accuracy: 18.726%
Early Stopping!
[CV]  n_neurons=150, n_hidden_layers=6, learning_rate=0.01, batch_size=20, total=  43.4s
[CV] n_neurons=150, n_hidden_layers=6, learning_rate=0.01, batch_size=20 
0	Validation loss: 0.159711	Best loss: 0.159711	Accuracy: 96.833%
1	Validation loss: 1.978220	Best loss: 0.159711	Accuracy: 19.273%
2	Validation loss: 1.645687	Best loss: 0.159711	Accuracy: 19.077%
3	Validation loss: 1

0	Validation loss: 0.089406	Best loss: 0.089406	Accuracy: 97.498%
1	Validation loss: 0.074695	Best loss: 0.074695	Accuracy: 97.733%
2	Validation loss: 0.075743	Best loss: 0.074695	Accuracy: 97.576%
3	Validation loss: 0.091640	Best loss: 0.074695	Accuracy: 97.889%
4	Validation loss: 0.066557	Best loss: 0.066557	Accuracy: 98.397%
5	Validation loss: 0.061910	Best loss: 0.061910	Accuracy: 98.241%
6	Validation loss: 0.096854	Best loss: 0.061910	Accuracy: 98.241%
7	Validation loss: 0.064961	Best loss: 0.061910	Accuracy: 98.436%
8	Validation loss: 0.094797	Best loss: 0.061910	Accuracy: 98.241%
9	Validation loss: 0.544545	Best loss: 0.061910	Accuracy: 94.605%
10	Validation loss: 0.186659	Best loss: 0.061910	Accuracy: 97.733%
11	Validation loss: 0.121694	Best loss: 0.061910	Accuracy: 98.124%
12	Validation loss: 0.102330	Best loss: 0.061910	Accuracy: 98.241%
13	Validation loss: 0.138678	Best loss: 0.061910	Accuracy: 98.397%
14	Validation loss: 0.109934	Best loss: 0.061910	Accuracy: 98.280%
15	Va

26	Validation loss: 0.094799	Best loss: 0.071481	Accuracy: 98.045%
27	Validation loss: 0.119848	Best loss: 0.071481	Accuracy: 98.045%
28	Validation loss: 0.092476	Best loss: 0.071481	Accuracy: 97.889%
29	Validation loss: 0.135294	Best loss: 0.071481	Accuracy: 97.654%
30	Validation loss: 0.139832	Best loss: 0.071481	Accuracy: 97.733%
31	Validation loss: 0.127226	Best loss: 0.071481	Accuracy: 97.967%
Early Stopping!
[CV]  n_neurons=10, n_hidden_layers=6, learning_rate=0.01, batch_size=50, total=  13.8s
[CV] n_neurons=50, n_hidden_layers=4, learning_rate=0.01, batch_size=20 
0	Validation loss: 0.107202	Best loss: 0.107202	Accuracy: 97.498%
1	Validation loss: 0.103761	Best loss: 0.103761	Accuracy: 97.654%
2	Validation loss: 0.100171	Best loss: 0.100171	Accuracy: 98.045%
3	Validation loss: 0.401478	Best loss: 0.100171	Accuracy: 95.426%
4	Validation loss: 0.086085	Best loss: 0.086085	Accuracy: 97.928%
5	Validation loss: 0.152593	Best loss: 0.086085	Accuracy: 97.029%
6	Validation loss: 0.0970

20	Validation loss: 1.355102	Best loss: 0.280083	Accuracy: 96.247%
21	Validation loss: 1.656543	Best loss: 0.280083	Accuracy: 96.912%
Early Stopping!
[CV]  n_neurons=150, n_hidden_layers=2, learning_rate=0.02, batch_size=10, total= 1.1min
[CV] n_neurons=150, n_hidden_layers=2, learning_rate=0.02, batch_size=10 
0	Validation loss: 0.768344	Best loss: 0.768344	Accuracy: 94.762%
1	Validation loss: 0.285757	Best loss: 0.285757	Accuracy: 92.885%
2	Validation loss: 0.550406	Best loss: 0.285757	Accuracy: 95.309%
3	Validation loss: 1.133031	Best loss: 0.285757	Accuracy: 91.830%
4	Validation loss: 1.111725	Best loss: 0.285757	Accuracy: 95.934%
5	Validation loss: 0.542917	Best loss: 0.285757	Accuracy: 96.638%
6	Validation loss: 0.410294	Best loss: 0.285757	Accuracy: 96.912%
7	Validation loss: 0.626380	Best loss: 0.285757	Accuracy: 94.957%
8	Validation loss: 0.380616	Best loss: 0.285757	Accuracy: 94.957%
9	Validation loss: 0.664005	Best loss: 0.285757	Accuracy: 95.817%
10	Validation loss: 0.87947

2	Validation loss: 0.104681	Best loss: 0.103465	Accuracy: 97.185%
3	Validation loss: 0.084738	Best loss: 0.084738	Accuracy: 97.850%
4	Validation loss: 0.088055	Best loss: 0.084738	Accuracy: 97.576%
5	Validation loss: 0.069645	Best loss: 0.069645	Accuracy: 98.124%
6	Validation loss: 0.075077	Best loss: 0.069645	Accuracy: 98.084%
7	Validation loss: 0.088379	Best loss: 0.069645	Accuracy: 97.772%
8	Validation loss: 0.080357	Best loss: 0.069645	Accuracy: 97.772%
9	Validation loss: 0.123169	Best loss: 0.069645	Accuracy: 97.303%
10	Validation loss: 0.086706	Best loss: 0.069645	Accuracy: 97.928%
11	Validation loss: 0.074533	Best loss: 0.069645	Accuracy: 97.811%
12	Validation loss: 0.085075	Best loss: 0.069645	Accuracy: 97.850%
13	Validation loss: 0.106269	Best loss: 0.069645	Accuracy: 97.537%
14	Validation loss: 0.087921	Best loss: 0.069645	Accuracy: 97.889%
15	Validation loss: 0.083550	Best loss: 0.069645	Accuracy: 97.850%
16	Validation loss: 0.074320	Best loss: 0.069645	Accuracy: 98.006%
17	

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 12.1min finished


0	Validation loss: 0.065705	Best loss: 0.065705	Accuracy: 97.967%
1	Validation loss: 0.070968	Best loss: 0.065705	Accuracy: 98.358%
2	Validation loss: 0.066692	Best loss: 0.065705	Accuracy: 98.202%
3	Validation loss: 0.070587	Best loss: 0.065705	Accuracy: 98.124%
4	Validation loss: 0.096074	Best loss: 0.065705	Accuracy: 97.772%
5	Validation loss: 0.092291	Best loss: 0.065705	Accuracy: 97.889%
6	Validation loss: 0.071830	Best loss: 0.065705	Accuracy: 98.593%
7	Validation loss: 0.067388	Best loss: 0.065705	Accuracy: 98.710%
8	Validation loss: 1.510571	Best loss: 0.065705	Accuracy: 95.543%
9	Validation loss: 0.123414	Best loss: 0.065705	Accuracy: 97.811%
10	Validation loss: 0.103560	Best loss: 0.065705	Accuracy: 98.319%
11	Validation loss: 0.106296	Best loss: 0.065705	Accuracy: 97.459%
12	Validation loss: 0.085290	Best loss: 0.065705	Accuracy: 97.850%
13	Validation loss: 0.090928	Best loss: 0.065705	Accuracy: 98.358%
14	Validation loss: 0.153566	Best loss: 0.065705	Accuracy: 97.381%
15	Va

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=DNNClassifier(activation=<function elu at 0x122915730>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<function variance_scaling_initializer.<locals>._initializer at 0x1c2b10b950>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=None),
          fit_params={'X_valid': array([[0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.],
       ...,
       [0., 0., ..., 0., 0.],
       [0., 0., ..., 0., 0.]], dtype=float32), 'y_valid': array([0, 4, ..., 1, 2], dtype=uint8), 'n_epochs': 200},
          iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_neurons': [10, 50, 100, 150], 'batch_size': [10, 20, 50, 100], 'n_hidden_layers': [2, 3, 4, 5, 6], 'learning_rate': [0.01, 0.02]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
 

In [36]:
rnd_search.best_estimator_

DNNClassifier(activation=<function elu at 0x122915730>,
       batch_norm_momentum=None, batch_size=100, dropout_rate=None,
       initializer=<function variance_scaling_initializer.<locals>._initializer at 0x1c2b10b950>,
       learning_rate=0.02, n_hidden_layers=3, n_neurons=50,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=None)

In [39]:

y_pred = rnd_search.best_estimator_.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9877408056042032

Got an extra 0.8% boost in accuracy from the hyperparameter search to 98.8% accuracy. The model is also considerably smaller: 3 hidden layers with 50 neurons each vs 5 hidden layers with 100 neurons each.

In [40]:
rnd_search.best_estimator_.save(os.getcwd() + "/tmp/my_best_mnist_model_0to4")

In [41]:
dnn_clf_best= rnd_search.best_estimator_

In [42]:
dnn_clf_best.fit(X_train1, y_train1, n_epochs=200, X_valid=X_valid1, y_valid=y_valid1)

0	Validation loss: 0.074438	Best loss: 0.074438	Accuracy: 97.772%
1	Validation loss: 0.063490	Best loss: 0.063490	Accuracy: 98.163%
2	Validation loss: 0.071202	Best loss: 0.063490	Accuracy: 98.554%
3	Validation loss: 0.072434	Best loss: 0.063490	Accuracy: 97.967%
4	Validation loss: 0.085798	Best loss: 0.063490	Accuracy: 98.006%
5	Validation loss: 0.074184	Best loss: 0.063490	Accuracy: 98.475%
6	Validation loss: 0.084736	Best loss: 0.063490	Accuracy: 97.850%
7	Validation loss: 0.077121	Best loss: 0.063490	Accuracy: 98.397%
8	Validation loss: 0.092576	Best loss: 0.063490	Accuracy: 98.671%
9	Validation loss: 0.174159	Best loss: 0.063490	Accuracy: 95.426%
10	Validation loss: 0.117952	Best loss: 0.063490	Accuracy: 98.632%
11	Validation loss: 0.090332	Best loss: 0.063490	Accuracy: 98.944%
12	Validation loss: 0.192857	Best loss: 0.063490	Accuracy: 98.436%
13	Validation loss: 0.109729	Best loss: 0.063490	Accuracy: 98.514%
14	Validation loss: 0.087223	Best loss: 0.063490	Accuracy: 98.514%
15	Va

DNNClassifier(activation=<function elu at 0x122915730>,
       batch_norm_momentum=None, batch_size=100, dropout_rate=None,
       initializer=<function variance_scaling_initializer.<locals>._initializer at 0x1c2b10b950>,
       learning_rate=0.02, n_hidden_layers=3, n_neurons=50,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=None)

Will adding batch normalization improve the convergence rate?

In [45]:
dnn_clf_bn = DNNClassifier(batch_size=100, learning_rate=0.02, 
                           n_hidden_layers=3, n_neurons=50, 
                           batch_norm_momentum=.95)

dnn_clf_bn.fit(X_train1, y_train1, n_epochs=200, X_valid=X_valid1, y_valid=y_valid1)

Instructions for updating:
Use keras.layers.batch_normalization instead.
Instructions for updating:
Use tf.cast instead.
0	Validation loss: 0.081830	Best loss: 0.081830	Accuracy: 97.576%
1	Validation loss: 0.045811	Best loss: 0.045811	Accuracy: 98.475%
2	Validation loss: 0.049957	Best loss: 0.045811	Accuracy: 98.358%
3	Validation loss: 0.054633	Best loss: 0.045811	Accuracy: 98.397%
4	Validation loss: 0.040926	Best loss: 0.040926	Accuracy: 98.710%
5	Validation loss: 0.040496	Best loss: 0.040496	Accuracy: 98.632%
6	Validation loss: 0.067757	Best loss: 0.040496	Accuracy: 98.045%
7	Validation loss: 0.053114	Best loss: 0.040496	Accuracy: 98.358%
8	Validation loss: 0.040338	Best loss: 0.040338	Accuracy: 98.944%
9	Validation loss: 0.035408	Best loss: 0.035408	Accuracy: 99.179%
10	Validation loss: 0.033591	Best loss: 0.033591	Accuracy: 98.944%
11	Validation loss: 0.034732	Best loss: 0.033591	Accuracy: 98.827%
12	Validation loss: 0.037494	Best loss: 0.033591	Accuracy: 99.062%
13	Validation loss

DNNClassifier(activation=<function elu at 0x122915730>,
       batch_norm_momentum=0.95, batch_size=100, dropout_rate=None,
       initializer=<function variance_scaling_initializer.<locals>._initializer at 0x1c2c077950>,
       learning_rate=0.02, n_hidden_layers=3, n_neurons=50,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=None)

In [46]:
y_pred = dnn_clf_bn.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9929947460595446

In [47]:
y_pred_train = dnn_clf_bn.predict(X_train1)
accuracy_score(y_train1, y_pred_train)

0.9995720094157928

Well, the model defintiely didn't converge faster, but we significantly increased our test accuracy from 98.8% to 99.3%!  We should not that the training accuracy is 99.95%, suggesting we are overfitting the training set.  Let's see if adding some dropout to the model will help:

In [48]:
dnn_clf_bn_dropout = DNNClassifier(batch_size=100, learning_rate=0.02, 
                           n_hidden_layers=3, n_neurons=50, 
                           batch_norm_momentum=.95, dropout_rate=0.5)

dnn_clf_bn_dropout.fit(X_train1, y_train1, n_epochs=200, X_valid=X_valid1, y_valid=y_valid1)

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
0	Validation loss: 0.116321	Best loss: 0.116321	Accuracy: 96.247%
1	Validation loss: 0.099511	Best loss: 0.099511	Accuracy: 96.912%
2	Validation loss: 0.090064	Best loss: 0.090064	Accuracy: 97.185%
3	Validation loss: 0.084490	Best loss: 0.084490	Accuracy: 97.342%
4	Validation loss: 0.075616	Best loss: 0.075616	Accuracy: 97.850%
5	Validation loss: 0.083259	Best loss: 0.075616	Accuracy: 97.381%
6	Validation loss: 0.074241	Best loss: 0.074241	Accuracy: 97.733%
7	Validation loss: 0.064662	Best loss: 0.064662	Accuracy: 98.006%
8	Validation loss: 0.067642	Best loss: 0.064662	Accuracy: 97.615%
9	Validation loss: 0.074290	Best loss: 0.064662	Accuracy: 97.850%
10	Validation loss: 0.063373	Best loss: 0.063373	Accuracy: 97.811%
11	Validation loss: 0.060571	Best loss: 0.060571	Accuracy: 98.124%
12	Validation loss: 0.060467	B

114	Validation loss: 0.037499	Best loss: 0.036382	Accuracy: 98.984%
115	Validation loss: 0.036469	Best loss: 0.036382	Accuracy: 98.944%
116	Validation loss: 0.039144	Best loss: 0.036382	Accuracy: 98.827%
117	Validation loss: 0.038562	Best loss: 0.036382	Accuracy: 98.866%
118	Validation loss: 0.039233	Best loss: 0.036382	Accuracy: 98.827%
119	Validation loss: 0.038979	Best loss: 0.036382	Accuracy: 98.827%
120	Validation loss: 0.036247	Best loss: 0.036247	Accuracy: 98.866%
121	Validation loss: 0.036813	Best loss: 0.036247	Accuracy: 98.944%
122	Validation loss: 0.037473	Best loss: 0.036247	Accuracy: 98.866%
123	Validation loss: 0.037674	Best loss: 0.036247	Accuracy: 98.944%
124	Validation loss: 0.037914	Best loss: 0.036247	Accuracy: 98.788%
125	Validation loss: 0.035780	Best loss: 0.035780	Accuracy: 98.984%
126	Validation loss: 0.036685	Best loss: 0.035780	Accuracy: 98.905%
127	Validation loss: 0.036283	Best loss: 0.035780	Accuracy: 98.827%
128	Validation loss: 0.034768	Best loss: 0.03476

DNNClassifier(activation=<function elu at 0x122915730>,
       batch_norm_momentum=0.95, batch_size=100, dropout_rate=0.5,
       initializer=<function variance_scaling_initializer.<locals>._initializer at 0x1c2c077950>,
       learning_rate=0.02, n_hidden_layers=3, n_neurons=50,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=None)

In [49]:
y_pred = dnn_clf_bn_dropout.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9902704806382565

Adding dropout did not help and significantly slowed down convergence in this case.

## Transfer Learning
We will use the previous model trained on digits 0 to 4 to create a new network on digits 5 to 9 using only 100 images per digit.

In [115]:
X_train2_full = mnist.train.images[mnist.train.labels >= 5]
y_train2_full = mnist.train.labels[mnist.train.labels >= 5] - 5 #tf expects integers from 0 to n_classes-1
X_valid2_full = mnist.validation.images[mnist.validation.labels >= 5]
y_valid2_full = mnist.validation.labels[mnist.validation.labels >= 5] - 5
X_test2 = mnist.test.images[mnist.test.labels >= 5]
y_test2 = mnist.test.labels[mnist.test.labels >= 5] - 5

In [116]:
def sample_n_instances_per_class(X, y, n=100):
    Xs, ys = [], []
    for label in np.unique(y):
        idx = (y == label)
        Xc = X[idx][:n]
        yc = y[idx][:n]
        Xs.append(Xc)
        ys.append(yc)
    return np.concatenate(Xs), np.concatenate(ys)

In [117]:
X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)
X_valid2, y_valid2 = sample_n_instances_per_class(X_valid2_full, y_valid2_full, n=30)

Creating a new DNN that reuses all the pretrained hidden layers of the previous model, freezes them, and replaces the softmax output layer with a new one

In [118]:
tf.reset_default_graph()

restore_saver = tf.train.import_meta_graph(os.getcwd() + "/tmp/my_best_mnist_model_0to4.meta")

X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
loss = tf.get_default_graph().get_tensor_by_name("loss:0")
Y_proba = tf.get_default_graph().get_tensor_by_name("Y_proba:0")
logits = Y_proba.op.inputs[0]
accuracy = tf.get_default_graph().get_tensor_by_name("accuracy:0")

In [119]:
learning_rate = 0.001
# only allowing optimizer to vary output layer variables
output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)

In [120]:
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
init = tf.global_variables_initializer()
frozen_saver = tf.train.Saver()

In [121]:
n_epochs = 200
batch_size = 20

max_checks_without_progress = 50
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess,os.getcwd() + "/tmp/my_best_mnist_model_0to4")
    for var in output_layer_vars:
        var.initializer.run()
        
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            best_loss = loss_val
            checks_without_progress = 0
            save_path = frozen_saver.save(sess, os.getcwd() + "/tmp/my_mnist_model_5to9_frozen")
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early Stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.3f}%".format(
                        epoch, loss_val, best_loss, acc_val * 100))

with tf.Session() as sess:
    frozen_saver.restore(sess, os.getcwd() + "/tmp/my_mnist_model_5to9_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.3f}%".format(acc_test * 100))

INFO:tensorflow:Restoring parameters from /Users/dhensle/Documents/dabbles-in-ML/oreilly/tmp/my_best_mnist_model_0to4
0	Validation loss: 6.009096	Best loss: 6.009096	Accuracy: 16.000%
1	Validation loss: 5.313174	Best loss: 5.313174	Accuracy: 29.333%
2	Validation loss: 4.886661	Best loss: 4.886661	Accuracy: 32.000%
3	Validation loss: 4.605320	Best loss: 4.605320	Accuracy: 39.333%
4	Validation loss: 4.337883	Best loss: 4.337883	Accuracy: 34.000%
5	Validation loss: 4.077963	Best loss: 4.077963	Accuracy: 32.000%
6	Validation loss: 3.831248	Best loss: 3.831248	Accuracy: 33.333%
7	Validation loss: 3.611040	Best loss: 3.611040	Accuracy: 30.000%
8	Validation loss: 3.372285	Best loss: 3.372285	Accuracy: 29.333%
9	Validation loss: 3.149727	Best loss: 3.149727	Accuracy: 28.667%
10	Validation loss: 2.942883	Best loss: 2.942883	Accuracy: 34.000%
11	Validation loss: 2.749185	Best loss: 2.749185	Accuracy: 36.000%
12	Validation loss: 2.580470	Best loss: 2.580470	Accuracy: 35.333%
13	Validation loss: 2

The smaller model with only 3 hidden layers of 50 nodes each was not a sufficient starting model for transfer learning on the other 5 to 9 digits, reaching only 46% accuracy on the test set.  How does this compare to the default graph having 5 hidden layers of 100 nodes each?

In [109]:
dnn_clf.save(os.getcwd() + "/tmp/mnist_5hidden_default_0to4")

In [110]:
y_pred = dnn_clf.predict(X_test1)
accuracy_score(y_test1, y_pred)

0.9801517805020432

In [130]:
tf.reset_default_graph()

restore_saver = tf.train.import_meta_graph(os.getcwd() + "/tmp/mnist_5hidden_default_0to4.meta")

X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
loss = tf.get_default_graph().get_tensor_by_name("loss:0")
Y_proba = tf.get_default_graph().get_tensor_by_name("Y_proba:0")
logits = Y_proba.op.inputs[0]
accuracy = tf.get_default_graph().get_tensor_by_name("accuracy:0")

In [131]:
learning_rate = 0.01
# only allowing optimizer to vary output layer variables
output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="logits")
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)

In [132]:
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
init = tf.global_variables_initializer()
frozen_saver = tf.train.Saver()

In [133]:
n_epochs = 200
batch_size = 20

max_checks_without_progress = 50
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess,os.getcwd() + "/tmp/mnist_5hidden_default_0to4")
    for var in output_layer_vars:
        var.initializer.run()
        
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            best_loss = loss_val
            checks_without_progress = 0
            save_path = frozen_saver.save(sess, os.getcwd() + "/tmp/my_mnist_model_5to9_frozen_v2")
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early Stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.3f}%".format(
                        epoch, loss_val, best_loss, acc_val * 100))

with tf.Session() as sess:
    frozen_saver.restore(sess, os.getcwd() + "/tmp/my_mnist_model_5to9_frozen_v2")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.3f}%".format(acc_test * 100))

INFO:tensorflow:Restoring parameters from /Users/dhensle/Documents/dabbles-in-ML/oreilly/tmp/mnist_5hidden_default_0to4
0	Validation loss: 1.608032	Best loss: 1.608032	Accuracy: 40.000%
1	Validation loss: 1.353469	Best loss: 1.353469	Accuracy: 42.000%
2	Validation loss: 1.276303	Best loss: 1.276303	Accuracy: 42.000%
3	Validation loss: 1.228508	Best loss: 1.228508	Accuracy: 50.000%
4	Validation loss: 1.482660	Best loss: 1.228508	Accuracy: 44.667%
5	Validation loss: 1.338100	Best loss: 1.228508	Accuracy: 43.333%
6	Validation loss: 1.249893	Best loss: 1.228508	Accuracy: 47.333%
7	Validation loss: 1.218791	Best loss: 1.218791	Accuracy: 45.333%
8	Validation loss: 1.395154	Best loss: 1.218791	Accuracy: 41.333%
9	Validation loss: 1.381170	Best loss: 1.218791	Accuracy: 40.000%
10	Validation loss: 1.234009	Best loss: 1.218791	Accuracy: 51.333%
11	Validation loss: 1.348279	Best loss: 1.218791	Accuracy: 44.667%
12	Validation loss: 1.247799	Best loss: 1.218791	Accuracy: 46.667%
13	Validation loss:

This model does slightly better, achieving ~50% on the test set. Not great, but let's see just how bad a completely new network would do with only 100 training instances for each digit:

In [135]:
dnn_clf_5to9_small = DNNClassifier()
dnn_clf_5to9_small.fit(X_train2, y_train2, n_epochs=200, X_valid= X_valid2, y_valid=y_valid2)

0	Validation loss: 0.717509	Best loss: 0.717509	Accuracy: 73.333%
1	Validation loss: 0.550045	Best loss: 0.550045	Accuracy: 84.000%
2	Validation loss: 0.457258	Best loss: 0.457258	Accuracy: 90.000%
3	Validation loss: 0.620275	Best loss: 0.457258	Accuracy: 86.000%
4	Validation loss: 0.558197	Best loss: 0.457258	Accuracy: 88.000%
5	Validation loss: 0.646159	Best loss: 0.457258	Accuracy: 89.333%
6	Validation loss: 0.580513	Best loss: 0.457258	Accuracy: 92.000%
7	Validation loss: 1.180428	Best loss: 0.457258	Accuracy: 81.333%
8	Validation loss: 0.657429	Best loss: 0.457258	Accuracy: 86.000%
9	Validation loss: 1.531142	Best loss: 0.457258	Accuracy: 86.667%
10	Validation loss: 0.851642	Best loss: 0.457258	Accuracy: 90.000%
11	Validation loss: 1.160095	Best loss: 0.457258	Accuracy: 88.000%
12	Validation loss: 1.447976	Best loss: 0.457258	Accuracy: 89.333%
13	Validation loss: 0.886492	Best loss: 0.457258	Accuracy: 92.000%
14	Validation loss: 1.050790	Best loss: 0.457258	Accuracy: 88.000%
15	Va

DNNClassifier(activation=<function elu at 0x122915730>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<function variance_scaling_initializer.<locals>._initializer at 0x1c2c077950>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=None)

In [137]:
y_pred = dnn_clf_5to9_small.predict(X_test2)
accuracy_score(y_test2, y_pred)

0.8646369059864225

Well, looks like transfer learning was not applicable in this particular task, but we learned anyways!

## What did we accomplish?

1. Developed a Scikit-Learn compatible DNNClassifier class that implemented early stopping, batch normalization, and dropout.
2. Applied this new classifier to mnist digits 0 to 4 and did a small hyperparameter search.
3. Used the hidden layers of the 0 to 4 classifier with a new output layer to predict the 5 to 9 digits with only 100 training instances for each digit.  While this achieved accuracy less than a network trained from scratch on that small trianing set, it still demonstrated the transfer learning technique.