In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.python.keras.datasets import mnist
from tensorflow.contrib.eager.python import tfe

  from ._conv import register_converters as _register_converters


In [2]:
# Switch TensorFlow to eager execution so ops run immediately instead of
# being collected into a graph first.
tf.enable_eager_execution()

# Pin the NumPy and TensorFlow random seeds to the same fixed value.
np.random.seed(0)
tf.set_random_seed(0)

In [3]:
# hyper-parameters shared by the cells below
num_classes = 10   # depth of the one-hot labels and width of the Dense layer
epochs = 10        # number of passes over the training set given to model.fit
batch_size = 128   # samples per batch for model.fit and model.evaluate

In [5]:
# fetch the MNIST train / test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# flatten every image into one row of 28 * 28 values, cast to float32 and
# divide by 255 (maps 8-bit pixel intensities into the [0, 1] range)
x_train = x_train.reshape((-1, 28 * 28)).astype('float32') / 255.
x_test = x_test.reshape((-1, 28 * 28)).astype('float32') / 255.

# one-hot encode the labels; .numpy() turns the eager tensors back into numpy
# arrays because keras cannot take a mix of numpy arrays and tensors as input
y_train_ohe, y_test_ohe = (
    tf.one_hot(labels, depth=num_classes).numpy() for labels in (y_train, y_test)
)

# report the final shape of every array that is handed to the model
for split_name, split_array in (('x train', x_train), ('y train', y_train_ohe),
                                ('x test', x_test), ('y test', y_test_ohe)):
    print(split_name, split_array.shape)

x train (60000, 784)
y train (60000, 10)
x test (10000, 784)
y test (10000, 10)


# Logistic regression in Eager
This is the standard linear classifier that is the easiest to build in almost all frameworks.

In Keras, it's even easier, with a single `Dense` layer doing all of the important work.

## The Catch

Normally, we output the raw logits without an activation function, and then use `tf.nn.softmax_cross_entropy_with_logits` to use softmax and calculate the loss in one go.

We, however, are going to use `Model.fit()`, which will **not** be using `tf.nn.softmax_cross_entropy_with_logits`, and therefore we need to use the `softmax` activation function for the final layer.

Now another issue crops up - the softmax op doesn't exist for the GPU, only the CPU. This is easy enough to fix, thankfully. Simply wrap the softmax activation in a `with tf.device('/cpu:0'):` block to force it onto the CPU.

Note, for Keras, it is important **not to use activation='softmax' for the final Dense layer**. Since the layer will be on the GPU, it will try to run the softmax activation on the GPU as well, and cause an exception. For Eager mode, stick to applying the activation separately inside a `tf.device()` block.

## A word on performance
TensorFlow 1.8 now automatically and silently places operations on the CPU or GPU, which is great for usability and bad for maximum performance. When using TensorFlow without Eager, we generally let TensorFlow decide where to place ops, since the graph is later optimized to get near-optimal performance anyway.

Eager doesn't bother with graphs, so there isn't much room for optimizations. Instead, I will be checking for GPU availability and forcing the entire training or testing loop onto the available device.

If you don't want this, or would rather not bother with performance fine-tuning like this, you can simply skip the `with tf.device()` wrapper. However, when speed of execution on the GPU is compared to other frameworks, say PyTorch, this is absolutely important, especially for certain models like custom RNNs. I have noticed speed-ups of over 4x when forcing everything - the model, the optimizer, and the fit, predict and evaluate calls - onto the GPU, and it ranks quite closely with PyTorch at that level.

This is of course just an example, and it won't hold true for everything, but it's nice to use the GPU at maximum possible utilization if you have one.

In [6]:
# model definition (canonical way)
class LogisticRegression(tf.keras.Model):
    """Linear classifier: a single `Dense` layer followed by a softmax.

    Args:
        num_classes: number of units in the `Dense` layer, which is also the
            length of the vector returned for every input row.
    """

    def __init__(self, num_classes):
        super(LogisticRegression, self).__init__()
        # no activation is set on the layer; softmax is applied separately in `call`
        self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, inputs, training=None, mask=None):
        """Run `inputs` through the dense layer and return the softmax of the result.

        `training` and `mask` are accepted but not used by this model.
        """
        logits = self.dense(inputs)

        # softmax is pinned to the cpu because the op does not exist on the gpu
        with tf.device('/cpu:0'):
            return tf.nn.softmax(logits)

In [8]:
# run on the first GPU when at least one is reported, otherwise on the CPU
device = '/gpu:0' if tfe.num_gpus() else '/cpu:0'

with tf.device(device):
    # create the model, then attach the optimizer, loss and metric to it
    model = LogisticRegression(num_classes)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.train.GradientDescentOptimizer(0.1),
        metrics=['accuracy'],
    )

    # Without this step TF Keras tries to use the entire dataset to determine
    # shapes when .fit() is called. Feeding a single dummy sample lets the model
    # work out its input/output shape/s up front instead.
    dummy_x = tf.zeros((1, 28 * 28))
    model._set_inputs(dummy_x)

    # fit on the training split, validating against the test split every epoch
    model.fit(
        x_train, y_train_ohe,
        epochs=epochs,
        batch_size=batch_size,
        verbose=2,
        validation_data=(x_test, y_test_ohe),
    )

    # final loss / accuracy on the test split
    scores = model.evaluate(x_test, y_test_ohe, batch_size=batch_size, verbose=2)
    print("Final test loss and accuracy :", scores)


Train on 60000 samples, validate on 10000 samples
Epoch 1/10
 - 5s - loss: 0.5850 - acc: 0.8504 - val_loss: 0.3811 - val_acc: 0.8988
Epoch 2/10
 - 6s - loss: 0.3759 - acc: 0.8966 - val_loss: 0.3356 - val_acc: 0.9080
Epoch 3/10
 - 6s - loss: 0.3436 - acc: 0.9041 - val_loss: 0.3181 - val_acc: 0.9129
Epoch 4/10
 - 5s - loss: 0.3269 - acc: 0.9091 - val_loss: 0.3055 - val_acc: 0.9153
Epoch 5/10
 - 6s - loss: 0.3160 - acc: 0.9114 - val_loss: 0.3022 - val_acc: 0.9179
Epoch 6/10
 - 6s - loss: 0.3083 - acc: 0.9135 - val_loss: 0.2926 - val_acc: 0.9164
Epoch 7/10
 - 6s - loss: 0.3025 - acc: 0.9155 - val_loss: 0.2903 - val_acc: 0.9191
Epoch 8/10
 - 6s - loss: 0.2978 - acc: 0.9169 - val_loss: 0.2880 - val_acc: 0.9205
Epoch 9/10
 - 6s - loss: 0.2940 - acc: 0.9184 - val_loss: 0.2854 - val_acc: 0.9211
Epoch 10/10
 - 6s - loss: 0.2905 - acc: 0.9187 - val_loss: 0.2834 - val_acc: 0.9199
Final test loss and accuracy : [0.2833801755428314, 0.9199]
