In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.python.keras.datasets import fashion_mnist
from tensorflow.contrib.eager.python import tfe

  from ._conv import register_converters as _register_converters


In [2]:
# Eager execution has to be switched on before any other TensorFlow call is made.
tf.enable_eager_execution()

# Use one seed value for both TensorFlow and NumPy.
seed = 0
tf.set_random_seed(seed)
np.random.seed(seed)

In [3]:
# Make sure the checkpoint root exists. `exist_ok=True` makes this cell safe to
# re-run and avoids the race between checking for the directory and creating it.
os.makedirs('weights/', exist_ok=True)

# constants
image_size = 28    # height and width (in pixels) the images are reshaped to
batch_size = 128   # samples per step, used by fit() and evaluate()
epochs = 10        # number of passes over the training set
num_classes = 10   # depth of the one-hot labels and width of the output layer

In [4]:
# dataset loading
def _to_model_input(images):
    """Reshape images to (-1, image_size, image_size, 1) and scale them by 1/255 as float32."""
    images = images.reshape((-1, image_size, image_size, 1))
    return images.astype('float32') / 255.


def _to_one_hot(labels):
    """One-hot encode integer labels and return the result as a numpy array.

    The conversion back to numpy is needed because a mix of numpy arrays and
    tensors cannot be passed as input to keras.
    """
    return tf.one_hot(labels, depth=num_classes).numpy()


(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

x_train = _to_model_input(x_train)
x_test = _to_model_input(x_test)

y_train_ohe = _to_one_hot(y_train)
y_test_ohe = _to_one_hot(y_test)

print('x train', x_train.shape)
print('y train', y_train_ohe.shape)
print('x test', x_test.shape)
print('y test', y_test_ohe.shape)

x train (60000, 28, 28, 1)
y train (60000, 10)
x test (10000, 28, 28, 1)
y test (10000, 10)


# Create a basic Conv layer helper

In [5]:
# 3x3 convolution
def conv3x3(channels, stride=1, kernel=(3, 3)):
    """Create a bias-free, 'same'-padded Conv2D layer.

    Args:
        channels: number of output filters.
        stride: stride applied along both spatial dimensions.
        kernel: kernel size; defaults to (3, 3).

    Returns:
        A `tf.keras.layers.Conv2D` layer using a variance-scaling kernel initializer.
    """
    layer_options = dict(
        strides=(stride, stride),
        padding='same',
        use_bias=False,
        kernel_initializer=tf.variance_scaling_initializer(),
    )
    return tf.keras.layers.Conv2D(channels, kernel, **layer_options)

# ResNet block builder

This can be either an identity block or a convolution block. Since this is only an example, I am using the pre-activation variant of ResNet without bottleneck blocks.

In [6]:
class ResnetBlock(tf.keras.Model):
    """Pre-activation residual block: two BN -> ReLU -> conv stages plus a shortcut.

    Args:
        channels: number of filters used by every convolution in the block.
        strides: stride of the first convolution (and of the shortcut
            convolution when `residual_path` is set).
        residual_path: when True, the shortcut goes through its own
            BN -> ReLU -> 1x1 conv instead of being the unmodified input.
    """

    def __init__(self, channels, strides=1, residual_path=False):
        super(ResnetBlock, self).__init__()
        self.channels = channels
        self.strides = strides
        self.residual_path = residual_path

        # main path
        self.conv1 = conv3x3(channels, strides)
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.conv2 = conv3x3(channels)
        self.bn2 = tf.keras.layers.BatchNormalization()

        # projection shortcut, only created on request
        if residual_path:
            self.down_conv = conv3x3(channels, strides, kernel=(1, 1))
            self.down_bn = tf.keras.layers.BatchNormalization()

    @staticmethod
    def _bn_relu_conv(bn, conv, tensor, training):
        """Apply batch norm, then ReLU, then the convolution to `tensor`."""
        activated = tf.nn.relu(bn(tensor, training=training))
        return conv(activated)

    def call(self, inputs, training=None, mask=None):
        """Run the block on `inputs` and return main path + shortcut."""
        main = self._bn_relu_conv(self.bn1, self.conv1, inputs, training)
        main = self._bn_relu_conv(self.bn2, self.conv2, main, training)

        if self.residual_path:
            shortcut = self._bn_relu_conv(self.down_bn, self.down_conv, inputs, training)
        else:
            shortcut = inputs

        return main + shortcut

# Create a Configurable Network

This network is adaptive, in that it can have many layers, and therefore we cannot determine the layers beforehand.

To remedy this, we use the convenient `setattr` (and optionally `getattr`) to dynamically "register" and "call" sublayers.

# Note on why this is needed

Eager Models *will* automatically register all variables that have been bound to an identifier inside that class - 

- Using `self.layer_name = tf.keras.layers.***`
- Using `self.block = ClassWhichInheritsModel(...)`

However, **it will not register variables that have not been bound directly to the class itself or are custom variables.**

- Using `self.layers = [layer1, layer2]`
- Using `self.layers = {'l1':layer1, 'l2':layer2}`
- Using `self.variable = tf.get_variable(...)`

Special case : 

- Using `self.cells = [LSTMCell(), LSTMCell()]` and then wrapping it around an RNN as : `self.rnn = RNN(self.cells)` **will work as expected**. The weights of the LSTMCell will be registered and the RNN itself is registered as well.

**`setattr` and `getattr` bypass the above issues: they bind the layers or models to the class itself, so they are registered by Keras.**

# Note 2

This registration of layers is important only for convenience of using Model methods - when using Model.compile(), Model.fit(), Model.predict() and Model.evaluate().

If there is no need for these utilities, you can write the class as you want, extract all the variables in a list, get the gradients using `tf.GradientTape()` and then update the parameters by hand using `Optimizer.apply_gradients()`. In such a scenario, even the **Model._set_inputs(...)** fix need not be applied, since you will be doing batch-level training anyway and the first update will use that small batch to determine the shape of the model. Such an example is shown in `10_custom_model.ipynb`.

However, it is far too convenient to use Keras' inbuilt methods for general use-cases such as classification and regression.

In [7]:
class ResNet(tf.keras.Model):
    """Configurable pre-activation ResNet classifier built from `ResnetBlock`s.

    Args:
        block_list: sequence of ints; entry `i` is the number of residual
            blocks in stage `i`. The filter count doubles after every stage,
            and every stage except the first starts with a stride-2 block
            that uses a projection shortcut.
        num_classes: number of units of the final Dense layer.
        initial_filters: filters of the initial convolution and of the first stage.
        **kwargs: forwarded to `tf.keras.Model.__init__`.
    """

    def __init__(self, block_list, num_classes, initial_filters=16, **kwargs):
        super(ResNet, self).__init__(**kwargs)
        self.num_blocks = len(block_list)
        self.block_list = block_list

        self.in_channels = initial_filters
        self.out_channels = initial_filters
        self.conv_initial = conv3x3(self.out_channels)

        # ordered list used by call(); registration happens via setattr below
        self.blocks = []

        # build all the blocks
        for block_id, num_layers in enumerate(block_list):
            for layer_id in range(num_layers):
                key = 'block_%d_%d' % (block_id + 1, layer_id + 1)
                if block_id != 0 and layer_id == 0:
                    # first block of every later stage downsamples
                    block = ResnetBlock(self.out_channels, strides=2, residual_path=True)
                else:
                    # a projection shortcut is only needed when the channel count changes
                    residual_path = self.in_channels != self.out_channels
                    block = ResnetBlock(self.out_channels, residual_path=residual_path)

                self.in_channels = self.out_channels

                # "register" this block to this model ; without this, weights won't update.
                setattr(self, key, block)

                self.blocks.append(block)

            self.out_channels *= 2

        self.final_bn = tf.keras.layers.BatchNormalization()
        self.avg_pool = tf.keras.layers.GlobalAveragePooling2D()
        self.fc = tf.keras.layers.Dense(num_classes)

    def call(self, inputs, training=None, mask=None):
        """Forward pass; returns the softmax over the Dense layer's outputs.

        Args:
            inputs: input tensor fed to the initial convolution.
            training: forwarded to every residual block and to the final
                batch normalization layer.
            mask: accepted for Keras API compatibility; unused.
        """
        out = self.conv_initial(inputs)

        # forward pass through all the blocks
        for block in self.blocks:
            out = block(out, training=training)

        # Forward the training flag here as well, so the final batch norm runs
        # in the same mode as the batch norms inside the residual blocks.
        out = self.final_bn(out, training=training)
        out = tf.nn.relu(out)

        out = self.avg_pool(out)
        out = self.fc(out)

        # softmax op does not exist on the gpu, so always use cpu
        with tf.device('/cpu:0'):
            output = tf.nn.softmax(out)

        return output

# Fashion MNIST
Here, we try a harder dataset than the basic MNIST, where it is very easy to get 99% with even small networks. The basic average of small models on Fashion MNIST on the other hand is close to 90-92%.

In [8]:
# use the first GPU when one is available, otherwise stay on the CPU
device = '/gpu:0' if tfe.num_gpus() != 0 else '/cpu:0'

with tf.device(device):
    # build model and optimizer
    model = ResNet([2, 2, 2], num_classes)
    adam = tf.train.AdamOptimizer(0.001)
    model.compile(
        optimizer=adam,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Without this step, TF Keras tries to use the entire dataset to determine shapes in .fit().
    # Fix = hand the model exactly one dummy sample so it can determine its input/output shape/s.
    dummy_x = tf.zeros((1, image_size, image_size, 1))
    model._set_inputs(dummy_x)

    print("Number of variables in the model :", len(model.variables))
    model.summary()

    # train
    model.fit(
        x_train,
        y_train_ohe,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x_test, y_test_ohe),
        verbose=1,
    )

    # evaluate on test set
    scores = model.evaluate(x_test, y_test_ohe, batch_size=batch_size, verbose=1)
    print("Final test loss and accuracy :", scores)

    # write the model's variables to a checkpoint
    saver = tfe.Saver(model.variables)
    saver.save('weights/05_resnet/weights.ckpt')

Number of variables in the model : 77
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            multiple                  144       
_________________________________________________________________
resnet_block_1 (ResnetBlock) multiple                  4736      
_________________________________________________________________
resnet_block_2 (ResnetBlock) multiple                  4736      
_________________________________________________________________
resnet_block_3 (ResnetBlock) multiple                  14592     
_________________________________________________________________
resnet_block_4 (ResnetBlock) multiple                  18688     
_________________________________________________________________
resnet_block_5 (ResnetBlock) multiple                  57856     
_________________________________________________________________
resnet_block_6 (ResnetBlock) multiple 