In [1]:
from learning_lib.nn.monitoring.activations import ActivationMonitor
from learning_lib.nn.monitoring.weights import WeightMonitor
from learning_lib.nn.monitoring.gradient import LossGradientMonitor, LossGradientNormMonitor
from learning_lib.nn.monitoring.loss import LossMonitor
from learning_lib.nn.ffnn import FFNN

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import plotly.offline as plotly
import plotly.graph_objs as go
from PIL import Image

  from ._conv import register_converters as _register_converters


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Enable plotly's offline notebook rendering; connected=True loads plotly.js from the online CDN
# instead of embedding it in the notebook.
plotly.init_notebook_mode(connected=True)

# Problem Statement

When initializing weights using uniform distributions whose ranges shrink with the size of the incoming layer, we eventually encounter training problems as our networks get deeper. Training starts flat, with non-existent gradients except in the last few layers. I was not able to easily derive why this happens mathematically, but better initializations definitely solve the problem.

Convergence as networks get deeper seems to be a complicated process that can't be easily isolated to a few variables.

# Data

In [3]:
# Load MNIST from MNIST_data/ with one-hot encoded labels.
dataset = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Ask for far more rows than the training split holds so that a single call returns "everything".
# NOTE(review): depending on the TensorFlow version, an oversized next_batch may wrap around and
# return duplicated examples — confirm train_data[0].shape, or read dataset.train.images /
# dataset.train.labels directly.
train_data = dataset.train.next_batch(1000000)
# Centre the inputs by subtracting one global mean (a single scalar over every image and pixel).
train_in = train_data[0] - train_data[0].mean()
train_labels = train_data[1]

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [5]:
# Input pipeline: 30 passes over the (inputs, labels) pairs, served in mini-batches of 30.
# No shuffle step is applied here, so examples are consumed in the order they appear in train_in.
d = tf.data.Dataset.from_tensor_slices((train_in, train_labels)).repeat(30).batch(30)
iterator = d.make_one_shot_iterator()
# pipe_out is an (inputs, labels) pair of tensors producing the next mini-batch when evaluated.
pipe_out = iterator.get_next()

# Tanh

## 2 Hidden Layers

In [31]:
# Layer config for the 2-hidden-layer tanh network: the leading 784 is presumably the input width
# (28x28 MNIST pixels), followed by one dict per layer.
# Uniform weight bounds are roughly 1/sqrt(fan_in): 0.0357 ~ 1/sqrt(784), 0.022 ~ 1/sqrt(2048).
# Both bias bounds are 0, so every bias starts at exactly zero.
lc = [
    784,
    {
        'n_nodes': 2048, 'activation': tf.nn.tanh, 'init_weight_lower': -0.0357, 'init_weight_upper': 0.0357,
        'init_bias_lower': -0.0, 'init_bias_upper': 0.0
    },
    {
        'n_nodes': 2048, 'activation': tf.nn.tanh, 'init_weight_lower': -0.022, 'init_weight_upper': 0.022,
        'init_bias_lower': 0.0, 'init_bias_upper': 0.0
    },
    # Output layer: 10 classes, identity activation (the softmax is applied inside the loss).
    {
        'n_nodes': 10, 'activation': tf.identity, 'init_weight_lower': -0.022, 'init_weight_upper': 0.022,
        'init_bias_lower': 0.0, 'init_bias_upper': 0.0
    }
]

In [32]:
def cross_entropy_with_softmax(model_output, true_output):
    """Return the summed softmax cross-entropy between logits and labels.

    Args:
        model_output: Unnormalised network outputs, passed to TensorFlow as ``logits``.
        true_output: Target distribution (one-hot labels in this notebook), passed as ``labels``.

    Returns:
        A scalar tensor: the per-example cross-entropy summed (not averaged) over the batch,
        so the loss magnitude scales with the batch size.
    """
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=true_output, logits=model_output
    )
    return tf.reduce_sum(per_example_loss)

In [33]:
# Track the loss plus gradient-norm monitors for index 0 and index -1.
# NOTE(review): the arguments are presumably (sampling interval, layer index) — confirm against learning_lib.
monitors = [LossMonitor(200), LossGradientNormMonitor(200, 0), LossGradientNormMonitor(200, -1)]

In [34]:
# Build the 2-hidden-layer network. Inputs and targets are wired directly to the tf.data iterator
# tensors, and training uses plain gradient descent with learning rate 0.01.
network_2 = FFNN(
    lc, loss_func=cross_entropy_with_softmax, monitors=monitors, optimizer=tf.train.GradientDescentOptimizer(0.01),
    input_vector=pipe_out[0], train_targets_vector=pipe_out[1]
)

In [35]:
%%time
# Train from the pipeline tensors supplied at construction.
# NOTE(review): presumably runs until the one-shot iterator is exhausted — confirm in FFNN.train_online.
network_2.train_online()

CPU times: user 3min 53s, sys: 35.1 s, total: 4min 28s
Wall time: 2min 49s


In [36]:
# Plot the first monitor (the LossMonitor, assuming the network keeps the monitors in the order given).
plotly.iplot(network_2.monitors[0].plot())

## 4 Hidden Layers

We had previously attempted to train a 4-hidden-layer network with 2048 nodes per layer, as we had used in the 2-hidden-layer network. Despite observing reasonably healthy gradients, convergence could not be achieved even after trying a large variety of training hyperparameters.

[This](https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw) StackExchange answer suggested to us that we were attempting to train models which used an order of magnitude too many nodes per layer, and we are thus henceforth training with 256 nodes per hidden layer instead.

Furthermore, we had to reduce the learning rate by a factor of 10 to prevent NaN loss.

---

One observation in this case is that, since our hidden layers are so much smaller, learning progressed significantly faster than in the network with only two hidden layers.

In [24]:
# Layer config for the 4-hidden-layer tanh network (256 nodes per hidden layer, linear 10-way output).
# Uniform weight bounds are roughly 1/sqrt(fan_in): 0.0357 ~ 1/sqrt(784) for the first layer and
# 0.0625 = 1/sqrt(256) for every later layer. Biases start at exactly zero.
lc = (
    [
        784,
        {
            'n_nodes': 256, 'activation': tf.nn.tanh, 'init_weight_lower': -0.0357, 'init_weight_upper': 0.0357,
            'init_bias_lower': -0.0, 'init_bias_upper': 0.0
        },
    ]
    + [
        # Hidden layers 2-4 share identical settings; a fresh dict is built for each layer.
        {
            'n_nodes': 256, 'activation': tf.nn.tanh, 'init_weight_lower': -0.0625, 'init_weight_upper': 0.0625,
            'init_bias_lower': -0.0, 'init_bias_upper': 0.0
        }
        for _ in range(3)
    ]
    + [
        {
            'n_nodes': 10, 'activation': tf.identity, 'init_weight_lower': -0.0625, 'init_weight_upper': 0.0625,
            'init_bias_lower': -0.0, 'init_bias_upper': 0.0
        }
    ]
)

In [25]:
# NOTE(review): this notebook defines cross_entropy_with_softmax several times with the same body;
# a single definition near the top would suffice.
def cross_entropy_with_softmax(model_output, true_output):
    """Return the summed softmax cross-entropy between logits and labels.

    Args:
        model_output: Unnormalised network outputs, passed to TensorFlow as ``logits``.
        true_output: Target distribution (one-hot labels in this notebook), passed as ``labels``.

    Returns:
        A scalar tensor: the per-example cross-entropy summed (not averaged) over the batch.
    """
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=true_output, logits=model_output
    )
    return tf.reduce_sum(per_example_loss)

In [26]:
# Track the loss plus loss-gradient monitors for index 0 and index -1.
# NOTE(review): the arguments are presumably (sampling interval, layer index) — confirm against learning_lib.
monitors = [LossMonitor(200), LossGradientMonitor(200, 0), LossGradientMonitor(200, -1)]

In [27]:
# Build the 4-hidden-layer network with plain gradient descent at learning rate 0.001
# (ten times smaller than the 0.01 used for network_2).
# NOTE(review): pipe_out is the same iterator output used by the other networks in this notebook;
# whether each network sees a fresh pass over the data depends on how FFNN manages sessions — confirm.
network_4 = FFNN(
    lc, loss_func=cross_entropy_with_softmax, monitors=monitors, optimizer=tf.train.GradientDescentOptimizer(0.001),
    input_vector=pipe_out[0], train_targets_vector=pipe_out[1]
)

In [28]:
%%time
# Train from the pipeline tensors supplied at construction.
# NOTE(review): presumably runs until the one-shot iterator is exhausted — confirm in FFNN.train_online.
network_4.train_online()

CPU times: user 4min 42s, sys: 2min 24s, total: 7min 7s
Wall time: 5min 19s


In [29]:
# Plot the first monitor (the LossMonitor, assuming the network keeps the monitors in the order given).
plotly.iplot(network_4.monitors[0].plot())

In [30]:
# Training-set accuracy: the fraction of rows whose arg-max prediction equals the arg-max of the one-hot label.
train_pred = network_4.predict(train_in)
np.mean(train_pred.argmax(axis=1) == train_labels.argmax(axis=1))

1.0

## 16 Hidden Layers

Reducing the number of nodes per layer was necessary to train the 16-hidden-layer network as well, though the learning rate did not have to be tuned. In this case, learning was slower than in the 4-hidden-layer case.

Additionally, learning was very sensitive to the learning rate being too high. Setting the learning rate to 0.001 gave modest convergence, but setting the rate to 0.005 resulted in no learning at all. Initialization also appears to become an issue here, so we will not go deeper than 10 layers to avoid having to leverage more involved initialization.

In [84]:
# Layer config for the 16-hidden-layer tanh network (64 nodes per hidden layer, linear 10-way output).
# Uniform weight bounds are roughly 1/sqrt(fan_in): 0.0357 ~ 1/sqrt(784) for the first layer and
# 0.125 = 1/sqrt(64) for every later layer. Biases start at exactly zero.
lc = (
    [
        784,
        {
            'n_nodes': 64, 'activation': tf.nn.tanh, 'init_weight_lower': -0.0357, 'init_weight_upper': 0.0357,
            'init_bias_lower': -0.0, 'init_bias_upper': 0.0
        },
    ]
    + [
        # Hidden layers 2-16 share identical settings; a fresh dict is built for each layer.
        {
            'n_nodes': 64, 'activation': tf.nn.tanh, 'init_weight_lower': -0.125, 'init_weight_upper': 0.125,
            'init_bias_lower': 0.0, 'init_bias_upper': 0.0
        }
        for _ in range(15)
    ]
    + [
        {
            'n_nodes': 10, 'activation': tf.identity, 'init_weight_lower': -0.125, 'init_weight_upper': 0.125,
            'init_bias_lower': 0.0, 'init_bias_upper': 0.0
        }
    ]
)

In [85]:
# NOTE(review): this notebook defines cross_entropy_with_softmax several times with the same body;
# a single definition near the top would suffice.
def cross_entropy_with_softmax(model_output, true_output):
    """Return the summed softmax cross-entropy between logits and labels.

    Args:
        model_output: Unnormalised network outputs, passed to TensorFlow as ``logits``.
        true_output: Target distribution (one-hot labels in this notebook), passed as ``labels``.

    Returns:
        A scalar tensor: the per-example cross-entropy summed (not averaged) over the batch.
    """
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=true_output, logits=model_output
    )
    return tf.reduce_sum(per_example_loss)

In [86]:
# Track the loss plus loss-gradient monitors for index 0 and index -1.
# NOTE(review): the arguments are presumably (sampling interval, layer index) — confirm against learning_lib.
monitors = [LossMonitor(200), LossGradientMonitor(200, 0), LossGradientMonitor(200, -1)]

In [87]:
# Build the 16-hidden-layer network with plain gradient descent at learning rate 0.001.
# NOTE(review): pipe_out is the same iterator output used by the other networks in this notebook;
# whether each network sees a fresh pass over the data depends on how FFNN manages sessions — confirm.
network_16 = FFNN(
    lc, loss_func=cross_entropy_with_softmax, monitors=monitors, optimizer=tf.train.GradientDescentOptimizer(0.001),
    input_vector=pipe_out[0], train_targets_vector=pipe_out[1]
)

In [88]:
%%time
# Train from the pipeline tensors supplied at construction.
# NOTE(review): presumably runs until the one-shot iterator is exhausted — confirm in FFNN.train_online.
network_16.train_online()

CPU times: user 8min 8s, sys: 1min 16s, total: 9min 24s
Wall time: 6min 37s


In [90]:
# Loss curve drawn by hand from the first monitor's recorded values:
# x is its 'epochs' series, y is its 'loss' series.
plotly.iplot([
    go.Scatter(
        x=network_16.monitors[0].values['epochs'],
        y=network_16.monitors[0].values['loss']
    )
])

In [91]:
# Training-set accuracy: the fraction of rows whose arg-max prediction equals the arg-max of the one-hot label.
train_pred = network_16.predict(train_in)
np.mean(train_pred.argmax(axis=1) == train_labels.argmax(axis=1))

0.9932727272727273

In [92]:
# Heat map of the absolute 'bias' values recorded by monitors[1] (the index-0 LossGradientMonitor
# in the monitors list), transposed before plotting.
# NOTE(review): presumably one axis is bias units and the other is monitor samples — confirm.
plotly.iplot([
    go.Heatmap(
        z=np.abs(network_16.monitors[1].values['bias']).transpose(),
        colorscale=[
            [0.0, 'rgb(165,0,38)'],
            [0.1111111111111111, 'rgb(215,48,39)'],
            [0.2222222222222222, 'rgb(244,109,67)'],
            [0.3333333333333333, 'rgb(253,174,97)'],
            [0.4444444444444444, 'rgb(254,224,144)'],
            [0.5555555555555556, 'rgb(224,243,248)'],
            [0.6666666666666666, 'rgb(171,217,233)'],
            [0.7777777777777778, 'rgb(116,173,209)'],
            [0.8888888888888888, 'rgb(69,117,180)'],
            [1.0, 'rgb(49,54,149)']
        ]
    )
])

In [93]:
# Heat map of the absolute 'bias' values recorded by monitors[2] (the index -1 LossGradientMonitor
# in the monitors list), transposed before plotting.
# NOTE(review): presumably one axis is bias units and the other is monitor samples — confirm.
plotly.iplot([
    go.Heatmap(
        z=np.abs(network_16.monitors[2].values['bias']).transpose(),
        colorscale=[
            [0.0, 'rgb(165,0,38)'],
            [0.1111111111111111, 'rgb(215,48,39)'],
            [0.2222222222222222, 'rgb(244,109,67)'],
            [0.3333333333333333, 'rgb(253,174,97)'],
            [0.4444444444444444, 'rgb(254,224,144)'],
            [0.5555555555555556, 'rgb(224,243,248)'],
            [0.6666666666666666, 'rgb(171,217,233)'],
            [0.7777777777777778, 'rgb(116,173,209)'],
            [0.8888888888888888, 'rgb(69,117,180)'],
            [1.0, 'rgb(49,54,149)']
        ]
    )
])

# Not Solved by RELU

The problem doesn't seem to be activation. ReLU exhibits similar symptoms at just 10 layers.

In [94]:
# Layer config for the 10-hidden-layer ReLU network (64 nodes per hidden layer, linear 10-way output).
# Uniform weight bounds are roughly 1/sqrt(fan_in): 0.0357 ~ 1/sqrt(784) for the first layer and
# 0.125 = 1/sqrt(64) for every later layer. Biases start at exactly zero.
lc = (
    [
        784,
        {
            'n_nodes': 64, 'activation': tf.nn.relu, 'init_weight_lower': -0.0357, 'init_weight_upper': 0.0357,
            'init_bias_lower': -0.0, 'init_bias_upper': 0.0
        },
    ]
    + [
        # Hidden layers 2-10 share identical settings; a fresh dict is built for each layer.
        {
            'n_nodes': 64, 'activation': tf.nn.relu, 'init_weight_lower': -0.125, 'init_weight_upper': 0.125,
            'init_bias_lower': 0.0, 'init_bias_upper': 0.0
        }
        for _ in range(9)
    ]
    + [
        {
            'n_nodes': 10, 'activation': tf.identity, 'init_weight_lower': -0.125, 'init_weight_upper': 0.125,
            'init_bias_lower': 0.0, 'init_bias_upper': 0.0
        }
    ]
)

In [103]:
# Loss, loss-gradient (indices 0 and 10), weight (indices 0, 4, 8) and activation (indices 0, 8) monitors.
# NOTE(review): the arguments are presumably (sampling interval, layer index); with 11 layer dicts in lc,
# index 10 would be the output layer — confirm against learning_lib.
monitors = [
    LossMonitor(200), LossGradientMonitor(200, 0), LossGradientMonitor(200, 10), WeightMonitor(200, 0), WeightMonitor(200, 4),
    WeightMonitor(200, 8), ActivationMonitor(200, 0), ActivationMonitor(200, 8)
]

In [104]:
# NOTE(review): this notebook defines cross_entropy_with_softmax several times with the same body;
# a single definition near the top would suffice.
def cross_entropy_with_softmax(model_output, true_output):
    """Return the summed softmax cross-entropy between logits and labels.

    Args:
        model_output: Unnormalised network outputs, passed to TensorFlow as ``logits``.
        true_output: Target distribution (one-hot labels in this notebook), passed as ``labels``.

    Returns:
        A scalar tensor: the per-example cross-entropy summed (not averaged) over the batch.
    """
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=true_output, logits=model_output
    )
    return tf.reduce_sum(per_example_loss)

In [105]:
# Build the 10-hidden-layer ReLU network with plain gradient descent at learning rate 0.001.
# NOTE(review): pipe_out is the same iterator output used by the other networks in this notebook;
# whether each network sees a fresh pass over the data depends on how FFNN manages sessions — confirm.
network_relu_10 = FFNN(
    lc, loss_func=cross_entropy_with_softmax, monitors=monitors, optimizer=tf.train.GradientDescentOptimizer(0.001),
    input_vector=pipe_out[0], train_targets_vector=pipe_out[1]
)

In [106]:
%%time
# Train from the pipeline tensors supplied at construction.
# NOTE(review): presumably runs until the one-shot iterator is exhausted — confirm in FFNN.train_online.
network_relu_10.train_online()

CPU times: user 6min 40s, sys: 1min 46s, total: 8min 26s
Wall time: 5min 43s


In [107]:
# Loss curve drawn by hand from the first monitor's recorded values:
# x is its 'epochs' series, y is its 'loss' series.
plotly.iplot([
    go.Scatter(
        x=network_relu_10.monitors[0].values['epochs'],
        y=network_relu_10.monitors[0].values['loss']
    )
])

In [108]:
# Training-set accuracy: the fraction of rows whose arg-max prediction equals the arg-max of the one-hot label.
train_pred = network_relu_10.predict(train_in)
np.mean(train_pred.argmax(axis=1) == train_labels.argmax(axis=1))

0.9954363636363637

In [109]:
# Heat map of the absolute 'bias' values recorded by monitors[1] (the index-0 LossGradientMonitor
# in the monitors list), transposed before plotting.
# NOTE(review): presumably one axis is bias units and the other is monitor samples — confirm.
plotly.iplot([
    go.Heatmap(
        z=np.abs(network_relu_10.monitors[1].values['bias']).transpose(),
        colorscale=[
            [0.0, 'rgb(165,0,38)'],
            [0.1111111111111111, 'rgb(215,48,39)'],
            [0.2222222222222222, 'rgb(244,109,67)'],
            [0.3333333333333333, 'rgb(253,174,97)'],
            [0.4444444444444444, 'rgb(254,224,144)'],
            [0.5555555555555556, 'rgb(224,243,248)'],
            [0.6666666666666666, 'rgb(171,217,233)'],
            [0.7777777777777778, 'rgb(116,173,209)'],
            [0.8888888888888888, 'rgb(69,117,180)'],
            [1.0, 'rgb(49,54,149)']
        ]
    )
])

In [110]:
# Heat map of the absolute 'bias' values recorded by monitors[2] (the index-10 LossGradientMonitor
# in the monitors list), transposed before plotting.
# NOTE(review): presumably one axis is bias units and the other is monitor samples — confirm.
plotly.iplot([
    go.Heatmap(
        z=np.abs(network_relu_10.monitors[2].values['bias']).transpose(),
        colorscale=[
            [0.0, 'rgb(165,0,38)'],
            [0.1111111111111111, 'rgb(215,48,39)'],
            [0.2222222222222222, 'rgb(244,109,67)'],
            [0.3333333333333333, 'rgb(253,174,97)'],
            [0.4444444444444444, 'rgb(254,224,144)'],
            [0.5555555555555556, 'rgb(224,243,248)'],
            [0.6666666666666666, 'rgb(171,217,233)'],
            [0.7777777777777778, 'rgb(116,173,209)'],
            [0.8888888888888888, 'rgb(69,117,180)'],
            [1.0, 'rgb(49,54,149)']
        ]
    )
])