In [1]:
from learning_lib.nn.ffnn import FFNN
from learning_lib.nn.monitoring.loss import LossMonitor

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import plotly.offline as plotly
import plotly.graph_objs as go

  from ._conv import register_converters as _register_converters


In [2]:
# Enable inline plotly rendering for the `plotly.iplot` calls further down.
# connected=True asks plotly to load plotly.js from its CDN instead of embedding the
# library in the notebook (per the plotly.offline documentation).
plotly.init_notebook_mode(connected=True)

# Data

In [3]:
# Download (if needed) and load MNIST with one-hot encoded labels.
dataset = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Take the full training split directly from the DataSet arrays. The previous
# `dataset.train.next_batch(1000000)` asked for far more rows than the split holds;
# in the TF 1.x tutorial DataSet such a request wraps into a second shuffled epoch
# and hands back duplicated examples (older releases reject it with an assertion),
# so it never yielded one clean pass over the data.
# `train_data` is kept as an (images, labels) tuple so any cell that still indexes
# it with [0] / [1] keeps working.
train_data = (dataset.train.images, dataset.train.labels)

# Centre the pixel values on the global mean of the training images.
train_in = train_data[0] - train_data[0].mean()
train_labels = train_data[1]

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


# Problem Statement

Theoretically, a neural network trained with mean squared error (MSE) loss should be able to fit a standard classification problem. However, many classification models use cross-entropy loss with softmax instead. Here we investigate the hypothesis that the latter exhibits much stronger optimization properties, and why this might be.

# MSE Loss

In [30]:
# Layer configuration for the MSE experiment.
# The leading 784 is presumably the input width (28x28 MNIST pixels) — confirm
# against FFNN. Each dict describes one layer; the init_* keys look like lower/upper
# bounds for the initial weights and biases — confirm against FFNN.
lc = [
    784,
    # Hidden layer: 2048 tanh units.
    {
        'n_nodes': 2048,
        'activation': tf.nn.tanh,
        'init_weight_lower': -1,
        'init_weight_upper': 1,
        'init_bias_lower': 0,
        'init_bias_upper': 1,
    },
    # Output layer: 10 units with softmax applied as the layer activation.
    {
        'n_nodes': 10,
        'activation': tf.nn.softmax,
        'init_weight_lower': -1,
        'init_weight_upper': 1,
        'init_bias_lower': 0,
        'init_bias_upper': 1,
    },
]

In [13]:
# Monitors handed to the MSE network when it is constructed.
# NOTE(review): the 100 is presumably the sampling interval for the loss — confirm
# against LossMonitor.
monitors = [LossMonitor(100)]

In [None]:
# Build the MSE network from the layer config and monitors defined above.
# No loss_func is passed, so FFNN's default loss applies — presumably mean squared
# error, as the section heading implies; confirm against FFNN.
# NOTE(review): this cell has no execution count, so the trained network used below
# may have been created by an earlier version of this cell.
network_mse_softmax = FFNN(lc, monitors=monitors)

In [51]:
%%time
# Train the MSE network with plain gradient descent (learning rate 0.01) on
# mini-batches of 100, for ten times the 4400 epochs given to the cross-entropy run.
# NOTE(review): "epochs" may count mini-batch steps rather than full passes over the
# data — confirm against FFNN.train.
# NOTE(review): the recorded output reaches epoch 79100, beyond the 44000 requested
# here, which suggests this cell was run more than once on the same network; the
# recorded accuracy is therefore not reproducible from a single Run All.
network_mse_softmax.train(
    train_in,
    train_labels,
    epochs=4400*10,
    batch_size=100,
    report_interval=100,  # the recorded log shows one "Reached epoch" line per 100 epochs
    optimizer=tf.train.GradientDescentOptimizer(0.01)
)

Reached epoch 35200
Reached epoch 35300
Reached epoch 35400
Reached epoch 35500
Reached epoch 35600
Reached epoch 35700
Reached epoch 35800
Reached epoch 35900
Reached epoch 36000
Reached epoch 36100
Reached epoch 36200
Reached epoch 36300
Reached epoch 36400
Reached epoch 36500
Reached epoch 36600
Reached epoch 36700
Reached epoch 36800
Reached epoch 36900
Reached epoch 37000
Reached epoch 37100
Reached epoch 37200
Reached epoch 37300
Reached epoch 37400
Reached epoch 37500
Reached epoch 37600
Reached epoch 37700
Reached epoch 37800
Reached epoch 37900
Reached epoch 38000
Reached epoch 38100
Reached epoch 38200
Reached epoch 38300
Reached epoch 38400
Reached epoch 38500
Reached epoch 38600
Reached epoch 38700
Reached epoch 38800
Reached epoch 38900
Reached epoch 39000
Reached epoch 39100
Reached epoch 39200
Reached epoch 39300
Reached epoch 39400
Reached epoch 39500
Reached epoch 39600
Reached epoch 39700
Reached epoch 39800
Reached epoch 39900
Reached epoch 40000
Reached epoch 40100


Reached epoch 76200
Reached epoch 76300
Reached epoch 76400
Reached epoch 76500
Reached epoch 76600
Reached epoch 76700
Reached epoch 76800
Reached epoch 76900
Reached epoch 77000
Reached epoch 77100
Reached epoch 77200
Reached epoch 77300
Reached epoch 77400
Reached epoch 77500
Reached epoch 77600
Reached epoch 77700
Reached epoch 77800
Reached epoch 77900
Reached epoch 78000
Reached epoch 78100
Reached epoch 78200
Reached epoch 78300
Reached epoch 78400
Reached epoch 78500
Reached epoch 78600
Reached epoch 78700
Reached epoch 78800
Reached epoch 78900
Reached epoch 79000
Reached epoch 79100
CPU times: user 1h 3min 19s, sys: 1h 21min 51s, total: 2h 25min 10s
Wall time: 2h 11min 52s


# Cross Entropy Loss

In [13]:
def cross_entropy_with_softmax(model_output, true_output):
    """Summed softmax cross-entropy between raw network outputs and target labels.

    Args:
        model_output: Un-normalised network outputs, passed to TensorFlow as
            ``logits``; the softmax is applied inside the TensorFlow op.
        true_output: Target distributions, passed to TensorFlow as ``labels``.

    Returns:
        A scalar tensor holding the cross-entropy values added together
        (a sum over all elements, not a mean).
    """
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=model_output,
        labels=true_output,
    )
    return tf.reduce_sum(per_example_loss)

In [14]:
# Layer configuration for the cross-entropy experiment. This rebinds `lc`, replacing
# the MSE configuration defined earlier in the notebook.
# Same shape as the MSE network, except that the output layer uses an identity
# activation so it emits raw values for the softmax cross-entropy loss.
# The init_* keys look like lower/upper bounds for the initial weights and biases —
# confirm against FFNN.
lc = [
    784,
    # Hidden layer: 2048 tanh units.
    {
        'n_nodes': 2048,
        'activation': tf.nn.tanh,
        'init_weight_lower': -1,
        'init_weight_upper': 1,
        'init_bias_lower': 0,
        'init_bias_upper': 1,
    },
    # Output layer: 10 units, identity activation.
    {
        'n_nodes': 10,
        'activation': tf.identity,
        'init_weight_lower': -1,
        'init_weight_upper': 1,
        'init_bias_lower': 0,
        'init_bias_upper': 1,
    },
]

In [15]:
# Fresh monitor list for the cross-entropy network; this rebinds `monitors`, so the
# two networks do not share a LossMonitor instance when the notebook runs top to bottom.
# NOTE(review): the 100 is presumably the sampling interval for the loss — confirm
# against LossMonitor.
monitors = [LossMonitor(100)]

In [16]:
# Cross-entropy network: the layers emit raw values, the loss applies softmax
# internally, and softmax is passed separately as post_proc_function
# (presumably applied to the outputs at evaluation time — confirm against FFNN).
network_xe = FFNN(
    lc,
    monitors=monitors,
    loss_func=cross_entropy_with_softmax,
    post_proc_function=tf.nn.softmax,
)

In [17]:
%%time
# Train the cross-entropy network with plain gradient descent (learning rate 0.01)
# on mini-batches of 100 for 4400 epochs — one tenth of the MSE run's budget.
# The leftover debugging `print("Test")` was removed; it carried no information.
# NOTE(review): "epochs" may count mini-batch steps rather than full passes over the
# data — confirm against FFNN.train.
network_xe.train(
    train_in,
    train_labels,
    epochs=4400,
    batch_size=100,
    optimizer=tf.train.GradientDescentOptimizer(0.01)
)

Test
Reached epoch 4399
CPU times: user 2min 38s, sys: 6min 51s, total: 9min 29s
Wall time: 9min 6s


# Comparison

In [18]:
# Training-set accuracy of the cross-entropy network: the fraction of rows whose
# highest-scoring output index matches the index of the one-hot label.
train_pred = network_xe.evaluate(train_in)
(np.argmax(train_pred, axis=1) == np.argmax(train_labels, axis=1)).sum() / len(train_in)

0.9982181818181818

In [19]:
# Loss curve of the cross-entropy network, read from its first monitor
# (presumably the LossMonitor passed at construction — confirm FFNN keeps the order).
plotly.iplot([
    go.Scatter(
        x=network_xe.monitors[0].values['epochs'],  # horizontal axis: recorded epochs
        y=network_xe.monitors[0].values['loss']  # vertical axis: recorded loss values
    )
])

In [53]:
# Training-set accuracy of the MSE network, computed the same way as for the
# cross-entropy network: fraction of rows whose highest-scoring output index matches
# the index of the one-hot label. Note that this rebinds `train_pred`.
train_pred = network_mse_softmax.evaluate(train_in)
(np.argmax(train_pred, axis=1) == np.argmax(train_labels, axis=1)).sum() / len(train_in)

0.5658181818181818

In [54]:
# Loss curve of the MSE network, read from its first monitor in the same way as the
# cross-entropy plot, so both curves come from the same kind of source. The network
# is constructed with `monitors=` just like `network_xe`; the previous version read
# `network_mse_softmax.learning_curve` through two throwaway DataFrames instead.
plotly.iplot([
    go.Scatter(
        x=network_mse_softmax.monitors[0].values['epochs'],  # horizontal axis: recorded epochs
        y=network_mse_softmax.monitors[0].values['loss']  # vertical axis: recorded loss values
    )
])

# Conclusion

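These two examples show that, all else being equal, softmax with cross-entropy loss converges at a much faster rate than softmax with MSE loss. Both methods appear able to fit the data given sufficient time, but convergence with MSE loss was significantly slower: in the runs recorded above, the cross-entropy network reached about 99.8% training accuracy after 4,400 epochs, while the MSE network was still at about 56.6% after far more epochs.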