# Autoencoder Test for Saddle-Free Optimizer

Copyright 2018 Dave Fernandes. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0
  
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

## Description
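This example trains an autoencoder on MNIST data using either a first-order optimizer or the Saddle-Free (SF) method. As written, the first-order baseline is Adam; SGD with momentum is available as a commented-out alternative in the training cell. You can train with the first-order optimizer first for 2000 epochs until it plateaus, and then use the SF method to further train the model.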


In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import time
from SFOptimizer import SFOptimizer
from SFOptimizer import SFDamping
from mnist import dataset

import os
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
# that occurs when several native libraries each bundle their own OpenMP --
# confirm this is still required in the target environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [2]:
# Module-level registry of model variables: logistic_layer appends every W and b
# it creates, and the list is later handed to tf.train.Saver and SFOptimizer.
var_list = []

def logistic_layer(layer_name, input_layer, hidden_units, n_random):
    """Create one fully connected sigmoid layer and register its variables.

    Args:
        layer_name: String suffix used in the variable names and name scope.
        input_layer: Tensor feeding the layer; its second dimension is taken
            as the number of input features.
        hidden_units: Number of output units of the layer.
        n_random: Number of non-zero initial weights per output unit.

    Returns:
        A tuple (W, b, y): the weight variable, the bias variable and the
        sigmoid activation tensor.

    Side effects:
        Appends W and b to the module-level ``var_list``.
    """
    fan_in = input_layer.shape[1]

    # Sparse initialization as per Martens (2010): each output unit starts
    # with n_random standard-normal weights at random input positions and
    # zeros everywhere else.
    weights_init = np.zeros((fan_in, hidden_units))
    for unit in range(hidden_units):
        unit_weights = np.zeros((fan_in, 1))
        unit_weights[:n_random, :] += np.random.randn(n_random, 1)
        np.random.shuffle(unit_weights)
        weights_init[:, unit:unit + 1] = unit_weights

    with tf.name_scope('layer_' + layer_name):
        W = tf.get_variable(
            'W_' + layer_name,
            initializer=tf.convert_to_tensor(weights_init, dtype=tf.float64),
            use_resource=True,
        )
        b = tf.get_variable(
            'b_' + layer_name,
            [hidden_units],
            initializer=tf.zeros_initializer(),
            dtype=tf.float64,
            use_resource=True,
        )
        pre_activation = tf.matmul(input_layer, W) + b
        y = tf.sigmoid(pre_activation)

    var_list.extend([W, b])
    return W, b, y

In [3]:
""" Deep autoencoder network from Hinton & Salakhutdinov (2006) """
def AE_model():
    """Build the autoencoder graph together with its loss and error tensors.

    The encoder maps 784 -> 1000 -> 500 -> 250 -> 30 and the decoder mirrors
    it back to 784. All layers are sigmoid layers except the 30-unit code
    layer, which is linear.

    Returns:
        A tuple (x, loss, error):
            x: float64 input placeholder of shape (None, 784).
            loss: mean sigmoid cross-entropy between x and the output logits.
            error: batch mean of the per-example summed squared difference
                between x and the sigmoid reconstruction.
    """
    n_inputs = 28*28
    n_hidden1 = 1000
    n_hidden2 = 500
    n_hidden3 = 250
    n_hidden4 = 30

    x = tf.placeholder(tf.float64, shape=(None, n_inputs), name='input')

    with tf.name_scope('dnn'):
        _, _, y1 = logistic_layer('1', x, n_hidden1, 15)
        _, _, y2 = logistic_layer('2', y1, n_hidden2, 15)
        _, _, y3 = logistic_layer('3', y2, n_hidden3, 15)

        # The code layer is linear: reuse the layer's variables but skip the
        # sigmoid output that logistic_layer produces.
        W4, b4, _ = logistic_layer('4', y3, n_hidden4, 15)
        y4 = tf.matmul(y3, W4) + b4

        _, _, y5 = logistic_layer('5', y4, n_hidden3, 15)
        _, _, y6 = logistic_layer('6', y5, n_hidden2, 15)
        _, _, y7 = logistic_layer('7', y6, n_hidden1, 15)

        # y_out is the sigmoid reconstruction; the pre-activation logits are
        # rebuilt separately because the cross-entropy op expects logits.
        W8, b8, y_out = logistic_layer('8', y7, n_inputs, 15)
        y_logits = tf.matmul(y7, W8) + b8

    # No Saver is created here: the previous local one was never used or
    # returned and only added redundant save/restore ops to the graph.

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=y_logits)
        loss = tf.reduce_mean(cross_entropy, name='loss')
        error = tf.reduce_mean(tf.reduce_sum(tf.squared_difference(x, y_out), axis=1))

    return x, loss, error

In [4]:
# Checkpoint path prefix (<cwd>/data/ae_weights) that MNIST_AE_test passes to
# saver.save and saver.restore.
model_filepath = os.path.join(os.getcwd(), 'data', 'ae_weights')

def MNIST_AE_test(use_SF, start_from_previous_run):
    """Train the autoencoder on MNIST and return the recorded loss history.

    Args:
        use_SF: If true, train with SFOptimizer (5 epochs, batch size 2000,
            progress printed for every batch). Otherwise train with Adam
            (2000 epochs, batch size 200, progress printed every 100 batches).
        start_from_previous_run: If true, restore the model variables from
            ``model_filepath`` before training.

    Returns:
        A tuple (history, optimizer_name): ``history`` holds one loss value
        per processed batch and ``optimizer_name`` is ``optimizer.get_name()``.

    Side effects:
        Reads MNIST from ./data/, prints progress, and saves a checkpoint to
        ``model_filepath`` after every epoch.
    """
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets("./data/")

    x, loss, error = AE_model()
    saver = tf.train.Saver(var_list)

    if use_SF:
        max_epochs = 5
        batch_size = 2000
        print_interval = 1

        # See SFOptimizer.py for options
        optimizer = SFOptimizer(var_list, krylov_dimension=100, damping_type=SFDamping.marquardt, dtype=tf.float64)
    else:
        max_epochs = 2000
        batch_size = 200
        print_interval = 100

        # See Sutskever et al. (2013)
        #optimizer = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.99)

        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss)

    # The number of batches per epoch does not change between epochs.
    n_batches = mnist.train.num_examples // batch_size

    print('Initializing...')
    # The context manager guarantees the session is closed, even on error.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if start_from_previous_run:
            saver.restore(sess, model_filepath)

        print('Constructing graph...')
        if use_SF:
            # NOTE(review): these ops are built after the global initializer
            # has run; the original ordering is kept on purpose -- confirm
            # against SFOptimizer.py before moving them.
            big_train_op = optimizer.minimize(loss)
            little_train_op = optimizer.fixed_subspace_step()
            update_op = optimizer.update()
            reset_op = optimizer.reset_lambda()

            # One step that computes the Krylov subspace, followed by 5 more
            # steps that reuse it without recomputation.
            sf_step_ops = [big_train_op] + [little_train_op] * 5

        history = []
        t0 = time.perf_counter()

        print("Training...")
        for epoch in range(max_epochs):
            for iteration in range(n_batches):
                verbose = iteration % print_interval == 0
                if verbose:
                    print('-- Epoch:', epoch + 1, ' Batch:', iteration + 1, '/', n_batches, '--')

                # The autoencoder reconstructs its input, so labels are unused.
                x_batch, _ = mnist.train.next_batch(batch_size)
                feed_dict = {x: x_batch}

                if use_SF:
                    # Reset the damping parameter
                    sess.run(reset_op)

                    for step_op in sf_step_ops:
                        initial_loss, initial_lambda, _ = sess.run([loss, optimizer.lambda_damp, step_op], feed_dict=feed_dict)
                        final_loss, rho, _ = sess.run([loss, optimizer.rho, update_op], feed_dict=feed_dict)

                        if verbose:
                            print('    Loss_i:', initial_loss, 'Loss_f:', final_loss, 'rho', rho, 'lambda:', initial_lambda)
                else:
                    # Take a gradient descent step, then evaluate the loss on
                    # the same batch.
                    sess.run(train_op, feed_dict=feed_dict)
                    initial_loss = sess.run(loss, feed_dict=feed_dict)

                    if verbose:
                        print('    Loss:', initial_loss)

                # For SF this is the pre-step loss of the last sub-step.
                history += [initial_loss]

                if verbose:
                    error_train = sess.run(error, feed_dict=feed_dict)
                    print('    Train error:', error_train)

            # End-of-epoch report: train error on the last batch, test error
            # on the full test set.
            error_train = sess.run(error, feed_dict=feed_dict)
            error_test = sess.run(error, feed_dict={x: mnist.test.images})

            t1 = time.perf_counter()
            dt = t1 - t0
            t0 = t1

            print('\n*** Epoch:', epoch + 1, 'Train error:', error_train, ' Test error:', error_test, ' Time:', dt, 'sec\n')
            saver.save(sess, model_filepath)

    return history, optimizer.get_name()

First train with `use_SF = False` and `start_from_previous_run = False`.
Then train with `use_SF = True` and `start_from_previous_run = True` to use the SF method.

In [None]:
# Run the training stage and collect one loss value per batch.
history, opt_name = MNIST_AE_test(
    use_SF=False,
    start_from_previous_run=False,
)

# Plot the recorded loss against the step index on a logarithmic y-axis.
plt.plot(history)
plt.title(opt_name + ' Optimizer')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.yscale('log')
plt.show()

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ./data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting ./data/train-labels-idx1-ubyte.gz
Extracting ./data/t10k-images-idx3-ubyte.gz
Extracting ./data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Initializing...
Constructing graph...
Training...
-- Epoch: 1  Batch: 1 / 275 --
    Loss: 0.9177406902456087
    Train error: 216.58256535214582
-- Epoch: 1  Batch: 101 / 275 --
    Loss: 0.21997505451413732
    Train error: 40.219384555975346
-- Epoch: 1  Batch: 201 / 275 --
    Loss: 0.16507290147817644
    Train error: 26.082189695190873

*** Epoch: 1 Train error: 20.7765