# Second-Order Methods in TensorFlow - Part 3 - Sandbox

Now let's implement a second-order method on a neural network.

Again, we implement a version of Newton's method that's designed to find critical points, which need not be minima for non-convex functions.

In [None]:
import tensorflow as tf
# import matplotlib.pyplot as plt
import numpy as np

from collections import namedtuple

In [None]:
# Bundle returned by make_neural_network: the tf.Graph, a dictionary mapping
# names to the graph's tensors/ops, and the hyperparameters it was built from.
NeuralNetwork = namedtuple(
    "NeuralNetwork",
    "graph graph_dictionary hyperparameter_dictionary",
)

In [None]:
# Session configuration: allocate GPU memory on demand (allow_growth) and
# limit this process to a 0.4 fraction of the device's memory.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(allow_growth=True,
                              per_process_gpu_memory_fraction=0.4))

The tricky part here is specifying the parameters: in order to calculate a Hessian, we need the parameters to be inside the same variable, but TensorFlow is not designed with the expectation that all of our weights need to be initialized at once. In addition, this makes keeping abstraction barriers up more difficult.

In [None]:
def make_neural_network(hyperparameter_dictionary):
    """Build the TensorFlow graph of a fully-connected softmax classifier.

    Every weight and bias is a slice of one flat variable, so that the
    gradient and the Hessian of the cost can be taken with respect to a
    single tensor.

    Args:
        hyperparameter_dictionary: dict read here for "input_size",
            "output_size", "learning_rate" and "newton_rate"; the helper
            functions called from here read further keys from the same
            dict. The key "num_parameters" is written into it as a side
            effect.

    Returns:
        A NeuralNetwork tuple of (graph, graph_dictionary,
        hyperparameter_dictionary), where graph_dictionary maps names to
        the placeholders, tensors and training ops of the graph.
    """
    graph = tf.Graph()

    with graph.as_default():
        n_inputs = hyperparameter_dictionary["input_size"]
        n_outputs = hyperparameter_dictionary["output_size"]

        num_parameters = calculate_num_parameters(hyperparameter_dictionary)
        hyperparameter_dictionary["num_parameters"] = num_parameters

        # The flat parameter variable takes its initial value from a
        # placeholder, so the caller feeds the starting point when running
        # the initializer.
        parameters_placeholder = tf.placeholder(
            tf.float32, shape=[num_parameters], name="initial_parameters")
        flat_parameters = tf.Variable(
            initial_value=parameters_placeholder, name="parameters_variable")

        weight_matrices, bias_vectors = make_weights_and_biases(
            flat_parameters, hyperparameter_dictionary)

        network_input = tf.placeholder(tf.float32, shape=[None, n_inputs])
        logits = build_by_layer(
            network_input, weight_matrices, bias_vectors,
            hyperparameter_dictionary)

        labels = tf.placeholder(tf.float32, shape=[None, n_outputs])
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)
        cost = tf.reduce_mean(per_example_loss)

        network_predictions = tf.nn.softmax(logits, name="network_predictions")
        predicted_class = tf.argmax(network_predictions, 1)
        true_class = tf.argmax(labels, 1)
        prediction_correct = tf.equal(predicted_class, true_class)
        accuracy = tf.reduce_mean(tf.cast(prediction_correct, tf.float32))

        with tf.variable_scope("grads_and_hess"):
            gradients = tf.gradients(cost, flat_parameters, name="gradients")

            # tf.hessians returns a list; squeeze drops the list dimension.
            hessian_list = tf.hessians(
                cost, flat_parameters, name="hessians_output")
            hessian_matrix = tf.squeeze(hessian_list, name="hessian_matrix")

            inverse_hessian = invert_hessian(
                hessian_matrix, num_parameters, hyperparameter_dictionary)

            # Plain gradient descent on the cost.
            gradient_descent = tf.train.GradientDescentOptimizer(
                hyperparameter_dictionary["learning_rate"])
            step_gradient_descent = gradient_descent.minimize(cost)

            # Newton step: a gradient-descent optimizer whose gradient is
            # first multiplied by the inverse Hessian.
            newton_base = tf.train.GradientDescentOptimizer(
                hyperparameter_dictionary["newton_rate"])
            gd_grads_and_vars = newton_base.compute_gradients(
                cost, flat_parameters)
            step_newton = add_step_newton(
                newton_base, gd_grads_and_vars, inverse_hessian)

    graph_dictionary = {
        "parameters_placeholder": parameters_placeholder,
        "parameters": flat_parameters,
        "input": network_input,
        "weight_matrices": weight_matrices,
        "bias_vectors": bias_vectors,
        "labels": labels,
        "cost": cost,
        "accuracy": accuracy,
        "gradients": gradients,
        "hessian": hessian_matrix,
        "step_gradient_descent": step_gradient_descent,
        "step_newton": step_newton,
    }

    return NeuralNetwork(graph, graph_dictionary, hyperparameter_dictionary)

def add_step_newton(gradient_descent, gd_grads_and_vars, inverse_hessian):
    """Create the op that applies an inverse-Hessian-scaled gradient step.

    Args:
        gradient_descent: optimizer whose apply_gradients performs the update.
        gd_grads_and_vars: list of (gradient, variable) pairs; only the first
            pair is used.
        inverse_hessian: matrix that left-multiplies the gradient.

    Returns:
        The op returned by gradient_descent.apply_gradients.
    """
    raw_gradient, variable = gd_grads_and_vars[0]

    # matmul needs rank-2 operands, so treat the gradient as a column vector.
    gradient_column = tf.expand_dims(raw_gradient, axis=1,
                                     name="gradient_vector")
    scaled_column = tf.matmul(inverse_hessian, gradient_column,
                              name="newton_gradient_vector")
    scaled_gradient = tf.squeeze(scaled_column)

    return gradient_descent.apply_gradients([(scaled_gradient, variable)])

def invert_hessian(hessian, num_parameters, hyperparameter_dictionary):
    """Build a tensor holding a regularized inverse of the Hessian.

    Args:
        hessian: square matrix tensor with num_parameters rows and columns.
        num_parameters: number of rows/columns of hessian.
        hyperparameter_dictionary: dict providing "inverse_method" and the
            setting for that method:
              * "fudged" reads "fudge_factor" and inverts
                hessian + fudge_factor * identity.
              * "pseudo" reads "minimum_eigenvalue_magnitude" and builds a
                pseudo-inverse from the eigendecomposition, inverting only
                eigenvalues whose magnitude is at least that threshold.

    Returns:
        The (pseudo-)inverse, as a tensor named "inverse_hessian".

    Raises:
        NotImplementedError: if "inverse_method" is neither "fudged" nor
            "pseudo".
    """
    method = hyperparameter_dictionary["inverse_method"]

    if method == "fudged":
        fudging_vector = tf.constant(
            [hyperparameter_dictionary["fudge_factor"]] * num_parameters,
            dtype=tf.float32, name="fudging_vector")

        fudged_hessian = tf.add(tf.diag(fudging_vector), hessian,
                                name="fudged_hessian")

        inverse_hessian = tf.matrix_inverse(fudged_hessian,
                                            name="inverse_hessian")

    elif method == "pseudo":
        eigenvalues, eigenvectors = tf.self_adjoint_eig(hessian)

        threshold = hyperparameter_dictionary["minimum_eigenvalue_magnitude"]

        # Select by magnitude, eigenvalue by eigenvalue. The eigenvalues come
        # back sorted by signed value, so for an indefinite Hessian the
        # large-magnitude negative ones sit at the front; keeping "the last
        # k" would drop them and keep near-zero ones instead (and would keep
        # everything when k == 0).
        is_significant = tf.greater_equal(tf.abs(eigenvalues), threshold)

        # Replace rejected eigenvalues by 1 before dividing so that no
        # inf/nan is ever produced, then zero their reciprocals.
        safe_eigenvalues = tf.where(is_significant, eigenvalues,
                                    tf.ones_like(eigenvalues))
        inverse_eigenvalues = tf.where(is_significant, 1. / safe_eigenvalues,
                                       tf.zeros_like(eigenvalues))

        # V diag(1/lambda) V^T: scale eigenvector columns, then multiply by
        # the transpose.
        inverse_hessian = tf.matmul(
            eigenvectors,
            tf.expand_dims(inverse_eigenvalues, axis=0) * eigenvectors,
            transpose_b=True, name="inverse_hessian")

    else:
        raise NotImplementedError("no inverse hessian method for {0}".format(method))

    return inverse_hessian

def calculate_num_parameters(hyperparameter_dictionary):
    """Count the weights and biases of the fully-connected network.

    Args:
        hyperparameter_dictionary: dict with "layer_sizes" (list of hidden
            layer widths), "input_size" and "output_size".

    Returns:
        Total number of scalar parameters: one weight per connection between
        consecutive layers plus one bias per non-input unit.
    """
    hidden_sizes = hyperparameter_dictionary["layer_sizes"][:]
    all_sizes = ([hyperparameter_dictionary["input_size"]]
                 + hidden_sizes
                 + [hyperparameter_dictionary["output_size"]])

    fan_ins = np.asarray(all_sizes[:-1])
    fan_outs = np.asarray(all_sizes[1:])

    weight_count = (fan_outs * fan_ins).sum()
    bias_count = fan_outs.sum()

    return weight_count + bias_count

def make_weights_and_biases(parameters, hyperparameter_dictionary):
    """Carve the flat parameter tensor into weight matrices and bias vectors.

    Args:
        parameters: flat tensor holding every parameter of the network.
        hyperparameter_dictionary: dict with "layer_sizes", "input_size" and
            "output_size"; it is also handed on to make_weights and
            make_biases.

    Returns:
        Tuple (weight_matrices, bias_vectors) of lists, as returned by
        make_weights and make_biases.
    """
    hidden_sizes = hyperparameter_dictionary["layer_sizes"][:]
    first_layer = [hyperparameter_dictionary["input_size"]]
    last_layer = [hyperparameter_dictionary["output_size"]]
    all_layer_sizes = first_layer + hidden_sizes + last_layer

    return (make_weights(parameters, all_layer_sizes, hyperparameter_dictionary),
            make_biases(parameters, all_layer_sizes, hyperparameter_dictionary))

def make_weights(parameters, layer_sizes, hyperparameter_dictionary):
    """Slice the weight matrices out of the front of the flat parameters.

    Args:
        parameters: flat tensor holding every parameter of the network.
        layer_sizes: widths of all layers, input and output included.
        hyperparameter_dictionary: not used by this function.

    Returns:
        List with one matrix per pair of consecutive layers, shaped
        (size of earlier layer, size of later layer), taken from consecutive
        runs of parameters starting at index 0.
    """
    offset = 0
    matrices = []

    with tf.variable_scope("weights"):
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            count = fan_in * fan_out

            flat_weights = tf.slice(parameters, [offset], [count],
                                    name="sliced")
            matrices.append(tf.reshape(flat_weights, (fan_in, fan_out),
                                       name="reshaped"))

            offset += count

    return matrices

def make_biases(parameters, layer_sizes, hyperparameter_dictionary):
    """Slice the bias vectors out of the tail of the flat parameters.

    The flat parameter tensor is laid out as all weights followed by all
    biases, so the first bias sits at index total_weights.

    Args:
        parameters: flat tensor holding every parameter of the network.
        layer_sizes: widths of all layers, input and output included.
        hyperparameter_dictionary: dict that must already contain
            "num_parameters". "total_weights" and "total_biases" are written
            into it as a side effect.

    Returns:
        List with one bias vector per non-input layer.
    """
    bias_shapes = layer_sizes[1:]
    total_biases = np.sum(bias_shapes)
    total_weights = hyperparameter_dictionary["num_parameters"] - total_biases
    hyperparameter_dictionary["total_weights"] = total_weights
    hyperparameter_dictionary["total_biases"] = total_biases

    # Biases start right after the last weight. Starting any earlier would
    # make them alias weight entries and leave the final total_biases
    # parameters unused, which puts all-zero rows/columns in the Hessian.
    starting_index = total_weights
    bias_vectors = []

    with tf.variable_scope("biases"):

        for num_biases in bias_shapes:
            bias_vector = tf.slice(parameters, [starting_index], [num_biases],
                                   name="sliced")

            bias_vectors.append(bias_vector)

            starting_index += num_biases

    return bias_vectors

def build_by_layer(input, weight_matrices, bias_vectors, hyperparameter_dictionary):
    """Chain the layers of the network from input to output.

    Every weight/bias pair except the last goes through build_layer; the
    last pair goes through build_output_layer.

    Args:
        input: tensor fed to the first layer.
        weight_matrices: list of weight matrices, one per layer.
        bias_vectors: list of bias vectors, one per layer.
        hyperparameter_dictionary: passed on to the layer builders.

    Returns:
        The tensor produced by the output layer.
    """
    activations = input
    hidden_parameters = zip(weight_matrices[:-1], bias_vectors[:-1])

    for layer_weights, layer_biases in hidden_parameters:
        activations = build_layer(activations, layer_weights, layer_biases,
                                  hyperparameter_dictionary)

    return build_output_layer(activations, weight_matrices[-1], bias_vectors[-1],
                              hyperparameter_dictionary)

def build_layer(current_output, weight_matrix, bias_vector, hyperparameter_dictionary):
    """Apply one internal layer: nonlinearity(current_output @ W + b).

    The nonlinearity is the callable stored under "nonlinearity" in
    hyperparameter_dictionary.
    """
    with tf.variable_scope("internal_layers"):
        activation_fn = hyperparameter_dictionary["nonlinearity"]
        pre_activation = tf.add(tf.matmul(current_output, weight_matrix),
                                bias_vector)
        return activation_fn(pre_activation)

def build_output_layer(current_output, weight_matrix, bias_vector, hyperparameter_dictionary):
    """Apply the final affine layer, current_output @ W + b, with no nonlinearity.

    hyperparameter_dictionary is accepted for symmetry with build_layer but
    is not read.
    """
    with tf.variable_scope("output_layer"):
        projected = tf.matmul(current_output, weight_matrix)
        return tf.add(projected, bias_vector)

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
# Load MNIST using the 'MNIST_data' directory. one_hot=True requests labels
# as one-hot vectors, which is what the network's [None, output_size] labels
# placeholder and its argmax-based accuracy expect.
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

The biggest practical issue is handling the non-invertibility of the Hessian and the related numerical issues in the Newton method. It seems to require fairly careful tuning of the step size; otherwise performance can drop and/or the (raw) gradients can blow up.

TODO: look into trust region methods more closely for intuition about how to set the step size.

Tested with a batch size of 50k -- the issue doesn't seem to be due to error in estimating the Hessian.

01-24-2018 - Added pseudo-inverse based on Jesse's code. Gradient norm now goes down a bit and doesn't increase substantially, but it's not as small as I'd like.

Suggestion: if the cost function is piecewise linear in the parameters, then the gradient norm need not go to zero.

In [None]:
# Hyperparameters for the "fudged" inverse: the Hessian has fudge_factor
# added to its diagonal before being inverted.
fudged_hess_hyperparameters = dict(
    layer_sizes=[10],               # hidden layer widths
    nonlinearity=tf.nn.sigmoid,     # applied after every internal layer
    input_size=784,
    output_size=10,
    learning_rate=0.01,             # step size of plain gradient descent
    newton_rate=1e-9,               # step size of the Newton update
    fudge_factor=1e-4,
    inverse_method="fudged",
)

# Hyperparameters for the "pseudo" inverse: built from the Hessian's
# eigendecomposition, thresholded at minimum_eigenvalue_magnitude.
pseudo_inverse_hyperparameters = dict(
    layer_sizes=[10],               # hidden layer widths
    nonlinearity=tf.nn.sigmoid,     # applied after every internal layer
    input_size=784,
    output_size=10,
    learning_rate=0.01,             # step size of plain gradient descent
    newton_rate=1e-5,               # step size of the Newton update
    minimum_eigenvalue_magnitude=1e-7,
    inverse_method="pseudo",
)

In [None]:
# Profiling variant of the call below (IPython %prun magic), kept for reference:
# time_stats = %prun -r network = make_neural_network(hyperparameter_dictionary)
# Build the graph with the pseudo-inverse Newton step. Note that
# make_neural_network also writes derived entries into the dictionary it is given.
network = make_neural_network(pseudo_inverse_hyperparameters)

In [None]:
# time_stats_filename = "time_stats_gpu.txt"

# with open(time_stats_filename, 'w') as time_stats.stream:
#     time_stats.print_stats()

In [None]:
# Unpack the NeuralNetwork bundle and the sizes recorded while the graph was built.
graph, graph_dict, hyperparameter_dictionary = network
num_parameters = hyperparameter_dictionary["num_parameters"]
total_weights = hyperparameter_dictionary["total_weights"]
total_biases = hyperparameter_dictionary["total_biases"]

# Initial flat parameter vector: weights first, drawn from a normal
# distribution scaled by 0.1, followed by biases all equal to 0.1.
initialized_parameters = np.concatenate([
    0.1 * np.random.standard_normal(size=total_weights),
    np.full(total_biases, 0.1),
]).astype(np.float32)

In [None]:
# Training schedule: a long run of plain gradient descent to get near a
# critical point, followed by a few Newton steps on larger batches.
num_batches_init = 5000
num_newton_steps = 3

gradient_descent_batch_size = 50
newton_batch_size = 500

print(hyperparameter_dictionary["inverse_method"])
print(hyperparameter_dictionary["newton_rate"])

with graph.as_default():
    # Use the GPU-memory configuration defined near the top of the file;
    # without config= it would never take effect.
    sess = tf.InteractiveSession(config=config)

    # try/finally so the session is closed even if a step raises.
    try:
        input = graph_dict["input"]
        labels = graph_dict["labels"]
        initial_parameters = graph_dict["parameters_placeholder"]
        trained_parameters = graph_dict["parameters"]
        step_gradient_descent = graph_dict["step_gradient_descent"]
        step_newton = graph_dict["step_newton"]
        accuracy = graph_dict["accuracy"]
        gradient_op = graph_dict["gradients"]

        # The parameter variable is initialized from a placeholder, so the
        # initializer needs the starting values fed in.
        initializer_feed_dict = {initial_parameters: initialized_parameters}
        tf.global_variables_initializer().run(feed_dict=initializer_feed_dict)

        # Metrics before any training, measured on a batch of their own.
        batch_inputs, batch_labels = mnist.train.next_batch(gradient_descent_batch_size)
        train_feed_dict = {input: batch_inputs,
                           labels: batch_labels}

        acc, gradients = sess.run([accuracy, gradient_op], feed_dict=train_feed_dict)
        # Reported "grad_norm" is the root-mean-square of the gradient entries.
        gradient_norm = np.sqrt(np.mean(np.square(gradients)))

        print("init values")
        print("\taccuracy: {0:.2f}".format(acc))
        print("\tgrad_norm: {0:.10f}".format(gradient_norm))

        for batch_idx in range(num_batches_init):
            batch_inputs, batch_labels = mnist.train.next_batch(gradient_descent_batch_size)
            train_feed_dict = {input: batch_inputs,
                               labels: batch_labels}

            sess.run(step_gradient_descent, feed_dict=train_feed_dict)

            # Only evaluate the metrics on the steps where they are printed.
            step_number = batch_idx + 1
            if step_number == 1 or step_number % 500 == 0:
                acc, gradients = sess.run([accuracy, gradient_op],
                                          feed_dict=train_feed_dict)
                gradient_norm = np.sqrt(np.mean(np.square(gradients)))

                print("grad step: {0}".format(step_number))
                print("\taccuracy: {0:.2f}".format(acc))
                print("\tgrad_norm: {0:.10f}".format(gradient_norm))

        for batch_idx in range(num_newton_steps):
            print("newton step: {0}".format(batch_idx+1))
            batch_inputs, batch_labels = mnist.train.next_batch(newton_batch_size)
            train_feed_dict = {input: batch_inputs,
                               labels: batch_labels}

            sess.run(step_newton, feed_dict=train_feed_dict)

            acc, gradients = sess.run([accuracy, gradient_op],
                                      feed_dict=train_feed_dict)
            gradient_norm = np.sqrt(np.mean(np.square(gradients)))
            print("\taccuracy: {0:.2f}".format(acc))
            print("\tgrad_norm: {0:.10f}".format(gradient_norm))
    finally:
        sess.close()