In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

In [2]:
#loading the data generated in notMNIST.ipynb

pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save
    print('Training set', train_dataset.shape, train_labels.shape)
    print('validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (20000, 28, 28) (20000,)
validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


In [3]:
#Reformatting data into a shape that's more adapted to the models we're going to train:
#1. data as a flat matrix, and
#2. labels as float 1-hot encodings

image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (20000, 784) (20000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

In [5]:
#Introducing and tuning L2 regularization for both logistic and neural network models. L2 model amounts to addying a penalty on the norms of the weights of the loss. In tensorflow we can compute the L2 loss for a tensor t using nn.l2_loss(t). The right amount of rregularization should improve our validation/test accuracy.

#Multinomial logistic regression with L2 loss function. L' = L + B*(w1*w1 + w2*w2 + w3*w3 + .... + wn*wn)/2
#loading data and building computation graph.

#this is to expedite the process.
train_subset = 10000
#this is a good beta value to start with.
beta = 0.01

graph = tf.Graph()
with graph.as_default():
    #input data
    #they are all constnts
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    #variables
    weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    #training computation
    logits = tf.matmul(tf_train_dataset, weights) + biases
    
    #original loss function
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf_train_labels, logits = logits))
    
    #loss function using L2 regularization
    regularizer = tf.nn.l2_loss(weights)
    loss = tf.reduce_mean(loss + beta * regularizer)
    
    #optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
    
    #predictions for training, validation and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [6]:
#running computation and iterating.

num_steps = 801

def accuracy(predictions, labels):
    return(100.0 * np.sum(np.argmax(predictions, 1)==np.argmax(labels, 1))/predictions.shape[0])

with tf.Session(graph = graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if(step%100==0):
            print("loss at step %d: %f" %(step, l))
            print("Training accuracy: %.1f%%" %accuracy(predictions, train_labels[:train_subset, :]))
            print("Validation accuracy: %.1f%%" %accuracy(valid_prediction.eval(), valid_labels))
            
    print("Test accuracy: %.1f%%" %accuracy(test_prediction.eval(), test_labels))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
loss at step 0: 53.734550
Training accuracy: 5.9%
Validation accuracy: 7.3%
loss at step 100: 21.892302
Training accuracy: 66.0%
Validation accuracy: 65.5%
loss at step 200: 14.415401
Training accuracy: 73.0%
Validation accuracy: 71.0%
loss at step 300: 9.627072
Training accuracy: 75.4%
Validation accuracy: 73.4%
loss at step 400: 6.505567
Training accuracy: 77.7%
Validation accuracy: 75.3%
loss at step 500: 4.471324
Training accuracy: 79.3%
Validation accuracy: 76.9%
loss at step 600: 3.148792
Training accuracy: 80.9%
Validation accuracy: 78.2%
loss at step 700: 2.290159
Training accuracy: 82.0%
Validation accuracy: 79.4%
loss at step 800: 1.732566
Training accuracy: 83.0%
Validation accuracy: 80.1%
Test accuracy: 87.3%


In [7]:
#Neural network with L2 regularization.(1 hidden Layer RELU)

num_nodes = 1024
batch_size = 128
beta = 0.01

graph = tf.Graph()

with graph.as_default():
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape = (batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    #variables
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_nodes]))
    biases1 = tf.Variable(tf.zeros([num_nodes]))
    weights2 = tf.Variable(tf.truncated_normal([num_nodes, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    
    #training computation
    logits1 = tf.matmul(tf_train_dataset, weights1) + biases1
    relu_layer = tf.nn.relu(logits1)
    logits2 = tf.matmul(relu_layer, weights2) + biases2
    
    #normal loss function
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf_train_labels, logits = logits2))
    #loss function with L2 regularization
    regularizers = tf.nn.l2_loss(weights1)+tf.nn.l2_loss(weights2)
    loss = tf.reduce_mean(loss + beta * regularizers)
    
    #optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    # Predictions for the training
    train_prediction = tf.nn.softmax(logits2)
    
    # Predictions for validation 
    logits1 = tf.matmul(tf_valid_dataset, weights1) + biases1
    relu_layer= tf.nn.relu(logits1)
    logits2 = tf.matmul(relu_layer, weights2) + biases2
    
    valid_prediction = tf.nn.softmax(logits2)
    
    # Predictions for test
    logits1 = tf.matmul(tf_test_dataset, weights1) + biases1
    relu_layer= tf.nn.relu(logits1)
    logits2 = tf.matmul(relu_layer, weights2) + biases2
    
    test_prediction =  tf.nn.softmax(logits2)

In [8]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 100 == 0):
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.1f}".format(accuracy(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}".format(accuracy(valid_prediction.eval(), valid_labels)))
    print("Test accuracy: {:.1f}".format(accuracy(test_prediction.eval(), test_labels)))

Initialized
Minibatch loss at step 0: 3491.993896484375
Minibatch accuracy: 4.7
Validation accuracy: 32.7
Minibatch loss at step 100: 1155.7646484375
Minibatch accuracy: 77.3
Validation accuracy: 76.7
Minibatch loss at step 200: 419.0970153808594
Minibatch accuracy: 86.7
Validation accuracy: 79.7
Minibatch loss at step 300: 153.64720153808594
Minibatch accuracy: 78.9
Validation accuracy: 81.2
Minibatch loss at step 400: 56.72582244873047
Minibatch accuracy: 89.1
Validation accuracy: 83.2
Minibatch loss at step 500: 21.220348358154297
Minibatch accuracy: 89.8
Validation accuracy: 83.6
Minibatch loss at step 600: 8.445679664611816
Minibatch accuracy: 83.6
Validation accuracy: 83.8
Minibatch loss at step 700: 3.630857229232788
Minibatch accuracy: 84.4
Validation accuracy: 83.1
Minibatch loss at step 800: 1.8090903759002686
Minibatch accuracy: 82.8
Validation accuracy: 83.1
Minibatch loss at step 900: 1.2673168182373047
Minibatch accuracy: 77.3
Validation accuracy: 83.1
Minibatch loss at s

In [13]:
#Seeing an extreme case overfitting. Restrcting our training data to just a few batches and see what happens.

#contunuing with L2 regularization from above computation.
num_steps = 3001

train_dataset_2 = train_dataset[:500, :]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset_2[offset:(offset + batch_size), :]
        batch_labels = train_labels_2[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.1f}".format(accuracy(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}".format(accuracy(valid_prediction.eval(), valid_labels)))
    print("Test accuracy: {:.1f}".format(accuracy(test_prediction.eval(), test_labels)))

Initialized
Minibatch loss at step 0: 3503.384765625
Minibatch accuracy: 6.2
Validation accuracy: 38.4
Minibatch loss at step 500: 21.0670223236084
Minibatch accuracy: 100.0
Validation accuracy: 76.5
Minibatch loss at step 1000: 0.49081099033355713
Minibatch accuracy: 100.0
Validation accuracy: 78.0
Minibatch loss at step 1500: 0.3089255094528198
Minibatch accuracy: 100.0
Validation accuracy: 78.0
Minibatch loss at step 2000: 0.29134583473205566
Minibatch accuracy: 100.0
Validation accuracy: 78.0
Minibatch loss at step 2500: 0.27686914801597595
Minibatch accuracy: 100.0
Validation accuracy: 78.0
Minibatch loss at step 3000: 0.27620482444763184
Minibatch accuracy: 100.0
Validation accuracy: 78.1
Test accuracy: 85.8


In [15]:
#there is overfitting here as there is high training accuracy and low validation accuracy.

#introducing dropout to the hidden layer of the neural network. Dropout shouls be introduced during training and not during evaluation, otherwise our evaluation results would be stochastic as well. Tensorflow provides nn.dropout() for that, but we have to make sure it is inserted only during training.
#lets see what happens to the overfitting case.

batch_size = 128
deep_graph = tf.Graph()
with deep_graph.as_default():
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  hidden_layer_size = 1024
  hidden_weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, hidden_layer_size]))
  hidden_biases = tf.Variable(tf.zeros([hidden_layer_size]))
  hidden_layer = tf.nn.dropout(
    tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases), 0.5)
  
  output_weights = tf.Variable(
    tf.truncated_normal([hidden_layer_size, num_labels]))
  output_biases = tf.Variable(tf.zeros([num_labels]))
  logits = tf.matmul(hidden_layer, output_weights) + output_biases

  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits = logits))
  #l2_regularizer = tf.nn.l2_loss(output_weights) + tf.nn.l2_loss(hidden_weights)
  #loss += 5e-4 * l2_regularizer
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  train_prediction = tf.nn.softmax(logits)

  # Setup validation prediction step.
  valid_hidden = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
  valid_logits = tf.matmul(valid_hidden, output_weights) + output_biases
  valid_prediction = tf.nn.softmax(valid_logits)

  # And setup the test prediction step.
  test_hidden = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)
  test_logits = tf.matmul(test_hidden, output_weights) + output_biases
  test_prediction = tf.nn.softmax(test_logits)

In [16]:
num_steps = 3001

with tf.Session(graph=deep_graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("  Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 519.420166
Minibatch accuracy: 5.5%
Validation accuracy: 31.9%
Minibatch loss at step 500: 9.627921
Minibatch accuracy: 85.2%
Validation accuracy: 80.8%
Minibatch loss at step 1000: 12.728596
Minibatch accuracy: 75.8%
Validation accuracy: 81.2%
Minibatch loss at step 1500: 7.145221
Minibatch accuracy: 81.2%
Validation accuracy: 80.6%
Minibatch loss at step 2000: 2.996078
Minibatch accuracy: 85.2%
Validation accuracy: 81.4%
Minibatch loss at step 2500: 8.871046
Minibatch accuracy: 87.5%
Validation accuracy: 81.4%
Minibatch loss at step 3000: 2.171005
Minibatch accuracy: 92.2%
Validation accuracy: 81.2%
  Test accuracy: 88.5%


In [20]:
#not much effect -- some people say it helps if the networks are larger: https://discussions.udacity.com/t/problem-3-3-dropout-does-not-improve-test-accuarcy/46286/17

#trying to get better result. We can use multilayer model! The best reported test accuracy using a deep network is 97.1%. One avenue we can explore is to add multiple layers.
#Another one is to use learning rate decay.
#global_step = tf.Variable(0)  # count the number of steps taken.
#learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

batch_size = 128

hidden_layer_1_size = 1024
hidden_layer_2_size = 300
hidden_layer_3_size = 50
hidden_layer_1_stddev = np.sqrt(2.0/784) 
hidden_layer_2_stddev = np.sqrt(2.0/hidden_layer_1_size)
hidden_layer_3_stddev = np.sqrt(2.0/hidden_layer_2_size)
output_layer_stddev = np.sqrt(2.0/hidden_layer_3_size)
hidden_layer_1_keep_prob = 0.5
hidden_layer_2_keep_prob = 0.7
hidden_layer_3_keep_prob = 0.8
beta_1 = 1e-5
beta_2 = 1e-5
beta_3 = 1e-5
beta_4 = 1e-5

deep_graph = tf.Graph()
with deep_graph.as_default():
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # first hidden layer
  hidden_layer_1_weights = tf.Variable(
    tf.truncated_normal(
      [image_size * image_size, hidden_layer_1_size], stddev=hidden_layer_1_stddev))
  hidden_layer_1_biases = tf.Variable(tf.zeros([hidden_layer_1_size]))
  hidden_layer_1 = tf.nn.dropout(
    tf.nn.relu(tf.matmul(tf_train_dataset, hidden_layer_1_weights) + hidden_layer_1_biases),
    hidden_layer_1_keep_prob)
  
  # second hidden layer
  hidden_layer_2_weights = tf.Variable(
    tf.truncated_normal(
      [hidden_layer_1_size, hidden_layer_2_size], stddev=hidden_layer_2_stddev))
  hidden_layer_2_biases = tf.Variable(tf.zeros([hidden_layer_2_size]))
  hidden_layer_2 = tf.nn.dropout(
    tf.nn.relu(tf.matmul(hidden_layer_1, hidden_layer_2_weights) + hidden_layer_2_biases),
    hidden_layer_2_keep_prob)
  
  # third hidden layer
  hidden_layer_3_weights = tf.Variable(
    tf.truncated_normal(
      [hidden_layer_2_size, hidden_layer_3_size], stddev=hidden_layer_3_stddev))
  hidden_layer_3_biases = tf.Variable(tf.zeros([hidden_layer_3_size]))
  hidden_layer_3 = tf.nn.dropout(
    tf.nn.relu(tf.matmul(hidden_layer_2, hidden_layer_3_weights) + hidden_layer_3_biases), 
    hidden_layer_3_keep_prob)
  
  # output layer
  output_weights = tf.Variable(
    tf.truncated_normal(
      [hidden_layer_3_size, num_labels],
      stddev=output_layer_stddev))
  output_biases = tf.Variable(tf.zeros([num_labels]))
  logits = tf.matmul(hidden_layer_3, output_weights) + output_biases

  # Calculate the loss with regularization
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits =logits, labels =tf_train_labels))
  loss += (beta_1 * tf.nn.l2_loss(hidden_layer_1_weights) +
           beta_2 * tf.nn.l2_loss(hidden_layer_2_weights) +
           beta_3 * tf.nn.l2_loss(hidden_layer_3_weights) +
           beta_4 * tf.nn.l2_loss(output_weights))
  
  # Learn with exponential rate decay.
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.4
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 100000, 0.96, staircase=True)
  #learning_rate = 0.1
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  train_prediction = tf.nn.softmax(logits)

  # Setup validation prediction step.
  validation_hidden_layer_1 = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_layer_1_weights) + hidden_layer_1_biases)
  validation_hidden_layer_2 = tf.nn.relu(tf.matmul(validation_hidden_layer_1, hidden_layer_2_weights) + hidden_layer_2_biases)
  validation_hidden_layer_3 = tf.nn.relu(tf.matmul(validation_hidden_layer_2, hidden_layer_3_weights) + hidden_layer_3_biases)
  validation_logits = tf.matmul(validation_hidden_layer_3, output_weights) + output_biases
  validation_prediction = tf.nn.softmax(validation_logits)

  # And setup the test prediction step.  
  test_hidden_layer_1 = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_layer_1_weights) + hidden_layer_1_biases)
  test_hidden_layer_2 = tf.nn.relu(tf.matmul(test_hidden_layer_1, hidden_layer_2_weights) + hidden_layer_2_biases)
  test_hidden_layer_3 = tf.nn.relu(tf.matmul(test_hidden_layer_2, hidden_layer_3_weights) + hidden_layer_3_biases)
  test_logits = tf.matmul(test_hidden_layer_3, output_weights) + output_biases
  test_prediction = tf.nn.softmax(test_logits)

num_steps = 20000

with tf.Session(graph=deep_graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(validation_prediction.eval(), valid_labels))
  print("  Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 2.642131
Minibatch accuracy: 9.4%
Validation accuracy: 11.0%
Minibatch loss at step 500: 0.396757
Minibatch accuracy: 85.9%
Validation accuracy: 84.0%
Minibatch loss at step 1000: 0.404667
Minibatch accuracy: 85.9%
Validation accuracy: 85.3%
Minibatch loss at step 1500: 0.381477
Minibatch accuracy: 88.3%
Validation accuracy: 85.7%
Minibatch loss at step 2000: 0.329934
Minibatch accuracy: 89.1%
Validation accuracy: 85.9%
Minibatch loss at step 2500: 0.382126
Minibatch accuracy: 90.6%
Validation accuracy: 86.3%
Minibatch loss at step 3000: 0.306568
Minibatch accuracy: 92.2%
Validation accuracy: 86.3%
Minibatch loss at step 3500: 0.281733
Minibatch accuracy: 89.8%
Validation accuracy: 85.8%
Minibatch loss at step 4000: 0.399985
Minibatch accuracy: 85.9%
Validation accuracy: 86.5%
Minibatch loss at step 4500: 0.314602
Minibatch accuracy: 90.6%
Validation accuracy: 86.7%
Minibatch loss at step 5000: 0.264766
Minibatch accuracy: 89.8%
Validation accuracy