In [9]:
from __future__ import division, print_function, absolute_import

import tensorflow as tf

# Import MNIST data
# The archives are read from /tmp/data/ (see the "Extracting ..." lines printed
# by this cell). one_hot=True requests one-hot label vectors, which is what the
# Y placeholder of width num_classes and the tf.argmax(Y, 1) call below expect.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [10]:
# Training parameters
learning_rate = 1e-3   # step size handed to the optimizer
num_steps = 500        # number of minibatch updates in the training loop
batch_size = 128       # examples drawn per minibatch
display_step = 10      # metrics are printed every `display_step` steps

# Network parameters
num_input = 28 * 28    # one flattened 28x28 image
num_classes = 10       # width of the one-hot label vector
dropout = 0.75         # probability of KEEPING a unit (not of dropping it)

# Graph inputs
X = tf.placeholder(tf.float32, shape=[None, num_input])
Y = tf.placeholder(tf.float32, shape=[None, num_classes])
keep_prob = tf.placeholder(tf.float32)  # keep probability fed to the dropout layer



# Create Convolution Models

- TensorFlow Conv2D: a 2-D convolution layer. It creates a convolution kernel that is convolved with the layer's input to produce a tensor of outputs.

- Max pooling: Its function is to progressively reduce the spatial size of the representation, which reduces the number of parameters and the amount of computation in the network. A pooling layer operates on each feature map independently. The most common pooling approach is max pooling.

In [11]:
def conv2d(x, W, b, strides=1):
    """Apply a 2-D convolution, add a bias, and activate with ReLU.

    Args:
        x: Input tensor handed to ``tf.nn.conv2d``.
        W: Convolution kernel (filter) tensor.
        b: Bias added to the convolution output via ``tf.nn.bias_add``.
        strides: Value used for the two middle entries of the
            ``[1, strides, strides, 1]`` stride vector.

    Returns:
        The rectified linear unit (ReLU) activation of the biased
        convolution, computed with 'SAME' padding.
    """
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')
    return tf.nn.relu(tf.nn.bias_add(convolved, b))

In [12]:
def maxpool2d(x, k=2):
    """Max-pool ``x`` with a k-by-k window and a stride of k ('SAME' padding)."""
    window = [1, k, k, 1]  # the same vector serves as window size and stride
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')

In [13]:
def conv_net(x, weights, biases, dropout):
    """Build the two-convolution-layer classifier and return its logits.

    Args:
        x: Flattened input images (784 features each); reshaped here to
            ``[-1, 28, 28, 1]``, i.e. [batch, height, width, channel].
        weights: Dict providing the keys 'wc1', 'wc2', 'wd1' and 'out'.
        biases: Dict providing the keys 'bc1', 'bc2', 'bd1' and 'out'.
        dropout: Value forwarded as the second positional argument of
            ``tf.nn.dropout`` (the keep probability in this notebook).

    Returns:
        The output of the final affine layer (no softmax applied).
    """
    images = tf.reshape(x, shape=[-1, 28, 28, 1])

    # (A) First convolution followed by 2x2 max pooling
    pooled1 = maxpool2d(conv2d(images, weights['wc1'], biases['bc1']), k=2)

    # (B) Second convolution followed by 2x2 max pooling
    pooled2 = maxpool2d(conv2d(pooled1, weights['wc2'], biases['bc2']), k=2)

    # (C) Flatten to the row count of 'wd1' so the dense layer can consume it
    flat_dim = weights['wd1'].get_shape().as_list()[0]
    flattened = tf.reshape(pooled2, [-1, flat_dim])
    hidden = tf.nn.relu(tf.add(tf.matmul(flattened, weights['wd1']), biases['bd1']))

    hidden = tf.nn.dropout(hidden, dropout)

    # (D) Output layer: one score per class
    return tf.add(tf.matmul(hidden, weights['out']), biases['out'])

In [14]:
# Store weights and biases.
# Every variable is initialised from tf.random_normal with its default
# parameters.
# NOTE(review): the very large minibatch loss logged at step 1 is presumably a
# consequence of this unscaled initialisation -- consider a smaller stddev.
weights = {
    key: tf.Variable(tf.random_normal(shape))
    for key, shape in [
        ('wc1', [5, 5, 1, 32]),          # 5x5 conv, 1 input, 32 outputs
        ('wc2', [5, 5, 32, 64]),         # 5x5 conv, 32 inputs, 64 outputs
        ('wd1', [7*7*64, 1024]),         # fully connected, 7*7*64 inputs, 1024 outputs
        ('out', [1024, num_classes]),    # class predictions, 1024 inputs, num_classes outputs
    ]
}
biases = {
    key: tf.Variable(tf.random_normal([size]))
    for key, size in [
        ('bc1', 32),
        ('bc2', 64),
        ('bd1', 1024),
        ('out', num_classes),
    ]
}

# Adam Optimizer

- Classical stochastic gradient descent maintains a single learning rate (termed alpha) for all weight updates, and that learning rate does not change during training.
- Adam, in contrast, computes individual adaptive learning rates for different parameters from estimates of the first and second moments of the gradients.

In [15]:
# Build the model: forward pass, then class probabilities
logits = conv_net(X, weights, biases, keep_prob)
prediction = tf.nn.softmax(logits)

# Loss: mean softmax cross-entropy between the logits and the labels
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)
loss_op = tf.reduce_mean(cross_entropy)

# Optimizer: Adam minimising the loss
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Metric: fraction of examples whose arg-max prediction matches the label
predicted_class = tf.argmax(prediction, 1)
true_class = tf.argmax(Y, 1)
correct_pred = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Op that assigns every variable its initial value
init = tf.global_variables_initializer()

# Start Training

In [16]:
with tf.Session() as sess:

    # Give every variable its initial value before training.
    sess.run(init)

    for step in range(1, num_steps + 1):
        batch_x, batch_y = mnist.train.next_batch(batch_size)

        # One optimisation step, with dropout active.
        train_feed = {X: batch_x, Y: batch_y, keep_prob: dropout}
        sess.run(train_op, feed_dict=train_feed)

        # Report on the first step and then every `display_step` steps.
        if step == 1 or step % display_step == 0:
            # Re-evaluate the same minibatch with dropout disabled.
            eval_feed = {X: batch_x, Y: batch_y, keep_prob: 1.0}
            loss, acc = sess.run([loss_op, accuracy], feed_dict=eval_feed)
            print("Step {}, Minibatch Loss= {:.4f}, Training Accuracy= {:.3f}".format(step, loss, acc))

    print("Optimization Finished!")

    # Accuracy on the first 256 MNIST test images, dropout disabled.
    test_feed = {
        X: mnist.test.images[:256],
        Y: mnist.test.labels[:256],
        keep_prob: 1.0,
    }
    print("Testing Accuracy:", sess.run(accuracy, feed_dict=test_feed))

Step 1, Minibatch Loss= 57975.8789, Training Accuracy= 0.117
Step 10, Minibatch Loss= 27368.8828, Training Accuracy= 0.156
Step 20, Minibatch Loss= 7475.2979, Training Accuracy= 0.594
Step 30, Minibatch Loss= 5640.0518, Training Accuracy= 0.703
Step 40, Minibatch Loss= 4607.6860, Training Accuracy= 0.750
Step 50, Minibatch Loss= 4025.4976, Training Accuracy= 0.758
Step 60, Minibatch Loss= 2333.5850, Training Accuracy= 0.867
Step 70, Minibatch Loss= 2230.7544, Training Accuracy= 0.875
Step 80, Minibatch Loss= 2183.7117, Training Accuracy= 0.828
Step 90, Minibatch Loss= 2825.0396, Training Accuracy= 0.828
Step 100, Minibatch Loss= 1597.9348, Training Accuracy= 0.898
Step 110, Minibatch Loss= 2046.6309, Training Accuracy= 0.883
Step 120, Minibatch Loss= 2496.8472, Training Accuracy= 0.844
Step 130, Minibatch Loss= 2070.6782, Training Accuracy= 0.898
Step 140, Minibatch Loss= 1650.1099, Training Accuracy= 0.898
Step 150, Minibatch Loss= 1173.9304, Training Accuracy= 0.898
Step 160, Minibat

In [17]:
# Exploratory: print the built-in documentation of the Adam optimizer class
# that was used to build `train_op`.
help(tf.train.AdamOptimizer)

Help on class AdamOptimizer in module tensorflow.python.training.adam:

class AdamOptimizer(tensorflow.python.training.optimizer.Optimizer)
 |  AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam')
 |  
 |  Optimizer that implements the Adam algorithm.
 |  
 |  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
 |  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
 |  
 |  Method resolution order:
 |      AdamOptimizer
 |      tensorflow.python.training.optimizer.Optimizer
 |      tensorflow.python.training.checkpointable.base.CheckpointableBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam')
 |      Construct a new Adam optimizer.
 |      
 |      Initialization:
 |      
 |      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
 |      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
 |      $