In [1]:
# https://www.tensorflow.org/get_started/mnist/beginners
import tensorflow as tf
import numpy as np

In [2]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [3]:
# Basically we're just applying softmax to a linear equation (y = softmax(Wx + b))
# and optimizing the Ws and bs on that.
with tf.name_scope("simple"):
    x = tf.placeholder(tf.float32, [None, 784])
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.nn.softmax(x @ W + b)

In [4]:
with tf.name_scope("simple"):
    # y_ is the true values
    y_ = tf.placeholder(tf.float32, [None, 10])
    # cross_entropy = -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [5]:
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

In [6]:
for _ in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, {x: batch_xs, y_: batch_ys})

In [7]:
sess.run(accuracy, {x: mnist.test.images, y_: mnist.test.labels})

0.90640002

In [8]:
# Now let's extend this to softmax running against a deep CNN

# https://www.tensorflow.org/api_docs/python/tf/nn/conv2d

# Reshape the input from [batch_size, 784] to square images [batch_size, 28, 28, 1]
# The -1 says "however many you need on this axis", so we don't need to hard-code the batch size,
# and the final 1 is because there's one channel (grayscale images)
x_image = tf.reshape(x, [-1, 28, 28, 1], name="input")
# This is the filter to the first conv2d layer. We are looking at 5x5 image patches,
# there's 1 input channel, and we want to output 32 features.
# We initialize the filter to random values with a stddev of 0.1. We could initialize to 0 but that
# could cause symmetry issues.
with tf.name_scope("conv1") as scope:
    W_conv1 = tf.Variable(tf.truncated_normal([5, 5, 1, 32], stddev=0.1), name="filter")
    # Each output feature also gets a bias component, defined here. We initialize to 0.1 to help avoid
    # "dead neurons" that can happen when using relu activation and neurons get permanently stuck
    # in below-zero values. (since relu is max(0, val))
    b_conv1 = tf.Variable(tf.constant(0.1, shape=[32]), name="bias")
    # Finally, defining the actual conv layer. Stride is 4D because the input is
    # typically it is [1, stride, stride, 1]
    # We are computing 32 features, so the conv2d layer returns shape [batch_size, 28, 28, 32]
    # In other words, we get back the original input but 32 channels of "feature" output at each pixel
    # instead of the 1 color channel.
    h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides = [1,1,1,1], padding='SAME') + b_conv1, name="conv")
    # Then, we add a max pooling layer. Because our stride is 2, this cuts each image dimension in half using max()
    # giving a total reduction to 25% original size. Output shape is thus [batch_size, 14, 14, 32]
    h_pool1 = tf.nn.max_pool(h_conv1, strides=[1, 2, 2, 1], ksize=[1, 2, 2, 1], padding='SAME', name=scope)

# The second conv layer returns 64 features for each 5x5 patch. Same stddev as before.
with tf.name_scope("conv2") as scope:
    W_conv2 = tf.Variable(tf.truncated_normal([5, 5, 32, 64], stddev=0.1), name="filter")
    b_conv2 = tf.Variable(tf.constant(0.1, shape=[64]), name="bias")
    h_conv2 = tf.nn.relu(tf.nn.conv2d(h_pool1, W_conv2, strides=[1,1,1,1], padding='SAME') + b_conv2, name="conv")
    # Reduce to 25% original size again. So we end up with shape [batch_size, 7, 7, 64]
    h_pool2 = tf.nn.max_pool(h_conv2, strides=[1, 2, 2, 1], ksize=[1, 2, 2, 1], padding='SAME', name=scope)

# Now we add a fully-connected layer with 1024 neurons to process the features found across the entire image.
# First, reshape to [batch_size, 7*7*64] so that each image is flattened to a one-dimensional array.
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64], name="flattened")
# Now, create the fully-connected layer.
with tf.name_scope("fc1") as scope:
    # Weights
    W_fc1 = tf.Variable(tf.truncated_normal([7*7*64, 1024], stddev=0.1), name="weights")
    # Biases
    b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]), name="biases")
    # Create the layer. Using relu activation here as well. Relu seems to be popular in CNNs, not sure why yet
    # other than "it works best".
    h_fc1 = tf.nn.relu(h_pool2_flat @ W_fc1 + b_fc1, name=scope)

# Now we add a dropout layer to help prevent over-fitting. The dropout layer randomly deactivates
# some neurons during training.
# Since we want to disable dropout during testing/use, we add a placeholder to control the
# dropout probability (actually the inverse, the "keep probability", set to 100% during use)
keep_prob = tf.placeholder(tf.float32, name="keep_prob")
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob, name="dropout")

# Lastly, the output layer ("read layer"), same as the simple softmax model above.
with tf.name_scope("fc2") as scope:
    W_fc2 = tf.Variable(tf.truncated_normal([1024, 10], stddev=0.1), name="weights")
    b_fc2 = tf.Variable(tf.constant(0.1, shape=[10]), name="biases")
    y_conv = tf.identity(h_fc1_drop @ W_fc2 + b_fc2, name=scope)

In [9]:
# Training and testing is similar to the above.

from tqdm import trange

# cross_entropy = -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])
cross_entropy_conv = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv), name="cross_entropy")
tf.summary.scalar('loss', cross_entropy_conv)
train_step_conv = tf.train.AdamOptimizer(1e-4, name="train_step").minimize(cross_entropy_conv)
correct_prediction_conv = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_, 1))
accuracy_conv = tf.reduce_mean(tf.cast(correct_prediction_conv, tf.float32), name="accuracy")
tf.summary.scalar('accuracy', accuracy_conv)

with tf.Session() as sess:
    summary_data = tf.summary.merge_all()
    writer = tf.summary.FileWriter("summary/", sess.graph)
    sess.run(tf.global_variables_initializer())
    pbar = trange(20000)
    for i in pbar:
        batch = mnist.train.next_batch(50)
        if i % 100 == 0:
            train_accuracy, summary = sess.run([accuracy_conv, summary_data], {x: batch[0], y_: batch[1], keep_prob: 1.0})
            pbar.set_description("accuracy {}".format(train_accuracy))
            writer.add_summary(summary, i)
        train_step_conv.run({x: batch[0], y_: batch[1], keep_prob: 0.5})
    print("test accuracy {}".format(accuracy_conv.eval({x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})))

accuracy 1.0: 100%|██████████| 20000/20000 [26:58<00:00, 12.90it/s]               


test accuracy 0.9919999837875366
