# MNIST For ML Beginners 
-------------------------------
[Tutorial link](https://www.tensorflow.org/get_started/mnist/beginners)



### Dependencies
 - python3.6
 - tensorflow 1.2
 - tqdm (pip install tqdm)


In [1]:
import tensorflow as tf
from tqdm import tqdm_notebook as tqdm
from tensorflow.examples.tutorials.mnist import input_data

In [2]:
# Read the MNIST data sets from the MNIST_data/ directory;
# one_hot=True requests each label as a one-hot vector.
mnist = input_data.read_data_sets(
    train_dir="MNIST_data/",
    one_hot=True,
)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## Create simple linear classifier
-----------------------------------
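Each 28 * 28 MNIST image is flattened into a vector of 784 values. The first dimension of the input is 'None',  
because we feed a batch of images (mini-batch) of arbitrary size to our classification layer.  
Tensorflow multiplies the whole input batch by the weight matrix, then adds the bias vector to every row of the result.  
Repeating the bias across the rows of the batch in this way is called 'broadcasting'.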

https://www.tensorflow.org/performance/xla/broadcasting

In [3]:
# Input images: one flattened 784-value row per image; 'None' leaves the batch size open.
x = tf.placeholder(tf.float32, shape=[None, 784])

# Trainable parameters of the linear classifier:
# weights start as small uniform random values, biases start at zero.
W = tf.Variable(tf.random_uniform([784, 10], minval=-0.1, maxval=0.1))
b = tf.Variable(tf.zeros([10]))

### Softmax function
----------------------------------
https://en.wikipedia.org/wiki/Softmax_function

\begin{equation*}
\begin{split}
\boldsymbol{y} & = softmax(W * \boldsymbol{x} + \boldsymbol{b}) \\
y_i & = \frac {e^{z_i}} {\sum_{k=1}^K e^{z_k}}  \qquad\textrm{where}\  \boldsymbol{z} = W * \boldsymbol{x} + \boldsymbol{b} \\
\end{split}
\end{equation*}


In [4]:
# Linear scores for the 10 classes, one row per input image.
logits = tf.matmul(x, W) + b

# Softmax turns each row of scores into a probability distribution.
y = tf.nn.softmax(logits)
print(y)

Tensor("Softmax:0", shape=(?, 10), dtype=float32)


## Define cross entropy
------------------------------
https://en.wikipedia.org/wiki/Cross_entropy

Entropy
\begin{equation*}
H(\boldsymbol{p}) = -\sum \boldsymbol{p}\log{\boldsymbol{p}} \\
\end{equation*}

Cross entropy
\begin{equation*}
\begin{split}
H(\boldsymbol{p},\boldsymbol{q}) & = H(\boldsymbol{p}) + D_{KL}(\boldsymbol{p}||\boldsymbol{q}) \\
& = -\sum \boldsymbol{p}\log{\boldsymbol{p}} + \sum \boldsymbol{p} \log{\frac{\boldsymbol{p}}{\boldsymbol{q}}} \\
& = -\sum \boldsymbol{p} \log(\boldsymbol{q})
\end{split}
\end{equation*}

## Define loss
-----------------------------
\begin{equation*}
L = -\frac{1}{N}\sum \boldsymbol{y'}\log{\boldsymbol{y}}
  \qquad \textrm{where} \ N \ \textrm{is size of mini-batch}
\end{equation*}


In [5]:
# Correct labels for the mini-batch, one 10-element row per example.
y_ = tf.placeholder(dtype=tf.float32, shape=[None, 10])

In [6]:
# Cross-entropy of each example's prediction against its label.
# 'axis=1' sums over the classes, leaving one value per example.
#
# tf.log(0) is -inf and 0 * -inf is NaN, so a softmax output that underflows
# to exactly 0 would turn the loss (and every gradient) into NaN.
# Clipping y away from 0 keeps the logarithm finite.
H = - tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), axis=1)

# The loss is the mean over all the examples in the batch.
# Tensorflow also provides a helper, tf.nn.softmax_cross_entropy_with_logits,
# that computes softmax and cross-entropy together in a numerically stable way.
# Next tutorial, you can see how to use this function.
L = tf.reduce_mean(H)

In [7]:
# Compare the shapes: labels are per-example rows, H is per-example, L is a scalar.
for tensor in (y_, H, L):
    print(tensor)

Tensor("Placeholder_1:0", shape=(?, 10), dtype=float32)
Tensor("Neg:0", shape=(?,), dtype=float32)
Tensor("Mean:0", shape=(), dtype=float32)


## Create gradient descent optimizer

In [8]:
# Plain gradient descent with a learning rate of 0.5.
sgd = tf.train.GradientDescentOptimizer(learning_rate=0.5)

# 'optimizer' is the op that applies one update step minimizing the loss L.
optimizer = sgd.minimize(L)

## Create 'InteractiveSession' for interactive python

In [9]:
# Open the session used by every later cell.
sess = tf.InteractiveSession()

# init all variables
sess.run(tf.global_variables_initializer())

## Let's train!

In [10]:
# 1000 gradient-descent steps, each on a fresh mini-batch of 128 examples.
for step in tqdm(range(1000)):
    # get mini-batch images and corresponding labels
    batch_images, batch_labels = mnist.train.next_batch(128)
    feed = {x: batch_images, y_: batch_labels}
    sess.run(optimizer, feed_dict=feed)




## Evaluate our model

In [11]:
# Index of the highest-scoring class in the prediction and in the label.
predicted_class = tf.argmax(y, axis=1)
true_class = tf.argmax(y_, axis=1)

# True where the prediction matches the label; the mean of the 0/1 values is the accuracy.
correct_prediction = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [12]:
# Evaluate accuracy on the test images and labels.
test_feed = {x: mnist.test.images, y_: mnist.test.labels}
print(sess.run(accuracy, feed_dict=test_feed))

0.9209


In [13]:
# Show the current values of the weights.
sess.run(W)

array([[-0.04298268, -0.00520787, -0.03291936, ...,  0.00568776,
        -0.04828663,  0.01818345],
       [-0.05319693,  0.05488672,  0.02772415, ...,  0.09843419,
        -0.00537233, -0.06864445],
       [ 0.04682956,  0.08482439, -0.09622722, ..., -0.0853005 ,
        -0.00257616, -0.09722564],
       ..., 
       [-0.02836733,  0.06304901, -0.07991686, ...,  0.01970718,
        -0.09005542, -0.0028522 ],
       [ 0.02615906, -0.02544034,  0.06845229, ..., -0.07427039,
         0.07561996,  0.0466641 ],
       [ 0.01169886, -0.01846943,  0.04258705, ..., -0.05760248,
        -0.08985155, -0.08340633]], dtype=float32)

In [14]:
# Show the current values of the biases.
sess.run(b)

array([-0.34494323,  0.35730207,  0.09391858, -0.28978905, -0.00995228,
        1.29448557, -0.0966873 ,  0.62683249, -1.41084206, -0.2203224 ], dtype=float32)