# Intro ANNs
## In TensorFlow

In [1]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from tensorflow.examples.tutorials.mnist import input_data
from datetime import datetime

In [2]:
mnist = input_data.read_data_sets("/tmp/data/")

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


### Step 1. Building the Computation Graph

In [3]:
datetime.now().strftime("%Y_%m_%d")

'2018_05_25'

In [49]:
now = datetime.now().strftime("%Y%m%d")
outfile = f"./tf_logs/run-{now}"

tf.reset_default_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = fully_connected(X, n_hidden1, activation_fn=tf.nn.elu, 
                              scope="hidden1")
    hidden2 = fully_connected(hidden1, n_hidden2, activation_fn=tf.nn.elu,
                              scope="hidden2")
    logits = fully_connected(hidden2, n_outputs, scope="outputs",
                             activation_fn=None)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)
    
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    summary_acc = tf.summary.scalar("Accuracy", accuracy)
    file_writer = tf.summary.FileWriter(outfile, tf.get_default_graph())
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

## Step 2. Execution Phase

In [50]:
n_epochs = 70
batch_size = 50
batch_iterations = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for iteration in range(batch_iterations):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={X: X_batch, y:y_batch})
            
        acc_train = sess.run(accuracy, feed_dict={X:X_batch, y:y_batch})
        acc_test = sess.run(accuracy, feed_dict={X:mnist.train.images,
                                                 y:mnist.train.labels})
        end = "\n" if epoch % 10 == 0 else "\r"
        print(f"@Epoch {epoch}. Train Acc: {acc_train:0.3%} | Test Acc: {acc_test:0.3%}", end=end)
        tboard_loss = summary_acc.eval(feed_dict={X: X_batch, y:y_batch})
        file_writer.add_summary(tboard_loss, epoch)
    save_path = saver.save(sess, "./tmp/my_model_final.ckpt")
file_writer.close()

@Epoch 0. Train Acc: 90.000% | Test Acc: 89.345%
@Epoch 10. Train Acc: 98.000% | Test Acc: 94.942%
@Epoch 20. Train Acc: 94.000% | Test Acc: 96.642%%
@Epoch 30. Train Acc: 100.000% | Test Acc: 97.555%
@Epoch 40. Train Acc: 100.000% | Test Acc: 98.204%
@Epoch 50. Train Acc: 100.000% | Test Acc: 98.556%
@Epoch 60. Train Acc: 100.000% | Test Acc: 98.887%
@Epoch 69. Train Acc: 100.000% | Test Acc: 99.167%

## Batch Normalization in TensorFlow

In [5]:
from tensorflow.contrib.layers import fully_connected, batch_norm
from tensorflow.contrib.framework import arg_scope

In [53]:
tf.reset_default_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

is_training = tf.placeholder(tf.bool, shape=(), name="is_training")
bn_params = {
    "is_training": is_training,
    "decay": 0.999, # The exponential decay hyperparameter
    "updates_collections": None
}

with tf.name_scope("dnn"):
    ###  Argument Scopes ### 
    # In order to aviod repetition,
    # the first parameter is a list of functions,
    # and the other parameters will be passed to
    # these functions automatically.
    with arg_scope(
        [fully_connected],
        normalizer_fn=batch_norm,
        normalizer_params=bn_params
    ):
        hidden1 = fully_connected(X, n_hidden1, activation_fn=tf.nn.elu,
                                  scope="hidden1")
        hidden2 = fully_connected(hidden1, n_hidden2, activation_fn=tf.nn.elu,
                                  scope="hidden2")
        logits = fully_connected(hidden2, n_outputs, activation_fn=None,
                                 scope="logits")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y,
        logits=logits
    )
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)

with tf.name_scope("evaluate"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

In [55]:
epochs = 70
batch_size = 50
batch_iterations = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    for epoch in range(epochs):
        sess.run(init)
        for it in range(batch_iterations):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(train_step,
                     feed_dict={X:X_batch, y:y_batch, is_training:True})
        acc_mnist = sess.run(accuracy,
                             feed_dict={X:mnist.train.images,
                                        y:mnist.train.labels,
                                        is_training:False})
        end = "\n" if epoch % 10 == 0 else "\r"
        print(f"Accuracy @Epoch {epoch}: {acc_mnist:0.3%}", end=end)

Accuracy @Epoch 0: 87.695%
Accuracy @Epoch 10: 89.489%
Accuracy @Epoch 20: 87.689%
Accuracy @Epoch 30: 90.333%
Accuracy @Epoch 40: 88.845%
Accuracy @Epoch 50: 89.882%
Accuracy @Epoch 60: 89.269%
Accuracy @Epoch 69: 87.873%

## Gradient Clipping
In order to lessen the exploding gradients problem

In [15]:
tf.reset_default_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = fully_connected(X, n_hidden1, scope="hidden1")
    hidden2 = fully_connected(hidden1, n_hidden2, scope="hidden2")
    logits = fully_connected(hidden2, n_outputs, activation_fn=None,
                             scope="logits")
    
#### GRADIENT CLIPPING IMPLEMENTATION ####
# In tensorflow, the optimizer’s minimize() function
# takes care of both computing the gradients and applying them,
# so you must instead call the optimizer’s compute_gradients()
# method first, then create an operation to clip the gradients
# using the clip_by_value() function, and finally create an operation
# to apply the clipped gradients using the optimizer’s apply_gradients() method
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
learning_rate = 0.01
with tf.name_scope("train"):
    threshold = 1.0
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                  for grad, var in grads_and_vars]
    train_step = optimizer.apply_gradients(capped_gvs)

with tf.name_scope("evaluate"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()

In [21]:
n_epochs = 70
batch_size = 50
batch_iterations = mnist.train.num_examples // batch_size


with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for it in range(batch_iterations):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={X:X_batch, y:y_batch})
            
        test_acc = sess.run(accuracy,
                            feed_dict={X:mnist.train.images,
                                       y:mnist.train.labels})
        end = "\n" if epoch % 10 == 0 else "\r"
        print(f"@Epoch {epoch}. Test accuracy: {test_acc:0.4%}", end=end)

@Epoch 0. Test accuracy: 89.5000%
@Epoch 10. Test accuracy: 96.5891%
@Epoch 20. Test accuracy: 98.0891%
@Epoch 30. Test accuracy: 98.8727%
@Epoch 40. Test accuracy: 99.2764%
@Epoch 50. Test accuracy: 99.6564%
@Epoch 60. Test accuracy: 99.7618%
@Epoch 69. Test accuracy: 99.8909%

# Exercises

**1. Is it okay to initialize all the weights to the same value as long as that value is selected randomly using He initialization?**  
No, since initializing all weights to the same value would not allow backprop to break the *symmetry*, i.e., for each neuron in a given layer, their values would remain the same at evey step of the learning process.

**2. Is it okay to initialize the bias terms to 0?**  
Yes. Bias neurons do not suffer from the symmetry problem presented in other neurons.


**3. Name three advantages of the ELU activation function over ReLU.**
1. Smooth everywhere, which in turn helps speeding up gradient descent.
2. Has non-zero gradient at $z < 0$, which helps preventing the dying neuron problem
3. Considering initialization with standard normal, it helps with the vanishing gradient problem, since its average value is 0,


**4. In which cases would you want to use each of the following activation functions: ELU, leaky ReLU (and its variants), ReLU, tanh, logistic, and softmax?**

**5. What may happen if you set the momentum hyperparameter too close to 1 (e.g., 0.99999) when using a MomentumOptimizer?**  
We expect the learning rate to shoot up, since, as $\beta \to 1$ and $k\to\infty$, $m_k \to \infty$


**6. Name three ways you can produce a sparse model.**
1. Set small weights to 0
2. Use a strong $L_1$ regularization scheme
3. Dual Averaging

**7. Does dropout slow down training? Does it slow down inference (i.e., making predictions on new instances)?**


**8. Deep Learning.**

a. Build a DNN with five hidden layers of 100 neurons each, He initialization, and the ELU activation function.

b. Using Adam optimization and early stopping, try training it on MNIST but only on digits 0 to 4, as we will use transfer learning for digits 5 to 9 in the next exercise. You will need a softmax output layer with five neurons, and as always make sure to save checkpoints at regular intervals and save the final model so you can reuse it later.

c. Tune the hyperparameters using cross-validation and see what precision you can achieve.

d. Now try adding Batch Normalization and compare the learning curves: is it converging faster than before? Does it produce a better model?  
No, it underperforms slighly the complete feed forward neural network.

e. Is the model overfitting the training set? Try adding dropout to every layer and try again. Does it help?

In [3]:
from tensorflow.contrib.layers import fully_connected, variance_scaling_initializer, batch_norm
from tensorflow.contrib.framework import arg_scope
import numpy as np
from numpy.random import choice, seed
from math import ceil

In [4]:
train1_mask = mnist.train.labels <= 4
train1_y = mnist.train.labels[train1_mask]
train1_X = mnist.train.images[train1_mask,:]

seed(1643)
n_train = sum(train1_mask)
n_cv = ceil(0.2 * n_train)
indices_cv = np.random.choice(range(n_train), n_cv, replace=False)
indices_ytrain = [i for i in range(n_train) if i not in indices_cv]

train1_y_cv, train1_y = train1_y[indices_cv], train1_y[indices_ytrain]
train1_X_cv, train1_X = train1_X[indices_cv], train1_X[indices_ytrain]

In [89]:
%%time
now = datetime.now()
outfile_train = now.strftime("./tf_logs/run-%Y%m%d%H%M-BN-train")
outfile_cv = now.strftime("./tf_logs/run-%Y%m%d%H%M-BN-cv")

tf.reset_default_graph()

n_inputs = 28 * 28
n_hidden = 100
n_output = 5

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

he_init = variance_scaling_initializer()
is_training = tf.placeholder(tf.bool, shape=(), name="is_training")
bn_params = {
    "is_training": is_training,
    "decay": 0.999,
    "updates_collections": None,    
}

with tf.name_scope("dnn"):
    with arg_scope(
        [fully_connected],
        normalizer_fn=batch_norm,
        normalizer_params=bn_params):
        with arg_scope(
            [fully_connected],
            weights_initializer = he_init,
            activation_fn = tf.nn.elu):
            hidden1 = fully_connected(X, n_hidden, scope="hidden1")
            hidden2 = fully_connected(hidden1, n_hidden, scope="hidden2")
            hidden3 = fully_connected(hidden2, n_hidden, scope="hidden3")
            hidden4 = fully_connected(hidden3, n_hidden, scope="hidden4")
            hidden5 = fully_connected(hidden4, n_hidden, scope="hidden5")
        output = fully_connected(hidden5, n_output, activation_fn=None,
                                 weights_initializer=he_init, scope="output")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y,
        logits=output)
    loss = tf.reduce_mean(xentropy, name="loss")

alpha = 0.001
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(alpha)
    train_step = optimizer.minimize(loss, name="train_step")

with tf.name_scope("metrics"):
    correct = tf.nn.in_top_k(output, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    writer_train = tf.summary.FileWriter(outfile_train, tf.get_default_graph())
    writer_test = tf.summary.FileWriter(outfile_cv, tf.get_default_graph())
    summ_acc_train = tf.summary.scalar("Accuracy", accuracy)
    summ_acc_cv = tf.summary.scalar("Accuracy", accuracy)

saver = tf.train.Saver()
init = tf.global_variables_initializer()

epochs = 400
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(epochs):
        # Running a training step
        sess.run(train_step, feed_dict={is_training: True, X:train1_X, y:train1_y})
        acc = sess.run(accuracy, feed_dict ={is_training:False,
                                             X:train1_X, y:train1_y})
        acc_cv = sess.run(accuracy, feed_dict={is_training: False,
                                               X:train1_X_cv, y:train1_y_cv})
        end = "\n" if epoch % 40 == 0 else "\r"
        """Tensorboard summaries"""
        tboard_train_loss = summ_acc_train.eval(feed_dict={is_training: False,
                                                           X:train1_X, y:train1_y})
        tboard_cv_loss = summ_acc_cv.eval(feed_dict={is_training: False,
                                                     X:train1_X_cv, y:train1_y_cv})
        writer_train.add_summary(tboard_train_loss, epoch)
        writer_test.add_summary(tboard_cv_loss, epoch)
        
        print(f"Accuracy @Epoch {epoch:03}... train: {acc:.2%} | cv: {acc_cv:0.2%}", end=end)
    saver.save(sess, "./tmp/my_model_final.ckpt")
writer_test.close()
writer_train.close()

Accuracy @Epoch 000... train: 26.71% | cv: 26.91%
Accuracy @Epoch 040... train: 96.07% | cv: 95.77%
Accuracy @Epoch 080... train: 97.54% | cv: 97.08%
Accuracy @Epoch 120... train: 98.16% | cv: 97.56%
Accuracy @Epoch 160... train: 98.45% | cv: 97.82%
Accuracy @Epoch 200... train: 98.64% | cv: 97.95%
Accuracy @Epoch 240... train: 98.80% | cv: 98.04%
Accuracy @Epoch 280... train: 98.89% | cv: 98.00%
Accuracy @Epoch 320... train: 98.93% | cv: 98.02%
Accuracy @Epoch 360... train: 98.94% | cv: 97.99%
CPU times: user 25min 54s, sys: 2min 27s, total: 28min 21s
Wall time: 5min 33s


**Accuracy Comparisson under training set**
![Training History](./images/Training_comparissons.png)

### An updated version of saving & restoring models

In [71]:
from jupyter_tf_graph import show_graph

We can get a hold of a pretrained model via the `tf.train.import_meta_graph` function.

In [72]:
tf.reset_default_graph()
# Recreating a graph saved in a MetaGraph protocol buffer (proto)
saver = tf.train.import_meta_graph("./tmp/my_model_final.ckpt.meta")
show_graph(tf.get_default_graph())

In order to continue the training of a saved model, it is necessary to restore the graph and obtain the operations required.

In [91]:
tf.reset_default_graph()
saver = tf.train.import_meta_graph("./tmp/my_model_final.ckpt.meta")

X = tf.get_default_graph().get_operation_by_name("X")
y = tf.get_default_graph().get_operation_by_name("y")
train_step = tf.get_default_graph().get_operation_by_name("train_step")

KeyError: "The name 'train_step' refers to an Operation not in the graph."

In [None]:
for op in _81.get_operations():
    print(op.name)

In [81]:
tf.get_default_graph()

<tensorflow.python.framework.ops.Graph at 0x1c4086ca58>

In [85]:
_81.get_operation_by_name("X")

<tf.Operation 'X' type=Placeholder>

In [87]:
_81.get_tensor_by_name("X:0")

<tf.Tensor 'X:0' shape=(?, 784) dtype=float32>

In [84]:
_81.get_operations()

[<tf.Operation 'X' type=Placeholder>,
 <tf.Operation 'y' type=Placeholder>,
 <tf.Operation 'is_training' type=Placeholder>,
 <tf.Operation 'hidden1/weights/Initializer/truncated_normal/shape' type=Const>,
 <tf.Operation 'hidden1/weights/Initializer/truncated_normal/mean' type=Const>,
 <tf.Operation 'hidden1/weights/Initializer/truncated_normal/stddev' type=Const>,
 <tf.Operation 'hidden1/weights/Initializer/truncated_normal/TruncatedNormal' type=TruncatedNormal>,
 <tf.Operation 'hidden1/weights/Initializer/truncated_normal/mul' type=Mul>,
 <tf.Operation 'hidden1/weights/Initializer/truncated_normal' type=Add>,
 <tf.Operation 'hidden1/weights' type=VariableV2>,
 <tf.Operation 'hidden1/weights/Assign' type=Assign>,
 <tf.Operation 'hidden1/weights/read' type=Identity>,
 <tf.Operation 'dnn/hidden1/MatMul' type=MatMul>,
 <tf.Operation 'dnn/hidden1/BatchNorm/Reshape/shape' type=Const>,
 <tf.Operation 'dnn/hidden1/BatchNorm/Reshape' type=Reshape>,
 <tf.Operation 'hidden1/BatchNorm/beta/Initia

In [None]:
epochs = 10
for epochs in range(epochs):
    sess.run()

In [31]:
reuse_vars_dict

{'hidden1/weights:0': 'hidden1/weights:0',
 'hidden1/BatchNorm/beta:0': 'hidden1/BatchNorm/beta:0',
 'hidden2/weights:0': 'hidden2/weights:0',
 'hidden2/BatchNorm/beta:0': 'hidden2/BatchNorm/beta:0',
 'hidden3/weights:0': 'hidden3/weights:0',
 'hidden3/BatchNorm/beta:0': 'hidden3/BatchNorm/beta:0'}

**9. Transfer learning.**

a. Create a new DNN that reuses all the pretrained hidden layers of the previous model, freezes them, and replaces the softmax output layer with a fresh new one.

b. Train this new DNN on digits 5 to 9, using only 100 images per digit, and time how long it takes. Despite this small number of examples, can you achieve high precision?

c. Try caching the frozen layers, and train the model again: how much faster is it now?

d. Try again reusing just four hidden layers instead of five. Can you achieve a higher precision?

e. Now unfreeze the top two hidden layers and continue training: can you get the model to perform even better?

**10. Pretraining on an auxiliary task.**

a. In this exercise you will build a DNN that compares two MNIST digit images and predicts whether they represent the same digit or not. Then you will reuse the lower layers of this network to train an MNIST classifier using very little training data. Start by building two DNNs (let’s call them DNN A and B), both similar to the one you built earlier but without the output layer: each DNN should have five hidden layers of 100 neurons each, He initialization, and ELU activation. Next, add a single output layer on top of both DNNs. You should use TensorFlow’s concat() function with axis=1 to concatenate the outputs of both DNNs along the horizontal axis, then feed the result to the output layer. This output layer should contain a single neuron using the logistic activation function.

b. Split the MNIST training set in two sets: split #1 should containing 55,000 images, and split #2 should contain contain 5,000 images. Create a function that generates a training batch where each instance is a pair of MNIST images picked from split #1. Half of the training instances should be pairs of images that belong to the same class, while the other half should be images from different classes. For each pair, the training label should be 0 if the images are from the same class, or 1 if they are from different classes.

c. Train the DNN on this training set. For each image pair, you can simultaneously feed the first image to DNN A and the second image to DNN B. The whole network will gradually learn to tell whether two images belong to the same class or not.

d. Now create a new DNN by reusing and freezing the hidden layers of DNN A and adding a softmax output layer on with 10 neurons. Train this network on split #2 and see if you can achieve high performance despite having only 500 images per class.