# Study of effects of Params on Training

1. Weight initialization (random normal - scaled by 1/sqrt(input nodes))
2. Activation functions (sigmoid, relu)
3. Impact of learning rate on various types of activation functions
4. Impact of vanishing gradients in the input layer on training time (sigmoid activation vs. relu activation)

Observe pre-activation/post-activation values in each layer (tensorboard)
Weights and gradients distribution (tensorboard)

For each run, make sure that you write to a different log directory. Once all runs are complete, start TensorBoard with every run directory listed, for example:

sudo tensorboard --logdir relu_lr_0_01:./relu_lr_0_01,relu_lr_0_1:./relu_lr_0_1,relu_lr_0_1_rn:./relu_lr_0_1_rn,relu_lr_0_01_rns:./relu_lr_0_01_rns,sigmoid_lr_0_01:./sigmoid_lr_0_01,sigmoid_lr_0_1:./sigmoid_lr_0_1,sigmoid_lr_0_01_rns:./sigmoid_lr_0_01_rns

In [11]:
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf

# Trains a 784-200-200-10 fully connected ReLU network on MNIST with plain
# gradient descent and logs weights, biases, pre-/post-activations, gradients,
# cost and test accuracy to TensorBoard.

# The graph-level seed has to be set BEFORE any random op is created;
# setting it after the graph is built does not affect the initializers below.
# NOTE(review): batch shuffling inside `mnist.train.next_batch` is not
# controlled by this seed -- confirm if fully repeatable runs are needed.
tf.set_random_seed(1)

# Read data set
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

# Inputs and labels
with tf.name_scope("input/labels"):
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])

# Weights/bias for first hidden layer (weights scaled by 1/sqrt(fan-in))
with tf.name_scope("hidden_layer_1_weights"):
    W1 = tf.Variable(tf.truncated_normal([784, 200]) * (1 / tf.sqrt(784.0)))
    b1 = tf.Variable(tf.truncated_normal([200]))
    w1_hist = tf.summary.histogram("hidden_layer_1_W1", W1)
    b1_hist = tf.summary.histogram("hidden_layer_1_B1", b1)
    regularizer_w1 = tf.nn.l2_loss(W1)

# Weights/bias for second hidden layer (weights scaled by 1/sqrt(fan-in))
with tf.name_scope("hidden_layer_2_weights"):
    W2 = tf.Variable(tf.truncated_normal([200, 200]) * (1 / tf.sqrt(200.0)))
    b2 = tf.Variable(tf.truncated_normal([200]))
    w2_hist = tf.summary.histogram("hidden_layer_2_W2", W2)
    b2_hist = tf.summary.histogram("hidden_layer_2_B2", b2)
    regularizer_w2 = tf.nn.l2_loss(W2)

# Weights/bias for output layer (weights scaled by 1/sqrt(fan-in))
with tf.name_scope("output_layer_weights"):
    W3 = tf.Variable(tf.truncated_normal([200, 10]) * (1 / tf.sqrt(200.0)))
    b3 = tf.Variable(tf.truncated_normal([10]))
    w3_hist = tf.summary.histogram("output_layer_W3", W3)
    b3_hist = tf.summary.histogram("output_layer_B3", b3)
    regularizer_w3 = tf.nn.l2_loss(W3)

# Computational graph: first hidden layer (Z = pre-activation, A = activation)
with tf.name_scope("hidden_1"):
    Z2 = tf.matmul(x, W1) + b1
    A2 = tf.nn.relu(Z2)
    z2_hist = tf.summary.histogram("hidden_layer_1_Z2", Z2)
    a2_hist = tf.summary.histogram("hidden_layer_1_A2", A2)

# Computational graph: second hidden layer
with tf.name_scope("hidden_2"):
    Z3 = tf.matmul(A2, W2) + b2
    A3 = tf.nn.relu(Z3)
    z3_hist = tf.summary.histogram("hidden_layer_2_Z3", Z3)
    a3_hist = tf.summary.histogram("hidden_layer_2_A3", A3)

# Computational graph: output layer. `y` holds raw logits; the softmax is
# applied inside the loss op below.
with tf.name_scope("output"):
    y = tf.matmul(A3, W3) + b3
    y_hist = tf.summary.histogram("output_y", y)

# Cost function: mean softmax cross-entropy plus an L2 penalty (factor 0.01)
# on all three weight matrices. Biases are not regularized.
with tf.name_scope("cost"):
    regularizer = regularizer_w1 + regularizer_w2 + regularizer_w3
    ce_cost = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
    cross_entropy = tf.reduce_mean(ce_cost + 0.01 * regularizer)

# Optimizer
with tf.name_scope("train"):
    # Explicit gradient ops exist only so they can be logged as histograms;
    # the optimizer below computes its own gradients.
    w1_grads, b1_grads = tf.gradients(cross_entropy, [W1, b1])
    w2_grads, b2_grads = tf.gradients(cross_entropy, [W2, b2])
    w3_grads, b3_grads = tf.gradients(cross_entropy, [W3, b3])
    w1_grads_hist = tf.summary.histogram("w1_gradients", w1_grads)
    b1_grads_hist = tf.summary.histogram("b1_gradients", b1_grads)
    w2_grads_hist = tf.summary.histogram("w2_gradients", w2_grads)
    b2_grads_hist = tf.summary.histogram("b2_gradients", b2_grads)
    w3_grads_hist = tf.summary.histogram("w3_gradients", w3_grads)
    b3_grads_hist = tf.summary.histogram("b3_gradients", b3_grads)

    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.01
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 1000, 0.25, staircase=True)
    # `global_step` must be handed to `minimize` so it is incremented on each
    # update; otherwise it stays at 0 and the decay schedule is never applied.
    # With staircase decay every 1000 steps, the rate stays at 0.01 for the
    # 1000 training steps run below.
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy, global_step=global_step)

# Create session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

#batch_xs, batch_ys = mnist.train.next_batch(100)
#dJdW = tf.matmul(tf.transpose(x), tf.gradients(cross_entropy, y)[0])
#a = sess.run(dJdW, feed_dict={x: batch_xs, y_: batch_ys})
#b = sess.run(tf.gradients(cross_entropy, W), feed_dict={x: batch_xs, y_: batch_ys})[0]

# Test trained model
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Tensorboard: use a different log directory for every run configuration
writer = tf.summary.FileWriter("./relu_lr_0_01_s", graph=tf.get_default_graph())
cost_summary = tf.summary.scalar("cross-entropy", cross_entropy)
acc_summary = tf.summary.scalar("accuracy", accuracy)
params_summary = tf.summary.merge([w1_hist, b1_hist, w2_hist, b2_hist, w3_hist, b3_hist, z2_hist, a2_hist, z3_hist, a3_hist, y_hist])
grad_summary = tf.summary.merge([w1_grads_hist, b1_grads_hist, w2_grads_hist, b2_grads_hist, w3_grads_hist, b3_grads_hist])

# Train
for step in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    # One update on a mini-batch of 100; cost and histograms come from the
    # same `sess.run` call as the update.
    _, cost, cross_entropy_summary, params_hist_summary, gradients_hist_summary = sess.run(
        [train_step, cross_entropy, cost_summary, params_summary, grad_summary],
        feed_dict={x: batch_xs, y_: batch_ys})
    # Accuracy is evaluated on the full test set after every step.
    test_acc, test_acc_summary = sess.run(
        [accuracy, acc_summary],
        feed_dict={x: mnist.test.images, y_: mnist.test.labels})

    writer.add_summary(cross_entropy_summary, step)
    writer.add_summary(test_acc_summary, step)
    writer.add_summary(params_hist_summary, step)
    writer.add_summary(gradients_hist_summary, step)
    # %-formatting keeps the "step cost accuracy" output identical under both
    # Python 2 and Python 3 (the original used a Python 2-only print statement).
    print("%s %s %s" % (step, cost, test_acc))

# Push any buffered events to disk so TensorBoard sees the complete run.
writer.flush()



Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
0 4.91659 0.1024
1 4.41419 0.1064
2 4.59433 0.1037
3 4.38532 0.1064
4 4.29262 0.1082
5 4.30954 0.1095
6 4.13256 0.1098
7 4.02793 0.1106
8 3.97059 0.1121
9 3.97476 0.1146
10 3.96859 0.1164
11 4.02341 0.1153
12 3.96837 0.1178
13 3.96698 0.1194
14 3.92758 0.1257
15 3.99282 0.1296
16 3.90094 0.1386
17 3.91036 0.1584
18 3.92658 0.1596
19 3.92178 0.1659
20 3.8608 0.177
21 3.91181 0.1868
22 3.85249 0.1886
23 3.88418 0.1844
24 3.88471 0.1987
25 3.85113 0.1984
26 3.86654 0.1923
27 3.84546 0.1712
28 3.81033 0.2015
29 3.817 0.2245
30 3.80655 0.225
31 3.83993 0.2195
32 3.81013 0.2222
33 3.8378 0.2278
34 3.81357 0.2403
35 3.77572 0.235
36 3.80256 0.2272
37 3.83527 0.2461
38 3.79487 0.2754
39 3.771 0.2931
40 3.77536 0.3131
41 3.78498 0.3191
42 3.78226 0.3114
43 3.80447 0.3232
44 3.75306 0.3239
45 3.77484 0.36