In [1]:
!cat ../bookGithubRepoURL.txt

https://github.com/ageron/handson-ml


In [1]:
from functools import partial
from tensorflow.contrib.layers import dropout, variance_scaling_initializer 
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import tensorflow as tf

%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 9]

## Batch Normalization
### Implementation in TF

In [3]:
tf.reset_default_graph()

mnist = input_data.read_data_sets("/tmp/data/")
n_in      = 28 * 28 # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_out     = 10
eta       = 0.01
momentum  = 0.25 

X = tf.placeholder(tf.float32, shape = (None, n_in), name = 'X')
y = tf.placeholder(tf.int64, shape = (None), name = 'y')
is_training = tf.placeholder(tf.bool, shape = (), name = 'is_training')

with tf.name_scope('dnn'):
    he_init = variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training = is_training,
                                  momentum = 0.9) # formerly 'decay'
    my_dense_layer = partial(tf.layers.dense, kernel_initializer = he_init)
    
    hidden1 = my_dense_layer(X, n_hidden1, name = 'hidden1')
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name = 'hidden2')
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(
        bn2, n_out, activation = None, name = 'outputs')
    logits = my_batch_norm_layer(logits_before_bn)
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [4]:
with tf.name_scope('loss'):
    x_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels = y, logits = logits)
    loss = tf.reduce_mean(x_entropy, name = 'loss')

In [5]:
with tf.name_scope('train'):
    optimizer = tf.train.MomentumOptimizer(eta, momentum)
    training_op = optimizer.minimize(loss)

In [6]:
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
# saver = tf.train.Saver()

In [7]:
n_epochs = 40
batch_size = 200

with tf.Session() as s:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(len(mnist.test.labels) // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            s.run(
                [training_op, extra_update_ops], 
                feed_dict = { is_training: True, X: X_batch, y: y_batch })
        acc_train = accuracy.eval(
            feed_dict = { is_training: False, X: X_batch, y: y_batch })
        acc_test = accuracy.eval(feed_dict = { is_training: False, 
                                               X: mnist.test.images, 
                                               y: mnist.test.labels })
        print('%2d Train accuracy: %.4f   Test accuracy: %.4f'
              %(epoch, acc_train, acc_test))
        # save_path = saver.save(s, 'my_batch_norm_mod.ckpt')

 0 Train accuracy: 0.7750   Test accuracy: 0.7606
 1 Train accuracy: 0.8150   Test accuracy: 0.8296
 2 Train accuracy: 0.8850   Test accuracy: 0.8550
 3 Train accuracy: 0.8900   Test accuracy: 0.8687
 4 Train accuracy: 0.8900   Test accuracy: 0.8798
 5 Train accuracy: 0.8450   Test accuracy: 0.8869
 6 Train accuracy: 0.9100   Test accuracy: 0.8935
 7 Train accuracy: 0.9000   Test accuracy: 0.8968
 8 Train accuracy: 0.8750   Test accuracy: 0.9017
 9 Train accuracy: 0.8750   Test accuracy: 0.9038
10 Train accuracy: 0.9350   Test accuracy: 0.9098
11 Train accuracy: 0.9000   Test accuracy: 0.9128
12 Train accuracy: 0.9250   Test accuracy: 0.9147
13 Train accuracy: 0.9400   Test accuracy: 0.9162
14 Train accuracy: 0.9250   Test accuracy: 0.9185
15 Train accuracy: 0.9000   Test accuracy: 0.9212
16 Train accuracy: 0.9200   Test accuracy: 0.9230
17 Train accuracy: 0.9300   Test accuracy: 0.9251
18 Train accuracy: 0.9350   Test accuracy: 0.9270
19 Train accuracy: 0.9450   Test accuracy: 0.9287


### Gradient Clipping
To prevent exploding/vanishing gradients, simply clip the gradient to some threshold value (mostly useful for recurrent nns). 
NOTE: Batch normalization is generally preferred

In [8]:
threshold = 1.
optimizer = tf.train.GradientDescentOptimizer(eta)
grads_and_vars = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
              for grad, var in grads_and_vars]
training_op = optimizer.apply_gradients(capped_gvs)

# run this training_op at every training step

## Reusing Pretrained Layers
### Reusing a TF Model

```
with tf.Session() as s:
    saver.restore(s, './pretrained_model.ckpt')
    # ...```

To reuse only SOME layers, Saver must be configured accordingly

```
# <Build new model with same def as before for hidden layers 1 - 3>
init = tf.global_variables_initializer()

reuse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                               scope = 'hidden[123]')
reuse_vars_dict = dict(
    [(var.name, var.name) for var in reuse_vars])
# saver to restore original mod
original_saver = tf.Saver(reuse_vars) 
# ...and to save new mod
new_saver = tf.Saver()                

with tf.Session() as s:
    s.run(init)
    # restore layers 1-3
    original_saver.restore('./pretrained_model.ckpt') 
    # <Train new mod>
    new_saver.save('./new_model.ckpt') # save whole model```

## Faster Optimizers

### Momentum Optimization

In [9]:
optimizer = tf.train.MomentumOptimizer(learning_rate = eta, momentum = 0.9)

### Nesterov Accelerated Gradient

In [10]:
optimizer = tf.train.MomentumOptimizer(
    learning_rate = eta, momentum = 0.9, use_nesterov = True)

### AdaGrad
### RMSProp

In [11]:
optimizer = tf.train.RMSPropOptimizer(
    learning_rate = eta, momentum = 0.9, decay = 0.9, epsilon = 1e-10)

### Adam Optimization
This is a combination of many of the above, and generally yields the best results

In [12]:
optimizer = tf.train.AdamOptimizer(learning_rate = eta)

## Learning Rate Scheduling

In [13]:
# e.g.
initial_learning_rate = 0.1
decay_steps = 10000
decay_rate = 1/10
global_step = tf.Variable(0, trainable = False)
learning_rate = tf.train.exponential_decay(
    initial_learning_rate, global_step, decay_steps, decay_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum = 0.9)
training_op = optimizer.minimize(loss, global_step = global_step)

## Regularization to Avoid Overfitting

### _L_<sub>1</sub> and _L_<sub>2</sub> Regularization

In [16]:
# ...constuct a network w/ say, one hidden layer with weights W1, and one 
# output layer with W2
base_loss = tf.reduce_mean(xentropy, name='avg_xentropy')
reg_losses = tf.reduce_sum(tf.abs(W1)) + tf.reduce_sum(tf.abs(W2))
loss = tf.add(base_loss, scale * reg_losses, name = 'loss')

In [None]:
# But for applying to many layers, the following is more efficient
with arg_scope(
    [fully_connected],
    weights_regularizer = tf.contrib.layers.l1_regularizer(scale=0.01)):
    
    hidden1 = fully_connected(X, n_hidden1, scope='hidden1')
    hidden2 = fully_connected(hidden1, n_hidden2, scope='hidden2')
    logits = fully_connected(
        hidden2, n_outputs, activation_fn=None, scope='out')
    
# Then add to loss function
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([base_loss] + reg_losses, name='loss')

### Dropout 
Note: if overfitting, increase the dropout rate; if underfitting, decrease the dropout rate

In [None]:
#...
is_training = tf.placeholder(tf.bool, shape=(), name='is_training')
keep_prob = 0.5

X_drop = dropout(X, keep_prob, is_training=is_training)

hidden1 = fully_connected(X_drop, n_hidden1, scope='hidden1')
hidden1_drop = dropout(hidden1, keep_prob, is_training=is_training)

hidden2 = fully_connected(hidden1_drop, n_hidden2, scope='hidden2')
hidden2_drop = dropout(hidden2, keep_prob, is_training=is_training)

logits = fully_connected(
    hidden2_drop, n_outputs, activation_fn=None, scope='outputs')

### Max-Norm Regularization

In [None]:
threshold = 1. # max norm for w, the vector of incoming weights to a neuron
clipped_weights = tf.clip_by_norm(weights, clip_norm=threshold, axes=1)
clip_weights = tf.assign(weights, clipped_weights)

# Apply at each step
with tf.Session() as s:
    #...
    for epoch in range(n_epochs):
        #...
        for X_batch, y_batch in zip(X_batches, y_batches):
            s.run(training_op, feed_dict={ X: X_batch, y: y_batch })
            clip_weights.eval()

In [None]:
# to get access to the weights at each layer
hidden1 = fully_connected(X, n_hidden1, scope='hidden1')
with tf.variable_scope('hidden1', reuse=True):
    weights1 = tf.get_variable('weights')
    
# OR
hidden1 = fully_connected(X, n_hidden1, scope='hidden1')
hidden2 = fully_connected(hidden1, n_hidden2, scope='hidden2')
#...

with tf.variable_scope('', default_name='', reuse=True): # root scope
    w1 = tf.get_variable('hidden1/weights')
    w2 = tf.get_variable('hidden2/weights')

In [3]:
# To see a list of variables
for var in tf.global_variables():
    print(var.name)

In [4]:
# Cleaner solution
def max_norm_regularizer(
    threshold, axes=1, name='max_norm', collection='max_norm'):
    
    def max_norm(weights):
        clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)
        clip_weights = tf.assign(weights, clipped, name=name)
        tf.add_to_collection(collection, clip_weights)
        return None # no regularization loss term
    return max_norm

In [None]:
# Can be used as...
max_norm_reg = max_norm_regularizer(threshold=1.)
hidden1 = fully_connected(
    X, n_hidden1, scope='hidden1', weights_regularizer=max_norm_reg)

In [None]:
# Clipping still has to be done after each step
clip_all_weights = tf.get_collection('max_norm')

with tf.Session() as s:
    # ...
    for epoch in range(n_epochs):
        # ...
        for X_batch, y_batch in zip(X_batches, y_batches):
            s.run(training_op, feed_dict={ X: X_batch, y: y_batch })
            s.run(clip_all_weights)