In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import sys

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Replace the default TensorFlow graph with an empty one and re-seed the random generators.

    Args:
        seed: Value handed to both ``tf.set_random_seed`` and ``np.random.seed``.
    """
    # Reset before seeding so that the seed is applied after the old graph is discarded.
    tf.reset_default_graph()
    for apply_seed in (tf.set_random_seed, np.random.seed):
        apply_seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "autoencoders"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``.
    The target directory is created when it does not exist yet, so the call
    also works in a fresh working directory.

    Args:
        fig_id: File name (without extension) of the figure.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
    """
    fig_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # savefig does not create missing directories; do it here.
    # (isdir + makedirs instead of exist_ok=True keeps Python 2 support.)
    if not os.path.isdir(fig_dir):
        os.makedirs(fig_dir)
    path = os.path.join(fig_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

import numpy.random as rnd
import tensorflow as tf
import time

  from ._conv import register_converters as _register_converters


In [2]:
def plot_image(image, shape=[28, 28]):
    """Draw a single image in grey levels, without axis decorations.

    Args:
        image: Array of pixel values; it is reshaped to ``shape`` before display.
        shape: Target shape handed to ``image.reshape``.
    """
    pixels = image.reshape(shape)
    plt.imshow(pixels, cmap="Greys", interpolation="nearest")
    plt.axis("off")

In [3]:
def plot_multiple_images(images, n_rows, n_cols, pad=2):
    """Tile a batch of equally sized images into one padded grid and display it.

    Args:
        images: Array of shape (n_images, height, width); the first
            ``n_rows * n_cols`` images are drawn, row by row.
        n_rows: Number of image rows in the grid.
        n_cols: Number of image columns in the grid.
        pad: Width, in pixels, of the border around every image.
    """
    images = images - images.min()  # make the minimum == 0, so the padding looks white
    # Axis 1 is the pixel-row count (height) and axis 2 the pixel-column count
    # (width). Unpacking them the other way round only works for square images.
    h, w = images.shape[1:]
    image = np.zeros(((h + pad) * n_rows + pad, (w + pad) * n_cols + pad))
    for y in range(n_rows):
        top = y * (h + pad) + pad
        for x in range(n_cols):
            left = x * (w + pad) + pad
            image[top:top + h, left:left + w] = images[y * n_cols + x]
    plt.imshow(image, cmap="Greys", interpolation="nearest")
    plt.axis("off")

In [4]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Return a copy of ``graph_def`` with large constant payloads replaced by a placeholder.

    Args:
        graph_def: Graph definition whose ``node`` entries are copied.
        max_const_size: Largest ``tensor_content`` size, in bytes, that is kept as is.

    Returns:
        A new ``tf.GraphDef`` holding the copied (and possibly stripped) nodes.
    """
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add()
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                # tensor_content holds raw bytes, so the placeholder has to be a
                # bytes value as well (a text string is rejected on Python 3).
                # Encoding the formatted text works on Python 2 and 3 alike.
                tensor.tensor_content = ("<stripped %d bytes>" % size).encode("ascii")
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Render a TensorFlow graph inside the notebook through an embedded HTML iframe.

    Args:
        graph_def: A graph definition, or any object exposing ``as_graph_def()``.
        max_const_size: Forwarded to ``strip_consts``.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    stripped = strip_consts(graph_def, max_const_size=max_const_size)

    # Values substituted into the page template: the textual graph as a quoted
    # literal, and a random element id.
    pbtxt_literal = repr(str(stripped))
    element_id = 'graph' + str(np.random.rand())

    page_template = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    page = page_template.format(data=pbtxt_literal, id=element_id)

    frame_template = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """
    # The page goes into the srcdoc attribute, so its double quotes are escaped.
    escaped_page = page.replace('"', '&quot;')
    display(HTML(frame_template.format(escaped_page)))

In [5]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [6]:
def show_reconstructed_digits(X, outputs, model_path = None, n_test_digits = 2):
    """Plot the first MNIST test digits next to their reconstructions.

    Args:
        X: Input placeholder of the model.
        outputs: Tensor holding the model's reconstruction of ``X``.
        model_path: Optional checkpoint path; when given, it is restored with
            the notebook-level ``saver`` before evaluating.
        n_test_digits: Number of test digits to show (one figure row each).
    """
    with tf.Session() as sess:
        if model_path:
            saver.restore(sess, model_path)
        X_test = mnist.test.images[:n_test_digits]
        outputs_val = sess.run(outputs, feed_dict={X: X_test})

    plt.figure(figsize=(8, 3 * n_test_digits))
    # Left column: original digit, right column: reconstruction.
    for row in range(n_test_digits):
        for col, batch in enumerate((X_test, outputs_val)):
            plt.subplot(n_test_digits, 2, 2 * row + col + 1)
            plot_image(batch[row])

# PCA with a linear Autoencoder

Build the data set:

In [None]:
def prepare3DDataSet(seed=4):
    """Generate 200 noisy 3-D points that lie close to a plane.

    Args:
        seed: Seed for NumPy's global random generator (``rnd.seed``).

    Returns:
        Array of shape (200, 3). The third column is a noisy linear
        combination of the first two.
    """
    rnd.seed(seed)
    n_samples = 200
    weight_x, weight_y = 0.1, 0.3
    noise_scale = 0.1

    theta = rnd.rand(n_samples) * 3 * np.pi / 2 - 0.5
    # The three noise vectors are drawn in column order, which keeps the
    # generated values reproducible for a given seed.
    x = np.cos(theta) + np.sin(theta)/2 + noise_scale * rnd.randn(n_samples) / 2
    y = np.sin(theta) * 0.7 + noise_scale * rnd.randn(n_samples) / 2
    z = x * weight_x + y * weight_y + noise_scale * rnd.randn(n_samples)
    return np.column_stack((x, y, z))

In [None]:
rnd.rand(3)

In [None]:
def standardizeSplitData(data,trainRatio=0.5):
    """Split ``data`` into a leading training part and a trailing test part, then standardize both.

    The scaler is fitted on the training part only and then applied to the test part.

    Args:
        data: 2-D array, one sample per row.
        trainRatio: Fraction of the rows (taken from the start) used for training.

    Returns:
        Tuple ``(X_train, X_test)`` of standardized arrays.
    """
    from sklearn.preprocessing import StandardScaler
    n_total, _ = np.shape(data)  # unpacking into two values requires 2-D input
    split_at = np.int32(n_total * trainRatio)
    train_part, test_part = data[:split_at], data[split_at:]
    scaler = StandardScaler()
    return scaler.fit_transform(train_part), scaler.transform(test_part)

In [None]:
d=prepare3DDataSet()
X_train,X_test = standardizeSplitData(d,0.5)

In [None]:
# Linear autoencoder: 3 inputs -> 2 codings -> 3 outputs.
reset_graph()

n_inputs = 3
n_hidden = 2  # codings
n_outputs = n_inputs  # the network reconstructs its own input

learning_rate = 0.01

with tf.name_scope("network"):
  X = tf.placeholder(tf.float32, shape=[None, n_inputs])
  # No activation function is passed to either dense layer.
  hidden = tf.layers.dense(X, n_hidden)
  outputs = tf.layers.dense(hidden, n_outputs)

with tf.name_scope("loss"):
  # Mean squared error between the reconstruction and the input.
  reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

with tf.name_scope("train"):
  optimizer = tf.train.AdamOptimizer(learning_rate)
  training_op = optimizer.minimize(reconstruction_loss)

init = tf.global_variables_initializer()

In [None]:
#show_graph(tf.get_default_graph())

In [None]:
n_iterations = 1000
codings = hidden  # the hidden layer output is the 2-D coding of each sample

with tf.Session() as sess:
    init.run()
    # Full-batch training: every step feeds the whole training set.
    for iteration in range(n_iterations):
        training_op.run(feed_dict={X: X_train})
    # Encode the held-out samples with the trained network.
    codings_val = codings.eval(feed_dict={X: X_test})

The 3D data projected onto a 2D plane:

In [None]:
fig = plt.figure(figsize=(4,3))
plt.plot(codings_val[:,0], codings_val[:, 1], "b.")
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)

plt.show()

In [None]:

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(4,3))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train[:,0], X_train[:,1], X_train[:,2])
#ax.plot3D(X_train[:,0], X_train[:,1], X_train[:,2])


plt.show()

# Stacked Autoencoders

![title](img/Sandwich_autoencoder.png)

Load the MNIST data set:

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")

In [None]:

# Stacked autoencoder: 784 -> 300 -> 150 (codings) -> 300 -> 784.
reset_graph()

from functools import partial

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1  # the decoder mirrors the encoder
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0001
# Other regularization strengths that were tried:
#l2_reg = 0.01
#l2_reg=0.0000000000000001
with  tf.name_scope("input"):
  X = tf.placeholder(tf.float32, shape=[None, n_inputs])

with tf.name_scope("network"):
  he_init = tf.contrib.layers.variance_scaling_initializer() # He initialization
  #Equivalent to:
  #he_init = lambda shape, dtype=tf.float32: tf.truncated_normal(shape, 0., stddev=np.sqrt(2/shape[0]))
  l2_regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
  # Dense layer with the shared defaults (ELU, He init, L2 kernel penalty) pre-filled.
  my_dense_layer = partial(tf.layers.dense,
                           activation=tf.nn.elu,
                           kernel_initializer=he_init,
                           kernel_regularizer=l2_regularizer)

  hidden1 = my_dense_layer(X, n_hidden1)
  hidden2 = my_dense_layer(hidden1, n_hidden2)
  hidden3 = my_dense_layer(hidden2, n_hidden3)
  # The output layer overrides the default activation with None.
  outputs = my_dense_layer(hidden3, n_outputs, activation=None)

with tf.name_scope("loss"):
  reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
  # Collect the penalties registered through kernel_regularizer and add them to the MSE.
  reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
  loss = tf.add_n([reconstruction_loss] + reg_losses)
with tf.name_scope("train"):
  optimizer = tf.train.AdamOptimizer(learning_rate)
  training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver() # not shown in the book

In [None]:
n_epochs = 5
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # "\r" rewrites the same output line, giving an in-place progress percentage.
            print("\r{}%".format(100 * iteration // n_batches), end="") # not shown in the book
            sys.stdout.flush()                                          # not shown
            # The labels are not used: the autoencoder is trained on the images only.
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # The reported MSE is measured on the last mini-batch of the epoch only.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})   # not shown
        print("\r{}".format(epoch), "Train MSE:", loss_train)           # not shown
        # The checkpoint is overwritten after every epoch.
        saver.save(sess, "./my_model_all_layers.ckpt")                  # not shown

In [None]:
show_reconstructed_digits(X, outputs, "./my_model_all_layers.ckpt")


# Tying weights

It is common to tie the weights of the encoder and the decoder (weights_decoder = tf.transpose(weights_encoder)). Unfortunately this makes it impossible (or very tricky) to use the tf.layers.dense() function, so we need to build the Autoencoder manually:

In [None]:
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0005

In [None]:
activation = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
initializer = tf.contrib.layers.variance_scaling_initializer()

with tf.name_scope("input"):
  X = tf.placeholder(tf.float32, shape=[None, n_inputs])

with tf.name_scope("network"):
  # Only the two encoder weight matrices are created as variables.
  weights1_init = initializer([n_inputs, n_hidden1])
  weights2_init = initializer([n_hidden1, n_hidden2])

  weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
  weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
  # The decoder weights are transposes of the encoder weights, not separate variables.
  weights3 = tf.transpose(weights2, name="weights3")  # tied weights
  weights4 = tf.transpose(weights1, name="weights4")  # tied weights

  # Biases are not tied: every layer has its own.
  biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
  biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
  biases3 = tf.Variable(tf.zeros(n_hidden3), name="biases3")
  biases4 = tf.Variable(tf.zeros(n_outputs), name="biases4")

  hidden1 = activation(tf.matmul(X, weights1) + biases1)
  hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
  hidden3 = activation(tf.matmul(hidden2, weights3) + biases3)
  outputs = tf.matmul(hidden3, weights4) + biases4  # no activation on the output layer

with tf.name_scope("loss"):
  reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
  # Only weights1 and weights2 are penalized; weights3/weights4 are their transposes.
  reg_loss = regularizer(weights1) + regularizer(weights2)
  loss = reconstruction_loss + reg_loss

with tf.name_scope("training"):
  optimizer = tf.train.AdamOptimizer(learning_rate)
  training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
#show_graph(tf.get_default_graph())

In [None]:

n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # In-place progress percentage ("\r" returns to the start of the line).
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # MSE on the last mini-batch of the epoch only.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        # The checkpoint is overwritten after every epoch.
        saver.save(sess, "./my_model_tying_weights.ckpt")

In [None]:
show_reconstructed_digits(X, outputs, "./my_model_tying_weights.ckpt")

# Training one Autoencoder at a time in multiple graphs <br>
to speed up the training

There are many ways to train one Autoencoder at a time. The first approach is to train each Autoencoder using a different graph, and then create the Stacked Autoencoder by simply initializing it with the weights and biases copied from these Autoencoders.

Let's create a function that will train one autoencoder and return the transformed training set (i.e., the output of the hidden layer) and the model parameters.

![title](img/TrainAutoencoderMultipleGraphs.png)

In [None]:
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0005
from functools import partial

def train_autoencoder(X_train, n_neurons, n_epochs, batch_size,
                      learning_rate = 0.01, l2_reg = 0.0005,
                      activation=tf.nn.elu, seed=42):
    """Train a one-hidden-layer autoencoder on ``X_train`` in a graph of its own.

    Args:
        X_train: 2-D array of training instances, one per row;
            ``X_train.shape[1]`` is used as the input (and output) size.
        n_neurons: Number of units in the hidden layer.
        n_epochs: Number of training epochs.
        batch_size: Number of instances per training step.
        learning_rate: Learning rate handed to the Adam optimizer.
        l2_reg: Scale of the L2 penalty applied to both layer kernels.
        activation: Activation of the hidden layer (the output layer has none).
        seed: Graph-level TensorFlow seed. Mini-batch sampling uses
            ``rnd.permutation``, which is not re-seeded here.

    Returns:
        Tuple ``(hidden_val, hidden_kernel, hidden_bias, outputs_kernel,
        outputs_bias)`` where ``hidden_val`` is the hidden layer output for
        the whole of ``X_train`` and the rest are the trained parameters.
    """
    # A private graph keeps this autoencoder separate from the default graph.
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(seed)

        n_inputs = X_train.shape[1]

        X = tf.placeholder(tf.float32, shape=[None, n_inputs])

        my_dense_layer = partial(
            tf.layers.dense,
            activation=activation,
            kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        # The layer names fix the variable names looked up in `params` below.
        hidden = my_dense_layer(X, n_neurons, name="hidden")
        outputs = my_dense_layer(hidden, n_inputs, activation=None, name="outputs")

        reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n([reconstruction_loss] + reg_losses)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        training_op = optimizer.minimize(loss)

        init = tf.global_variables_initializer()

    with tf.Session(graph=graph) as sess:
        init.run()
        for epoch in range(n_epochs):
            n_batches = len(X_train) // batch_size
            for iteration in range(n_batches):
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                # A fresh permutation per step: each batch is sampled without
                # replacement, but an epoch is not guaranteed to visit every instance.
                indices = rnd.permutation(len(X_train))[:batch_size]
                X_batch = X_train[indices]
                sess.run(training_op, feed_dict={X: X_batch})
            # NOTE(review): X_batch is undefined here when len(X_train) < batch_size
            # (n_batches == 0) - confirm callers never pass such input.
            loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
            print("\r{}".format(epoch), "Train MSE:", loss_train)
        # Map variable name -> trained value for every trainable variable.
        params = dict([(var.name, var.eval()) for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)])
        hidden_val = hidden.eval(feed_dict={X: X_train})
        return hidden_val, params["hidden/kernel:0"], params["hidden/bias:0"], params["outputs/kernel:0"], params["outputs/bias:0"]

Now let's train two Autoencoders. The first one is trained on the training data, and the second is trained on the previous Autoencoder's hidden layer output:

In [None]:
hidden_output, W1, b1, W4, b4 = train_autoencoder(mnist.train.images, n_neurons=300, n_epochs=4, batch_size=150)
_, W2, b2, W3, b3 = train_autoencoder(hidden_output, n_neurons=150, n_epochs=4, batch_size=150)

Finally, we can create a Stacked Autoencoder by simply reusing the weights and biases from the Autoencoders we just trained:

In [None]:
reset_graph()

n_inputs = 28*28

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden1 = tf.nn.elu(tf.matmul(X, W1) + b1)
hidden2 = tf.nn.elu(tf.matmul(hidden1, W2) + b2)
hidden3 = tf.nn.elu(tf.matmul(hidden2, W3) + b3)
outputs = tf.matmul(hidden3, W4) + b4

In [None]:
show_reconstructed_digits(X, outputs)

# Training one Autoencoder at a time in a single graph

Another approach is to use a single graph. To do this, we create the graph for the full Stacked Autoencoder, but then we also add operations to train each Autoencoder independently: phase 1 trains the bottom and top layer (ie. the first Autoencoder) and phase 2 trains the two middle layers (ie. the second Autoencoder).

![title](img/TrainAutoencoderSingleGraph.png)

In [None]:
# Full stacked autoencoder plus two extra training ops, one per training phase.
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0001

activation = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
initializer = tf.contrib.layers.variance_scaling_initializer() # He initialization

with tf.name_scope("input"):
  X = tf.placeholder(tf.float32, shape=[None, n_inputs])

with tf.name_scope("initialize_weight_bias"):
  weights1_init = initializer([n_inputs, n_hidden1])
  weights2_init = initializer([n_hidden1, n_hidden2])
  weights3_init = initializer([n_hidden2, n_hidden3])
  weights4_init = initializer([n_hidden3, n_outputs])

  weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
  weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
  weights3 = tf.Variable(weights3_init, dtype=tf.float32, name="weights3")
  weights4 = tf.Variable(weights4_init, dtype=tf.float32, name="weights4")

  biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
  biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
  biases3 = tf.Variable(tf.zeros(n_hidden3), name="biases3")
  biases4 = tf.Variable(tf.zeros(n_outputs), name="biases4")

with tf.name_scope("network"):
  hidden1 = activation(tf.matmul(X, weights1) + biases1)
  hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
  hidden3 = activation(tf.matmul(hidden2, weights3) + biases3)
  outputs = tf.matmul(hidden3, weights4) + biases4

with tf.name_scope("loss"):
  # Loss of the complete network; it is not used by either training op below.
  reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

with tf.name_scope("optimizer"):
  # One optimizer instance is shared by both phases.
  optimizer = tf.train.AdamOptimizer(learning_rate)

with tf.name_scope("training_phase1"):
  # Phase 1: first autoencoder (layers 1 and 4), reconstructing the inputs.
  phase1_outputs = tf.matmul(hidden1, weights4) + biases4  # bypass hidden2 and hidden3
  phase1_reconstruction_loss = tf.reduce_mean(tf.square(phase1_outputs - X))
  phase1_reg_loss = regularizer(weights1) + regularizer(weights4)
  phase1_loss = phase1_reconstruction_loss + phase1_reg_loss
  phase1_training_op = optimizer.minimize(phase1_loss)

with tf.name_scope("training_phase2"):
  # Phase 2: second autoencoder (layers 2 and 3), reconstructing hidden1.
  phase2_reconstruction_loss = tf.reduce_mean(tf.square(hidden3 - hidden1))
  phase2_reg_loss = regularizer(weights2) + regularizer(weights3)
  phase2_loss = phase2_reconstruction_loss + phase2_reg_loss
  # Restricting var_list keeps weights1/biases1 unchanged during this phase.
  train_vars = [weights2, biases2, weights3, biases3]
  phase2_training_op = optimizer.minimize(phase2_loss, var_list=train_vars) # freeze hidden1

In [None]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
# time.clock() was removed in Python 3.8 (and measured CPU time, not elapsed
# time, on Unix). time.time() gives wall-clock seconds on Python 2 and 3.
start=time.time()
# Per-phase settings: index 0 is phase 1, index 1 is phase 2.
training_ops = [phase1_training_op, phase2_training_op]
reconstruction_losses = [phase1_reconstruction_loss, phase2_reconstruction_loss]
n_epochs = [4, 4]
batch_sizes = [150, 150]

with tf.Session() as sess:
    sess.run(init)
    for phase in range(2):
        print("Training phase #{}".format(phase + 1))
        for epoch in range(n_epochs[phase]):
            n_batches = mnist.train.num_examples // batch_sizes[phase]
            for iteration in range(n_batches):
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                X_batch, y_batch = mnist.train.next_batch(batch_sizes[phase])
                sess.run(training_ops[phase], feed_dict={X: X_batch})
            # MSE of the current phase on the last mini-batch of the epoch.
            loss_train = reconstruction_losses[phase].eval(feed_dict={X: X_batch})
            print("\r{}".format(epoch), "Train MSE:", loss_train)
            saver.save(sess, "./my_model_one_at_a_time.ckpt")
    # Evaluate the complete stacked autoencoder on the test set.
    loss_test = reconstruction_loss.eval(feed_dict={X: mnist.test.images})
    print("Test MSE:", loss_test)
end=time.time()

In [None]:

print("{0},{1},{2}".format(start,end,end-start))

# Cache the frozen layer outputs

Since hidden layer 1 is frozen during phase 2, its output will always be the same for any given training instance. To avoid having to recompute the output of hidden layer 1 at every single epoch, you can compute it for the whole training set at the end of phase 1, then directly feed the cached output of hidden layer 1 during phase 2. This can give you a nice performance boost.

In [None]:
# time.clock() was removed in Python 3.8 (and measured CPU time, not elapsed
# time, on Unix). time.time() gives wall-clock seconds on Python 2 and 3.
start=time.time()
# Per-phase settings: index 0 is phase 1, index 1 is phase 2.
training_ops = [phase1_training_op, phase2_training_op]
reconstruction_losses = [phase1_reconstruction_loss, phase2_reconstruction_loss]
n_epochs = [10, 10]
batch_sizes = [150, 150]

with tf.Session() as sess:
    init.run()
    for phase in range(2):
        print("Training phase #{}".format(phase + 1))
        if phase == 1:
            # hidden1 no longer changes in phase 2, so compute it once for the
            # whole training set instead of at every step.
            hidden1_cache = hidden1.eval(feed_dict={X: mnist.train.images})
        for epoch in range(n_epochs[phase]):
            n_batches = mnist.train.num_examples // batch_sizes[phase]
            for iteration in range(n_batches):
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                if phase == 1:
                    # Sample a batch from the cache and feed it in place of the
                    # hidden1 tensor, bypassing layer 1.
                    indices = rnd.permutation(mnist.train.num_examples)
                    hidden1_batch = hidden1_cache[indices[:batch_sizes[phase]]]
                    feed_dict = {hidden1: hidden1_batch}
                    sess.run(training_ops[phase], feed_dict=feed_dict)
                else:
                    X_batch, y_batch = mnist.train.next_batch(batch_sizes[phase])
                    feed_dict = {X: X_batch}
                    sess.run(training_ops[phase], feed_dict=feed_dict)
            # MSE of the current phase on the last batch that was fed.
            loss_train = reconstruction_losses[phase].eval(feed_dict=feed_dict)
            print("\r{}".format(epoch), "Train MSE:", loss_train)
            saver.save(sess, "./my_model_cache_frozen.ckpt")
    # Evaluate the complete stacked autoencoder on the test set.
    loss_test = reconstruction_loss.eval(feed_dict={X: mnist.test.images})
    print("Test MSE:", loss_test)
end=time.time()

In [None]:
print("{0},{1},{2}".format(start,end,end-start))

In [None]:
show_reconstructed_digits(X, outputs,"./my_model_one_at_a_time.ckpt")

# Visualizing the features

Show the weights of the first 5 neurons of the first hidden layer, each reshaped to a 28×28 image:

In [None]:
with tf.Session() as sess:
    saver.restore(sess, "./my_model_one_at_a_time.ckpt")
    weights1_val = weights1.eval()
    # weights1 is (n_inputs, n_hidden1); after transposing, each row holds the
    # incoming weights of one hidden neuron.
    numNeuron,dim = np.shape(weights1_val.T)
    print("Totally we have {0} neurons, each neurons has dimension:{1}".format(numNeuron,dim))
# Draw the weight vectors of the first five neurons side by side.
for i in range(5):
    plt.subplot(1, 5, i + 1)
    plot_image(weights1_val.T[i])
plt.show()

# Unsupervised pretraining

One of the triggers of the current Deep Learning tsunami was the discovery in 2006 by Geoffrey Hinton et al. that deep neural networks can be pretrained in an unsupervised fashion. They used restricted Boltzmann machines for that (see Appendix E), but in 2007 Yoshua Bengio et al. showed that autoencoders worked just as well.

Below is a classifier that makes use of unsupervised pretraining:

In [None]:
# Classifier: 784 -> 300 -> 150 -> 10 logits. The first two layers have the
# same shapes and variable names as the encoder of the stacked autoencoder,
# so they can be loaded from one of its checkpoints.
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150
n_outputs = 10

learning_rate = 0.01
l2_reg = 0.0005

with tf.name_scope("set_para"):
  activation = tf.nn.elu
  regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
  initializer = tf.contrib.layers.variance_scaling_initializer()

with tf.name_scope("input"):
  X = tf.placeholder(tf.float32, shape=[None, n_inputs])
  y = tf.placeholder(tf.int32, shape=[None])

with tf.name_scope("initialize_weight_bias"):
  weights1_init = initializer([n_inputs, n_hidden1])
  weights2_init = initializer([n_hidden1, n_hidden2])
  weights3_init = initializer([n_hidden2, n_outputs])

  weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
  weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
  weights3 = tf.Variable(weights3_init, dtype=tf.float32, name="weights3")

  biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
  biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
  biases3 = tf.Variable(tf.zeros(n_outputs), name="biases3")

with tf.name_scope("network"):
  hidden1 = activation(tf.matmul(X, weights1) + biases1)
  hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
  logits = tf.matmul(hidden2, weights3) + biases3

with tf.name_scope("loss"):
  # One cross-entropy value per instance of the batch.
  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
  reg_loss = regularizer(weights1) + regularizer(weights2) + regularizer(weights3)
  # Average over the batch so that the loss is a scalar. Adding reg_loss to the
  # per-instance vector would effectively count the L2 penalty once per
  # instance and make the objective depend on the batch size.
  loss = tf.reduce_mean(cross_entropy) + reg_loss

with tf.name_scope("training"):
  optimizer = tf.train.AdamOptimizer(learning_rate)
  training_op = optimizer.minimize(loss)

with tf.name_scope("validate"):
  # A prediction is correct when the true class has the highest logit.
  correct = tf.nn.in_top_k(logits, y, 1)
  accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))



In [None]:
init = tf.global_variables_initializer()
pretrain_saver = tf.train.Saver([weights1, weights2, biases1, biases2])
saver = tf.train.Saver()

Using regular training (without pretraining):

In [None]:
n_epochs = 4
batch_size = 150
# Only the first n_labeled_instances training instances are used.
n_labeled_instances = 20000

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = n_labeled_instances // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            # Random batch drawn from the labeled subset.
            indices = rnd.permutation(n_labeled_instances)[:batch_size]
            X_batch, y_batch = mnist.train.images[indices], mnist.train.labels[indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Training accuracy is measured on the last mini-batch of the epoch only.
        accuracy_val = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        print("\r{}".format(epoch), "Train accuracy:", accuracy_val, end=" ")
        saver.save(sess, "./my_model_supervised.ckpt")
        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
        print("Test accuracy:", accuracy_val)

Now reusing the first two layers of the autoencoder we pretrained:

In [None]:
n_epochs = 4
batch_size = 150
# Only the first n_labeled_instances training instances are used.
n_labeled_instances = 20000

#training_op = optimizer.minimize(loss, var_list=[weights3, biases3])  # Freeze layers 1 and 2 (optional)

with tf.Session() as sess:
    # Initialize every variable first, then overwrite the variables covered by
    # pretrain_saver (weights1, weights2, biases1, biases2) with the values
    # stored in the autoencoder checkpoint.
    init.run()
    pretrain_saver.restore(sess, "./my_model_cache_frozen.ckpt")
    for epoch in range(n_epochs):
        n_batches = n_labeled_instances // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            # Random batch drawn from the labeled subset.
            indices = rnd.permutation(n_labeled_instances)[:batch_size]
            X_batch, y_batch = mnist.train.images[indices], mnist.train.labels[indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Training accuracy is measured on the last mini-batch of the epoch only.
        accuracy_val = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        print("\r{}".format(epoch), "Train accuracy:", accuracy_val, end="\t")
        saver.save(sess, "./my_model_supervised_pretrained.ckpt")
        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
        print("Test accuracy:", accuracy_val)

# Stacked denoising Autoencoder

Using Gaussian noise:

In [None]:
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01

In [None]:
noise_level = 1.0

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
# Corrupt the inputs with Gaussian noise; new noise is drawn each time the op runs.
X_noisy = X + noise_level * tf.random_normal(tf.shape(X))

# The network sees the noisy inputs...
hidden1 = tf.layers.dense(X_noisy, n_hidden1, activation=tf.nn.relu,
                          name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, # not shown in the book
                          name="hidden2")                            # not shown
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, # not shown
                          name="hidden3")                            # not shown
outputs = tf.layers.dense(hidden3, n_outputs, name="outputs")        # not shown

# ...but the loss compares the outputs with the clean inputs.
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE

In [None]:
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # In-place progress percentage ("\r" returns to the start of the line).
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # MSE on the last mini-batch of the epoch; the noise op is part of the
        # graph, so this evaluation also runs on noisy inputs.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./my_model_stacked_denoising_gaussian.ckpt")

Using dropout

In [None]:
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01

In [None]:
dropout_rate = 0.3

# Defaults to False, so dropout is only applied when a feed sets it to True.
training = tf.placeholder_with_default(False, shape=(), name='training')

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
# Dropout is applied to the inputs themselves.
X_drop = tf.layers.dropout(X, dropout_rate, training=training)

hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu,
                          name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, # not shown in the book
                          name="hidden2")                            # not shown
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, # not shown
                          name="hidden3")                            # not shown
outputs = tf.layers.dense(hidden3, n_outputs, name="outputs")        # not shown

# The loss compares the outputs with the original (undropped) inputs.
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE

In [None]:
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # Dropout is switched on for the training steps only.
            sess.run(training_op, feed_dict={X: X_batch, training: True})
        # `training` is not fed here, so the loss is evaluated without dropout.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./my_model_stacked_denoising_dropout.ckpt")

In [None]:
show_reconstructed_digits(X, outputs, "./my_model_stacked_denoising_dropout.ckpt")

# Sparse Autoencoder
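Encourage the model to learn interesting features by adding a sparsity penalty to the cost function: the average activation of each neuron in the coding layer is pushed towards a small target value, so that only a few neurons are active for any given input.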

In [None]:
# Compare the two candidate sparsity costs: the KL divergence rises much more
# steeply than the MSE as the actual sparsity moves away from the target.
p = 0.1  # target sparsity
q = np.linspace(0.001, 0.999, 500)  # range of actual sparsity values
kl_div = p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))
mse = (p - q)**2
# Dotted vertical line marking the target sparsity.
plt.plot([p, p], [0, 0.3], "k:")
plt.text(0.05, 0.32, "Target\nsparsity", fontsize=14)
plt.plot(q, kl_div, "b-", label="KL divergence")
plt.plot(q, mse, "r--", label="MSE")
plt.legend(loc="upper left")
plt.xlabel("Actual sparsity")
plt.ylabel("Cost", rotation=0)
plt.axis([0, 1, 0, 0.95])


In [None]:
def kl_divergence(p, q):
    """Element-wise Kullback-Leibler divergence between activation probabilities ``p`` and ``q``.

    Args:
        p: Target probability (scalar or tensor).
        q: Actual probability (scalar or tensor).

    Returns:
        ``p * log(p / q) + (1 - p) * log((1 - p) / (1 - q))``.
    """
    active_term = p * tf.log(p / q)
    inactive_term = (1 - p) * tf.log((1 - p) / (1 - q))
    return active_term + inactive_term

In [None]:
# Start from an empty, re-seeded graph. Without this the sparse autoencoder is
# added to the graph left over from the dropout section, so the initializer and
# the Saver created afterwards would also cover that model's variables.
reset_graph()

# Layer sizes stated explicitly instead of being inherited from the previous
# section (these are the values a top-to-bottom run would pick up).
n_inputs = 28 * 28
n_hidden1 = 300  # sparse codings
n_outputs = n_inputs

learning_rate = 0.01
sparsity_target = 0.1
sparsity_weight = 0.2

X = tf.placeholder(tf.float32, shape=[None, n_inputs])            # not shown in the book

# Sigmoid keeps the activations in (0, 1), as kl_divergence requires.
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.sigmoid) # not shown
outputs = tf.layers.dense(hidden1, n_outputs)                     # not shown

hidden1_mean = tf.reduce_mean(hidden1, axis=0) # batch mean
# Sum, over the coding neurons, of the divergence between the target sparsity
# and each neuron's mean activation over the batch.
sparsity_loss = tf.reduce_sum(kl_divergence(sparsity_target, hidden1_mean))
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE
loss = reconstruction_loss + sparsity_weight * sparsity_loss

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

In [None]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
n_epochs = 100
# The sparsity loss is built from per-batch mean activations, hence the larger batch.
batch_size = 1000

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # All three losses are measured on the last mini-batch of the epoch.
        reconstruction_loss_val, sparsity_loss_val, loss_val = sess.run([reconstruction_loss, sparsity_loss, loss], feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", reconstruction_loss_val, "\tSparsity loss:", sparsity_loss_val, "\tTotal loss:", loss_val)
        saver.save(sess, "./my_model_sparse.ckpt")

In [None]:
show_reconstructed_digits(X, outputs, "./my_model_sparse.ckpt")

# Variational Autoencoder

![title](img/Variation_Autoencoder.png)

In [11]:
# Variational autoencoder: 784 -> 500 -> 500 -> 20 (codings) -> 500 -> 500 -> 784.
reset_graph()

from functools import partial

n_inputs = 28 * 28
n_hidden1 = 500
n_hidden2 = 500
n_hidden3 = 20  # codings
n_hidden4 = n_hidden2
n_hidden5 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.001

initializer = tf.contrib.layers.variance_scaling_initializer()

# Dense layer with the shared defaults (ELU activation, variance-scaling init) pre-filled.
my_dense_layer = partial(
    tf.layers.dense,
    activation=tf.nn.elu,
    kernel_initializer=initializer)
with tf.name_scope("input"):
  X = tf.placeholder(tf.float32, [None, n_inputs])
with tf.name_scope("dnn"):
  hidden1 = my_dense_layer(X, n_hidden1)
  hidden2 = my_dense_layer(hidden1, n_hidden2)
  # Two parallel linear heads on hidden2: mean and sigma of the codings.
  hidden3_mean = my_dense_layer(hidden2, n_hidden3, activation=None)
  hidden3_sigma = my_dense_layer(hidden2, n_hidden3, activation=None)
  # Sample the codings as mean + sigma * standard Gaussian noise.
  noise = tf.random_normal(tf.shape(hidden3_sigma), dtype=tf.float32)
  hidden3 = hidden3_mean + hidden3_sigma * noise
  hidden4 = my_dense_layer(hidden3, n_hidden4)
  hidden5 = my_dense_layer(hidden4, n_hidden5)
  # Raw logits; the sigmoid is applied where the loss is defined.
  logits = my_dense_layer(hidden5, n_outputs, activation=None)



In [12]:
with tf.name_scope("loss"):
    # Reconstructions are the sigmoid of the decoder logits.
    outputs = tf.sigmoid(logits)
    # Element-wise sigmoid cross-entropy against the inputs, summed over all elements.
    xentropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=X)
    reconstruction_loss = tf.reduce_sum(xentropy)

    # Latent loss: 0.5 * sum(sigma^2 + mean^2 - 1 - log(eps + sigma^2)).
    eps = 1e-10  # keeps the argument of the log strictly positive
    latent_loss = 0.5 * tf.reduce_sum(
        tf.square(hidden3_sigma)
        + tf.square(hidden3_mean)
        - 1
        - tf.log(eps + tf.square(hidden3_sigma)))

    loss = reconstruction_loss + latent_loss

with tf.name_scope("training"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [13]:
# Op that initializes all global variables of the variational autoencoder graph.
init = tf.global_variables_initializer()
# Saver used to checkpoint and restore the model in the cells that follow.
saver = tf.train.Saver()

In [None]:
from pathlib import Path

n_epochs = 50
batch_size = 150
fileName = "./my_model_variational.ckpt"
# Existence of the checkpoint's ".meta" file is the signal that an earlier
# run left a model behind that training can resume from.
my_file = Path(fileName + ".meta")

with tf.Session() as sess:
    sess.run(init)
    if my_file.is_file():
        print("model file found, restore for training")
        saver.restore(sess, fileName)
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # Rewrite the same console line with the epoch's progress percentage.
            percent_done = 100 * iteration // n_batches
            print("\r{}%".format(percent_done), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)  # labels are not used
            sess.run(training_op, feed_dict={X: X_batch})
        # The reported losses are evaluated on the last mini-batch of the epoch.
        loss_val, reconstruction_loss_val, latent_loss_val = sess.run(
            [loss, reconstruction_loss, latent_loss],
            feed_dict={X: X_batch})
        print("\r{}".format(epoch),
              "Train total loss:", loss_val,
              "\tReconstruction loss:", reconstruction_loss_val,
              "\tLatent loss:", latent_loss_val)
        # Overwrite the checkpoint after every epoch.
        saver.save(sess, fileName)

0 Train total loss: 28978.498 	Reconstruction loss: 23235.03 	Latent loss: 5743.4683
1 Train total loss: 30875.387 	Reconstruction loss: 23565.48 	Latent loss: 7309.9053
2 Train total loss: 25786.393 	Reconstruction loss: 21953.324 	Latent loss: 3833.069
3 Train total loss: 27540.684 	Reconstruction loss: 22282.691 	Latent loss: 5257.993
4 Train total loss: 28693.488 	Reconstruction loss: 21114.355 	Latent loss: 7579.1333
5 Train total loss: 24662.352 	Reconstruction loss: 20882.012 	Latent loss: 3780.3408
6 Train total loss: 24063.996 	Reconstruction loss: 20249.08 	Latent loss: 3814.916
7 Train total loss: 20269.416 	Reconstruction loss: 17301.715 	Latent loss: 2967.7012
8 Train total loss: 18524.4 	Reconstruction loss: 15618.0625 	Latent loss: 2906.3386
9 Train total loss: 17438.713 	Reconstruction loss: 14426.969 	Latent loss: 3011.7441
27%

In [None]:
num_Digits = 80

with tf.Session() as sess:
    saver.restore(sess, "./my_model_variational.ckpt")
    # Draw random codings from a standard normal distribution and feed them in
    # place of hidden3, so the outputs are decoded from these codings rather
    # than from encoded inputs.
    codings_rnd = np.random.normal(size=(num_Digits, n_hidden3))
    outputs_val = sess.run(outputs, feed_dict={hidden3: codings_rnd})

In [None]:
# Draw the generated digits on a grid of n_cols columns with only as many rows
# as are needed. Using num_Digits itself as the row count would reserve
# num_Digits * n_cols subplot slots and leave most of a very tall figure blank.
n_cols = 10
n_rows = (num_Digits + n_cols - 1) // n_cols  # ceiling division
# 8 inches wide; the height is scaled so every grid cell is square.
plt.figure(figsize=(8, 8 * n_rows / n_cols))
for iteration in range(num_Digits):
    plt.subplot(n_rows, n_cols, iteration + 1)
    plot_image(outputs_val[iteration])