# VAE
Variational autoencoder (VAE) [1] models inherit the autoencoder architecture, but make strong assumptions about the distribution of the latent variables. They use a variational approach to learn the latent representation.

[1] Auto-Encoding Variational Bayes, Diederik P Kingma, Max Welling 2013

In [1]:
# Setup
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.contrib.slim import fully_connected as fc

  from ._conv import register_converters as _register_converters


In [2]:
# Load MNIST data from the local cache directory; labels are requested with one_hot=True.
mnist = input_data.read_data_sets('../../MNIST_data', one_hot=True)
# Second dimension of the training image array, i.e. the width of one
# flattened image (presumably 784 = 28*28, since plot() reshapes to 28x28).
X_dim = mnist.train.images.shape[1]
# Second dimension of the training label array.
y_dim = mnist.train.labels.shape[1]
# Number of training examples; the training loop divides it by batch_size.
num_data = mnist.train.num_examples

def plot(samples):
    """Render samples as grayscale tiles on a 4x4 grid and return the figure.

    Each element of `samples` is reshaped to 28x28 before being drawn, and is
    placed in the grid cell matching its position in the sequence. The grid
    has 16 cells, so callers are expected to pass at most 16 samples.
    """
    figure = plt.figure(figsize=(4, 4))
    grid = gridspec.GridSpec(4, 4)
    # Keep the tiles almost touching.
    grid.update(wspace=0.05, hspace=0.05)

    for index, flat_image in enumerate(samples):
        tile = plt.subplot(grid[index])
        # `tile` is the current axes here, so this hides its frame and ticks.
        tile.axis('off')
        tile.set_xticklabels([])
        tile.set_yticklabels([])
        tile.set_aspect('equal')
        image = flat_image.reshape(28, 28)
        plt.imshow(image, cmap='Greys_r')

    return figure

Extracting ../../MNIST_data/train-images-idx3-ubyte.gz
Extracting ../../MNIST_data/train-labels-idx1-ubyte.gz
Extracting ../../MNIST_data/t10k-images-idx3-ubyte.gz
Extracting ../../MNIST_data/t10k-labels-idx1-ubyte.gz


In [3]:
# Hyperparameters. Your job to find these.
num_epochs = 100  # number of passes over the training set in the training loop
batch_size = 128  # examples requested per minibatch
z_dim = 16  # width of the latent code produced by the encoder
lr = 1e-3  # learning rate passed to the Adam optimizer

# Encoder 
Implement the encoder network $Q_\phi(z \mid x)$. Use Tensorflow's `fully_connected` function [API guide](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected) to write fewer lines of code. Use 3 shared FC layers, followed by 1 FC layer for the mean $\mu(X_i)$ and another FC layer for the diagonal variance $\sigma(X_i)$.

Also, implement the reparameterization trick we talked about in the class.

- Do take a look at the API and make sure to not use any non-linearities at the final layer!
- Make it return log of the variance so we work with smaller numbers (for stability)!

In [4]:
# Graph input: a float32 batch of flattened images, X_dim values per row.
# The leading None leaves the batch size unspecified.
X = tf.placeholder(tf.float32, shape=[None, X_dim])

def Q(X):
    """Encoder network Q(z | X).

    Passes X through three shared 256-unit fully connected layers (using the
    default activation of `fc`), then through two separate linear heads of
    width z_dim.

    Returns:
        (z_mu, z_logvar): the mean and the log-variance of the approximate
        posterior, both produced without an activation function.
    """
    hidden = X
    # Three shared hidden layers, created in the same order as before.
    for _ in range(3):
        hidden = fc(hidden, 256)
    # Linear heads: no non-linearity on the final layers.
    z_mu = fc(hidden, z_dim, activation_fn=None)
    z_logvar = fc(hidden, z_dim, activation_fn=None)
    return z_mu, z_logvar
    
def sample_z(mu, log_var):
    """Draw z ~ N(mu, diag(exp(log_var))) using the reparameterization trick.

    Args:
        mu: tensor of posterior means, one row per example.
        log_var: tensor of posterior log-variances, same shape as `mu`.

    Returns:
        A tensor shaped like `mu` holding mu + sigma * eps, with eps ~ N(0, I).
        The sample stays differentiable with respect to `mu` and `log_var`.
    """
    # Draw noise with the dynamic shape of mu so every example in the
    # minibatch gets its own independent eps. A fixed [z_dim] shape would
    # broadcast one shared noise vector across the whole batch.
    eps = tf.random_normal(tf.shape(mu))
    # exp(0.5 * log_var) equals sqrt(exp(log_var)) but avoids overflowing the
    # intermediate exp for large log-variances.
    std = tf.exp(0.5 * log_var)
    reparameterized_sample = mu + std * eps
    return reparameterized_sample

# Decoder
Implement the decoder network $P_\theta(X \mid z)$. As before, use Tensorflow's `fully_connected` function [API guide](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected) to write fewer lines of code.

In this exercise, we will use Bernoulli MLP decoder explained in Appendix C.1 in the original paper.

Use 3 FC layers followed by 1 final FC layer that outputs the unnormalized logits; applying a sigmoid to these logits gives the normalized Bernoulli probabilities.

In [5]:
def P(z):
    """Decoder network P(X | z) with a Bernoulli output.

    Passes z through three 256-unit fully connected layers (using the default
    activation of `fc`), then a linear layer with 28*28 outputs.

    Returns:
        (prob, logits): the sigmoid of the final layer's output, and the raw
        pre-sigmoid output itself.
    """
    hidden = z
    # Three hidden layers, created in the same order as before.
    for _ in range(3):
        hidden = fc(hidden, 256)
    # Final layer is linear so the raw logits can be passed to the loss.
    logits = fc(hidden, 28*28, activation_fn=None)
    return tf.sigmoid(logits), logits

# Rest of the graph
We can define the rest of the network as below.

In [6]:
# Wire the pieces together: encoder -> reparameterized sample -> decoder.
z_mu, z_logvar = Q(X)
z_sample = sample_z(z_mu, z_logvar)
# X_samples holds the sigmoid probabilities; logits holds the pre-sigmoid outputs.
X_samples, logits = P(z_sample)

# Graph loss function
You might find `tf.nn.sigmoid_cross_entropy_with_logits` function [API guide](https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits) useful.

In [7]:
# Reconstruction loss: Bernoulli negative log-likelihood of X under the
# decoder. The element-wise sigmoid cross-entropy is computed from the raw
# logits, summed over pixels (axis 1) and averaged over the minibatch.
recon_loss = tf.reduce_mean(
    tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=X, logits=logits), 1))
# Closed-form KL(N(mu, diag(exp(logvar))) || N(0, I)), summed over the latent
# dimensions (axis 1) and averaged over the minibatch.
kl_loss = tf.reduce_mean(tf.reduce_sum(0.5*(-1-z_logvar+tf.square(z_mu)+tf.exp(z_logvar)), 1))

# Total loss (the negative evidence lower bound)
vae_loss = recon_loss + kl_loss

In [8]:
# Optimizer and solver
# `solver` is the Adam update op that minimizes the combined VAE loss.
solver = tf.train.AdamOptimizer(learning_rate=lr).minimize(vae_loss)
# This session is reused by the training loop.
sess = tf.InteractiveSession()
# Run the global variable initializer before any training step.
sess.run(tf.global_variables_initializer())

# Train a _great_ generative model on MNIST!

Now, it's your job to experiment with the hyperparameters defined at the beginning. The code below will also generate images and save the results to the `figures/` directory. We are attaching example images from our run, and we expect your images to be of similar quality.
![](out/080.png)


In [9]:
# Make sure the output directory for the sample grids exists.
if not os.path.exists('figures/'):
    os.makedirs('figures/')

for epoch in range(num_epochs):
    # One pass over the training set: one optimizer step per minibatch.
    for step in range(num_data // batch_size):
        X_mb, _ = mnist.train.next_batch(batch_size)
        _, loss = sess.run([solver, vae_loss], feed_dict={X: X_mb})

    # Only report and snapshot on every fifth epoch.
    if epoch % 5 != 0:
        continue

    # `loss` is the value from the last minibatch of this epoch.
    print('Epoch: {}, Loss: {:.4}'.format(epoch, loss))

    # Decode 16 latent vectors drawn from a standard normal. Feeding z_sample
    # directly replaces the encoder-driven sample with these values.
    samples = sess.run(X_samples, feed_dict={z_sample: np.random.randn(16, z_dim)})

    fig = plot(samples)
    plt.savefig('figures/{}.png'.format(str(epoch).zfill(3)), bbox_inches='tight')
    plt.close(fig)

Epoch: 0, Loss: 177.4
Epoch: 5, Loss: 121.3
Epoch: 10, Loss: 114.8
Epoch: 15, Loss: 113.2
Epoch: 20, Loss: 119.6
Epoch: 25, Loss: 125.5
Epoch: 30, Loss: 103.0
Epoch: 35, Loss: 109.6
Epoch: 40, Loss: 117.5
Epoch: 45, Loss: 112.8
Epoch: 50, Loss: 108.0
Epoch: 55, Loss: 112.3
Epoch: 60, Loss: 110.2
Epoch: 65, Loss: 100.5
Epoch: 70, Loss: 104.3
Epoch: 75, Loss: 111.0
Epoch: 80, Loss: 108.6
Epoch: 85, Loss: 113.0
Epoch: 90, Loss: 101.4
Epoch: 95, Loss: 103.4
