Autoencoder
===

Data mining: Dimensionality reduction  
Author: Steven Van Vaerenbergh  
Universidad de Cantabria

Adapted from https://towardsdatascience.com/how-autoencoders-outperform-pca-in-dimensionality-reduction-1ae44c68b42f

In [5]:
import sys
!{sys.executable} -m pip install seaborn



In [13]:
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from time import time

ModuleNotFoundError: No module named 'distutils'

## Prepare the data

In [7]:
# Fetch the MNIST train/test splits (images plus labels) via the Keras helper.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

import numpy as np


def _flatten_and_scale(images):
    """Return `images` as float32 rows of 784 values, divided by 255.

    Each image is flattened to one row of 784 entries (28*28, the image
    size used by the plotting cells). The division by 255 is meant to map
    pixel intensities into [0, 1] (assumes 8-bit input -- TODO confirm).
    """
    flat = np.reshape(images, (-1, 784))
    return flat.astype('float32') / 255


# Apply the same preprocessing to both splits.
train_images = _flatten_and_scale(train_images)
test_images = _flatten_and_scale(test_images)

NameError: name 'mnist' is not defined

## Apply PCA with only two components

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the training images only; `fit` returns the
# estimator itself, so `pca` is the fitted model.
pca = PCA(n_components=2).fit(train_images)

# Project the test images onto the two components ...
compressed_images = pca.transform(test_images)
# ... and map those 2-D points back into the original 784-value pixel space.
recovered_images = pca.inverse_transform(compressed_images)

## Visualize MNIST digits reconstructed from their PCA-compressed representation

In [None]:
# Display the first `n` test digits after the PCA round trip
# (transform followed by inverse_transform), side by side in one row.
n = 5
plt.figure(figsize=(9, 2))
for column in range(1, n + 1):
    # Subplot positions are 1-based; image rows are 0-based.
    ax = plt.subplot(1, n, column)
    digit = recovered_images[column - 1].reshape(28, 28)
    plt.imshow(digit, cmap="gray")
    ax.axis('off')

plt.show()

## Visualize test data using the two principal components

In [None]:
import seaborn as sns

plt.figure(figsize=(10, 7))

# One point per test image: its two PCA coordinates, coloured by `test_labels`.
first_component = compressed_images[:, 0]
second_component = compressed_images[:, 1]
sns.scatterplot(x=first_component, y=second_component,
                hue=test_labels, palette='tab10')

plt.xlabel("First principal component")
plt.ylabel("Second principal component")

# Anchor the legend box at (1.01, 1) with no padding between it and the anchor.
plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)

plt.show()

## Define the Autoencoder architecture

In [None]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense

input_dim = 28*28
latent_vec_dim = 2

# Widths of the hidden layers on the encoder side; the decoder mirrors them.
hidden_units = (500, 300, 100)

input_layer = Input(shape=(input_dim,))

# Encoder: sigmoid hidden layers of shrinking width, followed by a
# tanh bottleneck with `latent_vec_dim` units.
tensor = input_layer
for width in hidden_units:
    tensor = Dense(width, activation='sigmoid')(tensor)
encoder = Dense(latent_vec_dim, activation='tanh')(tensor)

# Decoder: the same hidden widths in reverse order, then a sigmoid output
# layer with one unit per input value.
tensor = encoder
for width in reversed(hidden_units):
    tensor = Dense(width, activation='sigmoid')(tensor)
decoder = Dense(input_dim, activation='sigmoid')(tensor)

# Full model: input -> reconstruction.
autoencoder = Model(input_layer, decoder, name="Deep_Autoencoder")

# Encoder-only model sharing the same layers: input -> latent vector.
latent_model = Model(input_layer, encoder)

# Print the layer-by-layer summary of the full autoencoder.
autoencoder.summary()

## Compile, train and monitor the loss function

In [None]:
# Compile the autoencoder with a binary cross-entropy reconstruction loss
# and the Adam optimizer.
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')

# Train the autoencoder to reproduce its own input (the images are both the
# input and the target); the test images serve as validation data.
t0 = time()
history = autoencoder.fit(train_images, train_images, epochs=70, batch_size=128,
                          shuffle=True, validation_data=(test_images, test_images))
t1 = time()
# Use %.2f, not %.2g: %g keeps only two significant digits and switches to
# scientific notation for durations of 100 s or more (e.g. "3.2e+02 sec").
print("Autoencoder: %.2f sec" % (t1 - t0))

# Plot training and validation loss scores
# against the number of epochs.
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Validation')
plt.ylabel('Binary Cross Entropy Loss')
plt.xlabel('Epoch')
plt.title('Autoencoder Reconstruction Loss', pad=13)
plt.legend(loc='upper right')
# Render the figure explicitly, as every other plotting cell does.
plt.show()

## Visualize MNIST digits reconstructed by the autoencoder

In [None]:
# Run the test images through the full autoencoder (encoder + decoder).
# NOTE: despite its name, `compressed_images` holds the 784-value
# reconstructions here, and this assignment rebinds the variable that the
# PCA cell used for its 2-D projection.
compressed_images = autoencoder.predict(test_images)

# Display the first `n` reconstructed digits side by side in one row.
n = 5
plt.figure(figsize=(9, 2))
for column in range(1, n + 1):
    # Subplot positions are 1-based; image rows are 0-based.
    ax = plt.subplot(1, n, column)
    digit = compressed_images[column - 1].reshape(28, 28)
    plt.imshow(digit, cmap="gray")
    ax.axis('off')

plt.show()

## Visualize test data in the latent space

In [None]:
# Encode the test images with the encoder-only model to get their
# latent vectors.
latent_representation = latent_model.predict(test_images)

import seaborn as sns

plt.figure(figsize=(10, 7))

# One point per test image: its first two latent coordinates, coloured by
# `test_labels`.
latent_x = latent_representation[:, 0]
latent_y = latent_representation[:, 1]
sns.scatterplot(x=latent_x, y=latent_y,
                hue=test_labels, palette='tab10')

plt.xlabel("Encoder first dimension")
plt.ylabel("Encoder second dimension")

# Anchor the legend box at (1.01, 1) with no padding between it and the anchor.
plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
plt.show()