<div class="title">Deep Neural Networks</div>
<div class="subtitle">Métodos Avanzados en Aprendizaje Automático</div>
<div class="author">Carlos María Alaíz Gudín - Universidad Autónoma de Madrid</div>

---

**Initial Configuration**

This cell defines the configuration of Jupyter Notebooks.

In [1]:
%%html
<head><link rel="stylesheet" href="style.css"></head>

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

This cell imports the packages to be used (all of them quite standard except for `Utils`, which is provided with the notebook).

In [None]:
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

from sklearn.datasets import load_sample_images
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import os
# The variable is set before `tensorflow` is imported below.
# NOTE(review): presumably "3" silences TensorFlow's native (C++) log
# messages — confirm against the TensorFlow version in use.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow lists; the loop does nothing
# when the list is empty.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
from tensorflow import keras

import logging
import warnings
from sklearn.exceptions import ConvergenceWarning

# Default figure size and colormap for the plots of the notebook.
matplotlib.rc("figure", figsize=(15, 5))
matplotlib.rc("image", cmap="gray")
# Restrict TensorFlow's Python logger to errors and hide scikit-learn
# convergence warnings.
tf.get_logger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# NOTE(review): `seed` is not used by any of the visible cells — confirm
# whether the random generators were meant to be seeded with it.
seed = 123

# Autoencoders

## Dataset

The MNIST dataset will be used to illustrate the autoencoders.
This dataset is composed of hand-written digits of $28 \times 28$ pixels.

In [None]:
# Load the MNIST train/test partitions through Keras.
(x_tr, y_tr), (x_te, y_te) = keras.datasets.mnist.load_data()

# Basic facts about the raw arrays.
print("Number of axis:   ", x_tr.ndim)
print("Dimension (train):", x_tr.shape)
print("Dimension (test): ", x_te.shape)
print("Data type:        ", x_tr.dtype)

# Display the first training image.
plt.imshow(x_tr[0])
plt.axis("off")
plt.show()

# Convert to 32-bit floats and divide by 255 so the pixels lie in [0, 1].
x_tr, x_te = (imgs.astype("float32") / 255. for imgs in (x_tr, x_te))

# Flatten every image into one row vector (one row per sample).
x_tr_1D, x_te_1D = (imgs.reshape(len(imgs), -1) for imgs in (x_tr, x_te))

## Autoencoder Builder

* To build an AE in Keras the architecture should be specified:
    * Input layer, corresponding to the data to be encoded.
    * Encoder layers, which will compress the information.
    * Decoder layers, which will decompress the information.

In [None]:
def autoencoder_builder(inp_lay, enc_lays, dec_lays, optimizer="adam"):
    """Build an autoencoder together with its encoder and decoder sub-models.

    The three models are built from the same layer objects, so the encoder
    and the decoder use the layers trained through the full autoencoder.

    Args:
        inp_lay: Keras input (``keras.Input``) describing the data to encode.
        enc_lays: Sequence of layers that compress the input.
        dec_lays: Sequence of layers that rebuild the input from the code.
        optimizer: Optimizer passed to ``compile`` for the full autoencoder.

    Returns:
        The list ``[autoencoder, encoder, decoder]``. Only the autoencoder is
        compiled (MSE loss, MSE metric).
    """
    # Accept any sequence of layers (list, tuple, ...), not only lists.
    enc_lays = list(enc_lays)
    dec_lays = list(dec_lays)

    # AE.
    autoencoder = keras.Sequential([inp_lay] + enc_lays + dec_lays)
    autoencoder.compile(optimizer=optimizer, loss="mse", metrics=["mse"])

    # Encoder.
    encoder = keras.Sequential([inp_lay] + enc_lays)

    # Decoder: its input has the shape of the code produced by the encoder.
    # The shape is read from the encoder model (batch axis dropped) instead of
    # from the last encoder layer, since the per-layer `output_shape`
    # attribute is not available in every Keras version.
    code_shape = encoder.output_shape[1:]
    decoder = keras.Sequential([keras.Input(shape=code_shape)] + dec_lays)

    return [autoencoder, encoder, decoder]

## Simple Autoencoders

* The first approach to guarantee the information compression is forcing the hidden layer to be much smaller than the input layer.

In [None]:
# Number of units of the hidden (code) layer.
encoding_dim = 16

# One dense layer on each side: the encoder maps the flattened image to the
# code, and the decoder maps the code back to the input dimension.
inp_lay = keras.Input(shape=x_tr_1D.shape[1:])
enc_lays = [keras.layers.Dense(units=encoding_dim, activation="relu")]
dec_lays = [keras.layers.Dense(units=x_tr_1D.shape[1], activation="sigmoid")]

autoencoder, encoder, decoder = autoencoder_builder(inp_lay, enc_lays, dec_lays)
autoencoder.summary()

### Training

* The training consists simply of minimizing the reconstruction error (measured through the MSE).

In [None]:
# The AE is trained to reproduce its own input: the same array is passed as
# input and as target.
history = autoencoder.fit(x_tr_1D, x_tr_1D, epochs=10, batch_size=256, shuffle=True)

### Prediction

* The images can be encoded/decoded applying the encoder/decoder subnetworks.

In [None]:
# Encode the test images, then decode the codes back into images.
encoded_imgs = encoder.predict(x_te_1D)
decoded_imgs = decoder.predict(encoded_imgs)

# evaluate() returns the loss followed by the compiled metrics; position 1
# holds the MSE metric.
test_scores = autoencoder.evaluate(x_te_1D, x_te_1D, verbose=0)
print("Prediction error: %.3f" % test_scores[1])

### Reconstruction

* The following cell shows some examples of original and reconstructed images.

In [None]:
# Number of test digits to display.
n = 10
plt.figure(figsize=(20, 4))

# Top row: original images. Bottom row: their reconstructions.
for row, images in enumerate((x_te, decoded_imgs)):
    for col in range(n):
        plt.subplot(2, n, row * n + col + 1)
        plt.imshow(images[col].reshape(28, 28))
        plt.axis("off")

plt.show()

### Embedding

* The encoder can be used to reduce the dimensionality of the original data.

In [None]:
# One point cloud per digit; only the first two coordinates of the code are
# drawn.
for digit in range(10):
    is_digit = y_te == digit
    plt.scatter(encoded_imgs[is_digit, 0],
                encoded_imgs[is_digit, 1],
                label="Digit %d" % digit)

plt.legend()
plt.show()

<div class="qst">

* What happens if the process above is repeated setting the reduced dimension (`encoding_dim`) to $2$?
* Is the AE expressive enough?
* Is the resulting embedding better or worse?

</div>

## Sparse Autoencoders

In order to define a Sparse AE, a regularization is used in the encoder so that the compressed data become sparse.

In [None]:
# Number of units of the hidden (code) layer.
encoding_dim = 800

inp_lay = keras.Input(shape=x_tr_1D.shape[1:])

# The encoder activations carry an L1 penalty (weight 1e-3).
enc_lays = [
    keras.layers.Dense(
        units=encoding_dim,
        activation="relu",
        activity_regularizer=keras.regularizers.l1(1e-3),
    )
]
dec_lays = [keras.layers.Dense(units=x_tr_1D.shape[1], activation="sigmoid")]

autoencoder, encoder, decoder = autoencoder_builder(inp_lay, enc_lays, dec_lays)
autoencoder.summary()

### Training

In [None]:
# The sparse AE is trained to reproduce its own input: the same array is
# passed as input and as target.
history = autoencoder.fit(x_tr_1D, x_tr_1D, epochs=10, batch_size=256, shuffle=True)

### Prediction

In [None]:
# Encode the test images, then decode the codes back into images.
encoded_imgs = encoder.predict(x_te_1D)
decoded_imgs = decoder.predict(encoded_imgs)

# evaluate() returns the loss followed by the compiled metrics; position 1
# holds the MSE metric.
test_scores = autoencoder.evaluate(x_te_1D, x_te_1D, verbose=0)
print("Prediction error: %.3f" % test_scores[1])

### Reconstruction

In [None]:
# Number of test digits to display.
n = 10
plt.figure(figsize=(20, 4))

# Top row: original images. Bottom row: their reconstructions.
for row, images in enumerate((x_te, decoded_imgs)):
    for col in range(n):
        plt.subplot(2, n, row * n + col + 1)
        plt.imshow(images[col].reshape(28, 28))
        plt.axis("off")

plt.show()

### Embedding

The encoding is sparse, so a certain ratio of the coordinates are identically $0$.

In [None]:
# One point cloud per digit; only the first two coordinates of the code are
# drawn.
for digit in range(10):
    is_digit = y_te == digit
    plt.scatter(encoded_imgs[is_digit, 0],
                encoded_imgs[is_digit, 1],
                label="Digit %d" % digit)

plt.legend()
plt.show()

In [None]:
# Percentage of code entries that are exactly zero over the test set.
print("Sparsity: %.2f%%" % (100 * np.mean(encoded_imgs == 0)))

<div class="qst">

* What happens if the regularization is set to $0$?
* Is the AE compressing the information without regularization?

</div>

## Deep Autoencoders

In the Deep AEs, a DNN is used both for the encoder and the decoder.
Usually, both networks are symmetric.

In [None]:
# The input/output size is taken from the flattened data, as in the other
# autoencoder cells, instead of being hard-coded.
inp_lay = keras.Input(shape=(x_tr_1D.shape[1],))

# Encoder: dense layers of decreasing width, ending in a 2-unit code.
enc_lays = [
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(2, activation="relu"),
]

# Decoder: the same widths in reverse order, back to the input dimension.
dec_lays = [
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(x_tr_1D.shape[1], activation="sigmoid"),
]

[autoencoder, encoder, decoder] = autoencoder_builder(inp_lay,
                                                      enc_lays,
                                                      dec_lays)
autoencoder.summary()

### Training

In [None]:
# The deep AE is trained to reproduce its own input: the same array is
# passed as input and as target.
history = autoencoder.fit(x_tr_1D, x_tr_1D, epochs=10, batch_size=256, shuffle=True)

### Prediction

In [None]:
# Encode the test images, then decode the codes back into images.
encoded_imgs = encoder.predict(x_te_1D)
decoded_imgs = decoder.predict(encoded_imgs)

# evaluate() returns the loss followed by the compiled metrics; position 1
# holds the MSE metric.
test_scores = autoencoder.evaluate(x_te_1D, x_te_1D, verbose=0)
print("Prediction error: %.3f" % test_scores[1])

### Reconstruction

In [None]:
# Number of test digits to display.
n = 10
plt.figure(figsize=(20, 4))

# Top row: original images. Bottom row: their reconstructions.
for row, images in enumerate((x_te, decoded_imgs)):
    for col in range(n):
        plt.subplot(2, n, row * n + col + 1)
        plt.imshow(images[col].reshape(28, 28))
        plt.axis("off")

plt.show()

### Embedding

In [None]:
# One point cloud per digit, drawn over the two coordinates of the code.
for digit in range(10):
    is_digit = y_te == digit
    plt.scatter(encoded_imgs[is_digit, 0],
                encoded_imgs[is_digit, 1],
                label="Digit %d" % digit)

plt.legend()
plt.show()

<div class="qst">

* Taking into account that the reduced dimension is $2$, is this embedding better or worse than the ones above? Why?

</div>

# Convolutional Neural Networks

## Convolution of Images

### Original Image

The following cell loads an example image.

In [None]:
# First of the sample images shipped with scikit-learn.
china = load_sample_images().images[0]

# Keep at most `height` columns and rescale the values by 1/255.
height = china.shape[0]
china = china[:, :height, :] / 255.0

plt.imshow(china)
plt.title("Original")
plt.axis("off")
plt.show()

### Convolution with Different Filters

Different filters can be applied to the image above, to see their effect.

In [None]:
# Kernels to apply, listed in the same order as the labels in `lv`.
kv = [
    # Identity.
    np.array([[0, 0, 0],
              [0, 1, 0],
              [0, 0, 0]]),
    # Horizontal edges.
    np.array([[1, 1, 1],
              [0, 0, 0],
              [-1, -1, -1]]),
]
# Vertical edges: transpose of the horizontal-edge kernel.
kv.append(kv[-1].T)
kv.extend([
    # Edges in every direction.
    np.array([[-1, -1, -1],
              [-1, 8, -1],
              [-1, -1, -1]]),
    # Sharpen.
    np.array([[0, -1, 0],
              [-1, 5, -1],
              [0, -1, 0]]),
    # 5x5 Gaussian blur.
    1 / 256 * np.array([[1, 4, 6, 4, 1],
                        [4, 16, 24, 16, 4],
                        [6, 24, 36, 24, 6],
                        [4, 16, 24, 16, 4],
                        [1, 4, 6, 4, 1]]),
])
lv = ("Identity", "Edge H", "Edge V", "Edges", "Sharpen", "Gaussian")

tf.get_logger().setLevel('ERROR')

# Batch containing the single example image.
inp = tf.constant([china])
for k, l in zip(kv, lv):
    def kernel_init(shape, dtype=None):
        # The 2-D kernel is placed on the channel "diagonal", so each of the
        # three channels is filtered independently with `k`; every other
        # input/output channel pair keeps a zero kernel.
        kernel = np.zeros(shape)
        for channel in range(3):
            kernel[:, :, channel, channel] = k
        return kernel

    # Single convolutional layer whose weights are the fixed kernel above.
    conv = keras.layers.Conv2D(3,
                               k.shape,
                               kernel_initializer=kernel_init,
                               input_shape=china.shape)
    model = keras.Sequential([conv])
    model.build()

    # Filter the image and clip the result to [0, 1] for display.
    out = np.clip(model.predict(inp)[0], 0, 1)

    # Original image, kernel and filtered image side by side.
    panels = (
        (china, "Original"),
        (k, "Kernel (%s)" % l),
        (out, "Convoluted Image"),
    )
    for pos, (img, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, pos)
        plt.imshow(img)
        plt.title(title)
        plt.axis("off")

    plt.show()

<div class="qst">

* What effect will the filter corresponding to a $20 \times 20$ matrix with a constant value of $\frac{1}{400}$ produce?

</div>

## Deep Convolutional Neural Network

### Dataset

The MNIST dataset is modified so that each sample has dimension $28 \times 28 \times 1$, since the convolutional layers assume that the last dimension is the channel (in this case, there is only one channel since the image is in greyscale).

In [None]:
# Add a trailing single-channel axis: each sample becomes 28 x 28 x 1.
x_tr = np.reshape(x_tr, (-1, 28, 28, 1))
x_te = np.reshape(x_te, (-1, 28, 28, 1))

# One-hot encode the labels over the 10 digit classes.
y_tr, y_te = (keras.utils.to_categorical(labels, num_classes=10)
              for labels in (y_tr, y_te))

The Deep CNNs are easily defined in Keras using convolutional layers.

In [None]:
# Two convolutional layers, max-pooling and dropout, followed by a dense
# classifier with a 10-way softmax output.
cnn = keras.Sequential([
    keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=(28, 28, 1)),
    keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Dropout(0.2),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(10, activation="softmax"),
])

cnn.summary()

In [None]:
# Cross-entropy over the one-hot labels, Adam optimizer, accuracy as metric.
cnn.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

### Training

The training is the standard of any DNN.

In [None]:
# NOTE(review): validation_split=0.75 sets aside 75% of the training samples
# for validation, so only 25% are used to fit the weights — confirm this is
# intended (e.g. to keep training fast) and not a swapped 0.25.
history = cnn.fit(
    x_tr,
    y_tr,
    batch_size=256,
    epochs=5,
    validation_split=0.75,
)

### Evaluation

The evolution of the errors can show over-fitting problems.

In [None]:
# evaluate() returns the loss followed by the accuracy metric.
test_loss, test_acc = cnn.evaluate(x_te, y_te, verbose=0)
print("Test accuracy: %.3f%%" % (100 * test_acc))

In [None]:
plt.figure(figsize=(20, 5))

# One panel per quantity, each with its training and validation curves.
curves = (
    ("accuracy", "val_accuracy", "Accuracy"),
    ("loss", "val_loss", "Loss"),
)
for pos, (train_key, val_key, name) in enumerate(curves, start=1):
    plt.subplot(1, 2, pos)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(name)
    plt.ylabel(name)
    plt.xlabel("Epoch")
    plt.legend(["Train", "Validation"])

plt.show()

### Prediction

In [None]:
# Class scores for every test image.
preds = cnn.predict(x_te)

# Turn the one-hot targets and the predicted scores back into class indices.
y_te_t = y_te.argmax(axis=1)
y_te_p = preds.argmax(axis=1)

cm = confusion_matrix(y_true=y_te_t, y_pred=y_te_p)

In [None]:
import seaborn as sn

# The confusion matrix `cm` computed in the previous cell is reused, so the
# network is not run over the whole test set a second time.
plt.figure(figsize=(10, 8))
# fmt="d" prints the counts as plain integers; the default annotation format
# would show large counts in scientific notation.
sn.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.axis("equal")
plt.axis("off")

plt.show()

# Recurrent Neural Networks

## Dataset

A simple time series is generated next as an example to illustrate the RNNs.

In [None]:
# Sine wave sampled at 513 points over [-8*pi, 8*pi].
x = np.sin(np.linspace(-8 * np.pi, 8 * np.pi, 513))

# One-step-ahead task: the target of each sample is the next value of the
# series. Inputs are shaped (samples, 1, 1) and targets (samples, 1).
y = x[1:].reshape(-1, 1)
x = x[:-1].reshape(-1, 1, 1)

# Chronological split: the last 30% of the series is the test set.
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.3, shuffle=False)

# Training targets followed by the test targets on a common time axis.
n_tr, n_te = len(y_tr), len(y_te)
plt.plot(np.arange(n_tr), y_tr.ravel())
plt.plot(np.arange(n_tr, n_tr + n_te), y_te.ravel())
plt.show()

## Recurrent Neural Network

Keras provides an LSTM layer, which includes as many LSTM units as desired.

In [None]:
# Stateful LSTM with 50 units fed one sample at a time (fixed batch shape
# (1, 1, 1)), followed by a single linear output unit.
rnn = keras.Sequential([
    keras.layers.LSTM(50, batch_input_shape=(1, 1, 1), stateful=True),
    keras.layers.Dense(1),
])

rnn.summary()

In [None]:
# Regression setting: squared error minimized with Adam.
rnn.compile(
    optimizer="adam",
    loss="mean_squared_error",
)

### Training

The network can be trained step by step, using batches of size $1$ and preserving the state of the network between batches.

In [None]:
# A stateful LSTM carries its internal state from one batch to the next, and
# the state is not cleared between epochs. Without a reset, every epoch after
# the first would start from the state left at the end of the training
# series, which does not correspond to its first sample. The state is
# therefore cleared at the beginning of each epoch, mirroring the reset done
# before predicting.
reset_states_cb = keras.callbacks.LambdaCallback(
    on_epoch_begin=lambda epoch, logs: rnn.reset_states()
)
history_rnn = rnn.fit(x_tr, y_tr, epochs=10, batch_size=1, shuffle=False,
                      callbacks=[reset_states_cb])

### Prediction

Before predicting over the test set, the training set is used to initialize the state of the network.

In [None]:
# Clear the LSTM state, then run the whole training series through the
# network; the output of that pass is discarded, it is only run before the
# test predictions so that the state is carried over into them.
rnn.reset_states()
rnn.predict(x_tr, batch_size=1)
# Predictions over the test series, one sample per batch.
preds_rnn = rnn.predict(x_te, batch_size=1)

This cell illustrates the prediction of the RNN, and it depicts the input versus both the predicted output and the target.

In [None]:
# Target and predicted series over the test period.
for series, name in ((y_te, "Real"), (preds_rnn, "Pred")):
    plt.plot(series.ravel(), label=name)
plt.legend()
plt.show()

# Input value against the target (left) and against the prediction (right).
for pos, (values, name) in enumerate(((y_te, "Real"), (preds_rnn, "Pred")), start=1):
    plt.subplot(1, 2, pos)
    plt.scatter(x_te, values)
    plt.xlabel("Input")
    plt.ylabel(name)
plt.show()

<div class="qst">

* From the predictions above, does the RNN output depend only on the input (i.e., the value in the previous instant), or does it depend also on the context? Why?

</div>

# Generative Adversarial Network

## Discriminator

The discriminator is a DNN that takes as input an image, and classifies it between real and generated.

In [None]:
def create_discriminator():
    """Build and compile the convolutional discriminator.

    The network takes a 28 x 28 x 1 image and outputs a single sigmoid unit.

    Returns:
        The compiled model (binary cross-entropy loss, RMSprop optimizer).
    """
    layers = [keras.Input(shape=(28, 28, 1))]

    # Three convolutional stages, each followed by a LeakyReLU activation.
    for filters, strides in ((64, 2), (128, 2), (128, 1)):
        layers.append(keras.layers.Conv2D(filters, kernel_size=4, strides=strides, padding="same"))
        layers.append(keras.layers.LeakyReLU(alpha=0.2))

    # Classification head.
    layers.extend([
        keras.layers.Flatten(),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, activation="sigmoid"),
    ])

    discriminator = keras.Sequential(layers)
    discriminator.compile(loss="binary_crossentropy", optimizer="rmsprop")

    return discriminator

discriminator = create_discriminator()
discriminator.summary()

## Generator

The generator is a DNN that takes as input a random vector, and produces as output an image of the desired size.
Usually, its architecture is symmetric to that of the discriminator.

In [None]:
def create_generator():
    """Build the generator network (returned without compiling).

    The network takes a vector of 100 values, projects it to a 7 x 7 x 128
    tensor and applies transposed convolutions; the last layer is a
    single-filter convolution with sigmoid activation.

    Returns:
        The uncompiled generator model.
    """
    layers = [
        keras.Input(shape=(100, )),
        # Dense projection reshaped into a 7 x 7 map with 128 channels.
        keras.layers.Dense(7 * 7 * 128),
        keras.layers.Reshape((7, 7, 128)),
    ]

    # Transposed convolutions, each followed by a LeakyReLU activation.
    for filters, strides in ((128, 1), (256, 2), (512, 2)):
        layers.append(keras.layers.Conv2DTranspose(filters, kernel_size=4, strides=strides, padding="same"))
        layers.append(keras.layers.LeakyReLU(alpha=0.2))

    # Final convolution producing a single output channel.
    layers.append(keras.layers.Conv2D(1, kernel_size=5, padding="same", activation="sigmoid"))

    generator = keras.Sequential(layers)

    return generator

generator = create_generator()
generator.summary()

## GAN

The GAN is simply the concatenation of the generator and the discriminator.

In [None]:
def create_gan(discriminator, generator):
    """Chain the generator and the discriminator into one compiled model.

    The combined model maps a vector of 100 values to the discriminator's
    output for the generated image, and is used to train the generator.

    Args:
        discriminator: Compiled discriminator model.
        generator: Generator model.

    Returns:
        The compiled combined model (binary cross-entropy, RMSprop).

    Note:
        ``discriminator.trainable`` is set to ``False`` as a side effect.
        Code that trains the discriminator on its own should set it back to
        ``True`` first, as the training loop of this notebook does.
    """
    # The discriminator is frozen *before* the combined model is compiled:
    # Keras documents that the `trainable` values in effect at compile time
    # are the ones the compiled model keeps, so freezing it afterwards is not
    # guaranteed to stop the combined model from updating the discriminator.
    discriminator.trainable = False

    gan_input = keras.Input(shape=(100,))
    gan_output = discriminator(generator(gan_input))
    gan = keras.Model(inputs=gan_input, outputs=gan_output)
    gan.compile(loss="binary_crossentropy", optimizer="rmsprop")
    return gan

gan = create_gan(discriminator, generator)
gan.summary()

## Training

This function allows visualizing some generated samples.

In [None]:
def plot_generated_images(generator, dim=(5, 5), figsize=(5, 5), latent_dim=100):
    """Sample random vectors, generate images from them and show them in a grid.

    Args:
        generator: Model whose ``predict`` maps an array of shape
            ``(examples, latent_dim)`` to single-channel images.
        dim: ``(rows, columns)`` of the grid; ``rows * columns`` images are
            generated.
        figsize: Size of the matplotlib figure.
        latent_dim: Length of each random input vector. The default (100)
            keeps the previous hard-coded behaviour.
    """
    examples = int(np.prod(dim))
    noise = np.random.normal(loc=0, scale=1, size=[examples, latent_dim])
    generated_images = generator.predict(noise)

    # Drop the trailing channel axis. The image height and width are taken
    # from the generator output instead of being fixed to 28 x 28.
    generated_images = generated_images.reshape(generated_images.shape[:3])

    plt.figure(figsize=figsize)
    for i, image in enumerate(generated_images):
        plt.subplot(dim[0], dim[1], i + 1)
        plt.imshow(image)
        plt.axis("off")
    plt.tight_layout()
    plt.show()

The following cell trains the GAN, alternately training the discriminator and the generator.

In [None]:
max_iter = 51
batch_size = 128

# Only the images labelled as digit 4 are kept as real samples, with their
# values divided by 255.
(x_tr, y_tr), (x_te, y_te) = keras.datasets.mnist.load_data()
x = x_tr[y_tr == 4].astype("float32") / 255.

for i in range(max_iter):

    print("Iteration: %d" % i, end="\r")
    # Show a grid of generated samples every 10 iterations.
    if i % 10 == 0:
        plot_generated_images(generator)

    # ---- Discriminator step ----
    # A batch of generated images and a batch of real images drawn at random
    # (with replacement).
    noise = np.random.normal(0, 1, [batch_size, 100])
    generated_images = generator.predict(noise)
    real_idx = np.random.randint(low=0, high=x.shape[0], size=batch_size)
    real_images = x[real_idx]

    # Real images get a trailing channel axis so that they can be stacked
    # with the generator output; real samples come first.
    X = np.concatenate([real_images[..., np.newaxis], generated_images])

    # Labels: 1 for the real half, 0 for the generated half.
    y_dis = np.concatenate([np.ones(batch_size), np.zeros(batch_size)])

    # NOTE(review): Keras documents that `trainable` is fixed at compile()
    # time, so toggling it here may not affect the already-compiled models —
    # confirm against the Keras version in use.
    discriminator.trainable = True
    discriminator.train_on_batch(X, y_dis)

    # ---- Generator step ----
    # Fresh noise is labelled as real (1) and pushed through the combined
    # model.
    noise = np.random.normal(0, 1, [batch_size, 100])
    y_gen = np.ones(batch_size)

    discriminator.trainable = False
    gan.train_on_batch(noise, y_gen)

<div class="qst">

* Analyse the training of the GAN.
* Why are the labels of the samples modified?

</div>