This notebook builds on `1_Neural Style Transfer - Baseline.ipynb` by experimenting with different optimizer settings.

In [1]:
import IPython.display as display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras.optimizers.schedules import ExponentialDecay
from tensorflow import keras

# Getting the Images

In [2]:
# *****************************************************************************
# To get this code to work on your system, you may need to re-write this cell,
# depending on where you plan to run these files from. As you can see, I chose
# to put the files on my Google Drive and run them from there using Google
# Colab. You may choose another approach, such as running files locally.
# *****************************************************************************
# Mount Google Drive so the image files below are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# base_image_path: the image whose content is kept (also used to size all images).
# style_image_path: the image whose style is transferred onto the base image.
base_image_path = "/content/drive/MyDrive/Colab Notebooks/CSCI_S-89/Final_Project/Images/Hinton.jpg"
style_image_path = "/content/drive/MyDrive/Colab Notebooks/CSCI_S-89/Final_Project/Images/GeorgeFloyd.jpg"

Mounted at /content/drive


In [3]:
# Every image is resized to the same target size when it is loaded: a fixed
# height, and a width chosen to keep the base image's aspect ratio. Widely
# differing sizes can make the style transfer more difficult.
img_height = 400
original_width, original_height = keras.utils.load_img(base_image_path).size
img_width = round(original_width * img_height / original_height)

In [4]:
def preprocess_image(image_path):
    """Load an image file and turn it into a VGG19-ready batch of one.

    The image is opened and resized to (img_height, img_width), converted to a
    numpy array, given a leading batch axis, and passed through
    keras.applications.vgg19.preprocess_input.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The preprocessed array, with a leading batch axis of length 1.
    """
    pil_image = keras.utils.load_img(image_path, target_size=(img_height, img_width))
    pixels = keras.utils.img_to_array(pil_image)
    # Add the batch axis expected by the network.
    batch = np.expand_dims(pixels, axis=0)
    # Transform the array for use with VGG19 (ImageNet weights).
    return keras.applications.vgg19.preprocess_input(batch)

def deprocess_image(img):
    """Convert a preprocessed array back into a displayable RGB image.

    Reverses the steps the comments in this notebook attribute to the VGG19
    preprocessing: the per-channel ImageNet means are added back and the
    channel order is flipped from BGR to RGB.

    Args:
        img: Array holding img_height * img_width * 3 values (for example the
            generated image with its batch axis). It is NOT modified.

    Returns:
        A uint8 array of shape (img_height, img_width, 3) with values clipped
        to [0, 255].
    """
    # Work on a private copy: reshape() may return a view of the input, and the
    # in-place channel offsets below would otherwise change the caller's array.
    img = np.array(img).reshape((img_height, img_width, 3))
    # Undo the zero-centering by adding back the ImageNet mean pixel values
    # (the channels are still in BGR order here).
    img[:, :, 0] += 103.939
    img[:, :, 1] += 116.779
    img[:, :, 2] += 123.68
    # Convert the image from BGR to RGB.
    img = img[:, :, ::-1]
    return np.clip(img, 0, 255).astype("uint8")

In [5]:
# Preprocessed content (base) and style images; these stay fixed.
base_image = preprocess_image(base_image_path)
style_image = preprocess_image(style_image_path)

# The generated image starts out as the base image. It is held in a
# tf.Variable so that the optimizer can update it. Building it from the
# already-preprocessed base image avoids loading the file a second time.
generated_image = tf.Variable(base_image)

# Setting up the Network

In [6]:
# VGG19 with ImageNet weights; include_top=False leaves out the top layers.
model = keras.applications.vgg19.VGG19(include_top=False, weights="imagenet")

# Map every layer name to that layer's output, and wrap the mapping in a model
# that returns all of those outputs as a dictionary keyed by layer name.
outputs_dict = {layer.name: layer.output for layer in model.layers}
feature_extractor = keras.Model(inputs=model.inputs, outputs=outputs_dict)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m80134624/80134624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


# Choosing the Content and Style Layers

In [7]:
# Layer whose activations are compared for the content loss.
content_layer_name = "block5_conv2"

# Layers whose activations are compared for the style loss: the first
# convolution of each of the five blocks (block1_conv1 ... block5_conv1).
style_layer_names = [f"block{block}_conv1" for block in range(1, 6)]

# Defining the Loss Functions

In [8]:
# Relative weights of the three terms that compute_loss adds together.
# content_weight scales the content loss.
content_weight = 2.5e-8
# style_weight is divided evenly across the style layers.
style_weight = 1e-6
# total_variation_weight scales the total variation loss of the generated image.
total_variation_weight = 1e-6

def content_loss(base_img, generated_img):
    """Return the sum of squared differences between the two inputs.

    Note that this is a sum, not a mean: the result is not divided by the
    number of elements.
    """
    difference = generated_img - base_img
    return tf.reduce_sum(tf.square(difference))

def gram_matrix(x):
    """Return the Gram matrix of x taken over its last axis.

    The last axis is moved to the front and the remaining two axes are
    flattened, giving one row per entry of the original last axis; the result
    is that matrix multiplied by its own transpose.
    Assumes x is a rank-3 feature map (height, width, channels) -- TODO confirm
    against callers.
    """
    channels_first = tf.transpose(x, (2, 0, 1))
    flattened = tf.reshape(channels_first, (tf.shape(channels_first)[0], -1))
    return tf.matmul(flattened, tf.transpose(flattened))

def style_loss(style_img, generated_img):
    """Return the scaled squared distance between two Gram matrices.

    The Gram matrices of the style features and of the generated-image
    features are compared element-wise, and the summed squared difference is
    divided by 4 * channels^2 * size^2.
    """
    style_gram = gram_matrix(style_img)
    generated_gram = gram_matrix(generated_img)
    # NOTE(review): the normalization uses a fixed channels = 3 and the pixel
    # count of the input image, not the shape of the feature maps being
    # compared -- confirm this is intended.
    channels = 3
    size = img_height * img_width
    normalizer = 4.0 * (channels ** 2) * (size ** 2)
    squared_distance = tf.reduce_sum(tf.square(style_gram - generated_gram))
    return squared_distance / normalizer

def total_variation_loss(x):
    """Return the total variation loss of the generated image x.

    For every position except the last row and column, the squared difference
    to the next element along axis 1 and along axis 2 are added, raised to the
    power 1.25, and summed.
    """
    # Top-left crop shared by both difference terms.
    anchor = x[:, : img_height - 1, : img_width - 1, :]
    shifted_axis1 = x[:, 1:, : img_width - 1, :]
    shifted_axis2 = x[:, : img_height - 1, 1:, :]
    axis1_term = tf.square(anchor - shifted_axis1)
    axis2_term = tf.square(anchor - shifted_axis2)
    return tf.reduce_sum(tf.pow(axis1_term + axis2_term, 1.25))

def compute_loss(generated_image, base_image, style_image):
    """Return the weighted sum of the content, style and total variation losses.

    The three images are stacked into one batch and run through
    feature_extractor. Content loss is taken at content_layer_name, style loss
    is averaged over style_layer_names, and the total variation loss is
    computed on the generated image itself.
    """
    # The stacking order fixes the batch indices used below:
    # 0 = base image, 1 = style image, 2 = generated image.
    batch = tf.concat([base_image, style_image, generated_image], axis=0)
    activations = feature_extractor(batch)

    # Start from zero and accumulate the three terms in order.
    total = tf.zeros(shape=())

    # Content term: base vs. generated activations at the content layer.
    content_activations = activations[content_layer_name]
    total = total + content_weight * content_loss(
        content_activations[0, :, :, :], content_activations[2, :, :, :]
    )

    # Style term: style vs. generated activations, equally weighted per layer.
    for name in style_layer_names:
        layer_activations = activations[name]
        layer_style_loss = style_loss(
            layer_activations[1, :, :, :], layer_activations[2, :, :, :]
        )
        total += (style_weight / len(style_layer_names)) * layer_style_loss

    # Total variation term, computed on the generated image.
    total += total_variation_weight * total_variation_loss(generated_image)
    return total

## Stochastic Gradient Descent

### Stochastic Gradient Descent, learning rate=100

In [9]:
@tf.function
def compute_loss_and_grads(generated_image, base_image, style_image):
    """Return (loss, gradient of the loss with respect to generated_image)."""
    with tf.GradientTape() as grad_tape:
        total_loss = compute_loss(generated_image, base_image, style_image)
    return total_loss, grad_tape.gradient(total_loss, generated_image)

# Start from the base image inside this cell, as the later experiment cells
# do, so that re-running the cell repeats the run instead of continuing it.
generated_image = tf.Variable(base_image)

# Plain SGD; the learning rate starts at 100 and is multiplied by 0.96 for
# every 100 optimizer steps.
optimizer = keras.optimizers.SGD(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=100.0, decay_steps=100, decay_rate=0.96
    )
)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")
Train step: 4000


This generated image has been created using the settings that Chollet chose. It will serve as a baseline image, from which we will try to improve. We will start by experimenting with the choice of optimizer, as well as the optimizer settings.

To start with, let's try SGD with a smaller learning rate. I must admit that I was surprised to see Chollet's choice of 100 for the learning rate. It seems so high. Let's try a smaller setting and see how it looks. We will keep the same decay settings, but let's add some momentum.

### Stochastic Gradient Descent, learning rate=10

In [10]:
# SGD with momentum 0.9; the learning rate starts at 10 and is multiplied by
# 0.96 for every 100 optimizer steps.
# NOTE(review): initial_learning_rate is written as a float (10.0, not 10).
# Keras documents this argument as a Python float, and the schedule appears to
# take its working dtype from it, so an integer literal risks the decay being
# computed in integer arithmetic. Re-check this run's result after the change.
optimizer = keras.optimizers.SGD(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=10.0, decay_steps=100, decay_rate=0.96
    ),
    momentum=0.9
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


Well, that didn't work out at all. Okay, let's try a different optimizer. We will first try the Adam optimizer with what looks like a reasonable set of initial settings.

## Adam, learning rate=0.001

In [11]:
# Adam with a constant learning rate of 0.001.
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.99, beta_2=0.999, epsilon=1e-1)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


Okay, perhaps I chose too small a learning rate. Let's try again, this time with a bigger learning rate.

## Adam, learning rate=0.01

In [12]:
# Adam with a constant learning rate of 0.01.
optimizer = keras.optimizers.Adam(learning_rate=0.01, beta_1=0.99, beta_2=0.999, epsilon=1e-1)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


That's a bit better, and it looks nicer than the image generated with the Stochastic Gradient Descent optimizer. The structure of the face is better preserved, but we need something closer to the straight lines in the style image. Let's try the combination of a more aggressive learning rate (0.1) together with an exponential decay schedule to gradually reduce the learning rate over time.

## Adam, learning rate=0.1, decay

In [13]:
# Adam; the learning rate starts at 0.1 and is multiplied by 0.96 for every
# 100 optimizer steps. ExponentialDecay is imported in the notebook's import
# cell.
optimizer = keras.optimizers.Adam(
    learning_rate=ExponentialDecay(initial_learning_rate=0.1, decay_steps=100, decay_rate=0.96),
    beta_1=0.99,
    beta_2=0.999,
    epsilon=1e-1
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


That's a little bit better again. There's more definition for the different straight line sections of the face. Let's try again, this time with an even bigger learning rate.

## Adam, learning rate=0.75, decay

In [14]:
# Adam; the learning rate starts at 0.75 and is multiplied by 0.9 for every
# 100 optimizer steps (note: 0.9 here, where the other decayed runs use 0.96).
optimizer = keras.optimizers.Adam(
    learning_rate=ExponentialDecay(initial_learning_rate=0.75, decay_steps=100, decay_rate=0.9),
    beta_1=0.99,
    beta_2=0.999,
    epsilon=1e-1
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


This is slightly better again. It's subtle, but there is a little more definition. I suspect we are close to the limit on improving this using Adam optimization, and there is no sign of the colors from the style image coming through. Let's try another optimizer.

## RMSProp, learning rate=0.01, decay

Let's try RMSProp optimization, with a learning rate of 0.01 together with an exponential decay schedule.

In [15]:
# RMSprop; the learning rate starts at 0.01 and is multiplied by 0.96 for
# every 100 optimizer steps.
optimizer = keras.optimizers.RMSprop(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.01, decay_steps=100, decay_rate=0.96
    ),
    rho=0.9,
    epsilon=1e-7
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


Interestingly, this generated image is very similar to the generated image for the Adam optimizer run with a learning rate of 0.75. But here, we are using an RMSProp optimizer with a learning rate of only 0.01. Let's see what it looks like with RMSProp optimization and a higher learning rate.

## RMSProp, learning rate=0.1, decay

In [16]:
# RMSprop; the learning rate starts at 0.1 and is multiplied by 0.96 for
# every 100 optimizer steps.
optimizer = keras.optimizers.RMSprop(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.1, decay_steps=100, decay_rate=0.96
    ),
    rho=0.9,
    epsilon=1e-7
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


Okay, that's a bit more like it. We're starting to get more elements from the style image coming through. This is better than our baseline image because the elements of the style image are a little more pronounced. For example, look at the forehead.

Let's see if increasing the learning rate to 0.5 continues the improvement...

## RMSProp, learning rate=0.5, decay

In [17]:
# RMSprop; the learning rate starts at 0.5 and is multiplied by 0.96 for
# every 100 optimizer steps.
optimizer = keras.optimizers.RMSprop(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5, decay_steps=100, decay_rate=0.96
    ),
    rho=0.9,
    epsilon=1e-7
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


This image is actually not as good as the previous one (RMSProp, learning rate=0.1, decay). Some elements of the style are getting washed out in this image.

It looks like we've found the bounds on RMSProp optimization. Let's try another optimizer. Nadam is Adam with Nesterov momentum, and Adam itself builds on the adaptive learning rates of RMSprop. Let's give it a spin...

## Nadam, learning rate=0.01, decay

In [18]:
# Nadam; the learning rate starts at 0.01 and is multiplied by 0.96 for
# every 100 optimizer steps.
optimizer = keras.optimizers.Nadam(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.01, decay_steps=100, decay_rate=0.96
    ),
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-7
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


Okay, it looks like we need to increase the learning rate.

## Nadam, learning rate=0.1, decay

In [19]:
# Nadam; the learning rate starts at 0.1 and is multiplied by 0.96 for
# every 100 optimizer steps.
optimizer = keras.optimizers.Nadam(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.1, decay_steps=100, decay_rate=0.96
    ),
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-7
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


That's pretty good, but if you look closely it's still not as good as RMSProp with a learning rate of 0.1. Let's try increasing the learning rate again...

## Nadam, learning rate=0.5, decay

In [20]:
# Nadam; the learning rate starts at 0.5 and is multiplied by 0.96 for
# every 100 optimizer steps.
optimizer = keras.optimizers.Nadam(
    learning_rate=keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5, decay_steps=100, decay_rate=0.96
    ),
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-7
)

# Restart from the base image so this run is independent of earlier ones.
generated_image = tf.Variable(base_image)

iterations = 4000
for i in range(1, iterations + 1):
    loss, grads = compute_loss_and_grads(generated_image, base_image, style_image)
    optimizer.apply_gradients([(grads, generated_image)])
    print(".", end='', flush=True)
    if i % 100 == 0:
        img = deprocess_image(generated_image.numpy())
        # Clear the old output first and report afterwards; printing the loss
        # before clear_output(wait=True) means it is erased with the old output.
        display.clear_output(wait=True)
        # Show a PIL image so the preview renders as a picture in any notebook
        # front end; scale=False keeps the uint8 pixel values unchanged.
        display.display(keras.utils.array_to_img(img, scale=False))
        print(f"Train step: {i}, loss={float(loss):.2f}")

Train step: 4000


This is quite good. In my opinion, it rivals the generated image using RMSProp with a learning rate of 0.1. However, if you look very closely, there are slightly sharper lines and better color contrasts in the image generated using RMSProp with a learning rate of 0.1.