In [1]:
import numpy as np

# load a pretrained image classification model
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input
from keras.models import Model

import keras.backend as kb
from sklearn.metrics.pairwise import pairwise_distances

import matplotlib.pyplot as plt

Using TensorFlow backend.


#### First extract and store content and style features

In [2]:
# define shape
im_shape = (224, 224, 3)
# content
content_path = 'best-artworks-of-all-time/images/images/Vincent_van_Gogh/Vincent_van_Gogh_27.jpg'
content_image = image.load_img(content_path, target_size=(im_shape[0], im_shape[1]))
# original image
content = image.img_to_array(content_image)
content = np.expand_dims(content, axis=0)
# create keras constant for content image
content = kb.constant(preprocess_input(content))

# style
style_path = 'data/normal_Le_lac_de_Montriond_0.jpg'
style_image = image.load_img(style_path, target_size=im_shape)
style = image.img_to_array(style_image)
style = np.expand_dims(style, axis=0)
# create a keras constant for style image
style = kb.constant(preprocess_input(style))

# generated image
# x = np.random.randint(0, 256, im_shape)
# x = np.ones(im_shape)
# x = np.expand_dims(x, axis=0)
# create a keras placehoder generated image
target = kb.placeholder((1, im_shape[0], im_shape[1], im_shape[2]))

In [3]:
# let's make an input tensor from it all
input_tensor = kb.concatenate([target, content, style], axis=0)

### Load model

In [4]:
model = VGG19(weights='imagenet', input_tensor=input_tensor, include_top=False)
# model = Model(inputs=base_model.input, outputs=base_model.get_layer('block4_conv2').output)
model.summary()

Model: "vgg19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

### Style transfer 

We need to specify which layers we want to use for style transfer. <br>
From the style image we will compute a style representation using the netwrok. Let's call this representation <b><i>Al</i></b><br>

#### What is a <i>style</i>

The style of an image is defined as the correlation between its different channels of the same layer.<br>
We want our target image to have the same correlation as the style image, that is the same <i>style</i>.<br>
To do so we need to update our image pixels in such way that the difference between style representations of our target image on one hand and the style image on the other hand be small.<br>
To measure this difference we're going to define a cost function.<br>

Before this, we need to compute the correlation between every two channels of the same layer. That is exactly what the Gram matrix does.<br>
The Gram matrix is the matrix of all possible inner products, 
i.e. \begin{equation*} g_{ij} = {v_i}^Tv_j \end{equation*}

In [5]:
def gram_matrix(x):
    x = kb.permute_dimensions(x, (2, 0, 1))
    return kb.dot(x, kb.transpose(x))

### Let's define the style loss function, one part of our final cost function

\begin{equation*}
\frac{1}{4N_l^2M_l^2}\sum_{i,j}(G_{ij}^l-A_{ij}^l)^2
\end{equation*}
with:<br>
<b>N</b>: number of filters or feature maps<br>
<b>M</b>: size of a feature map

In [6]:
def style_loss(style, target):
    s = gram_matrix(style)
    t = gram_matrix(target)
    tot = kb.sum(kb.square(s-t))
    N2 = im_shape[2]**2
    M2 = (im_shape[0]*im_shape[1])**2
    
    return tot / (4. * N2 * M2)

### Content loss, the other part of our final cost function

We want the target image to look like the content image. For this we measure similarity between their feature representation matrices.<br>
We don't want the target image to be a copy of the content but to share only key features. Hence the measure should take place at an intermediate layer: too close to the input layer only weak features are learnt, too close to the output layer only strong features are learnt.<br>
The studied paper advices the <i>block4_conv2</i> layer.That's exactly what we'll do!

In [7]:
def content_loss(content, target):
    return kb.sum(kb.square(target - content)) / 2.

### Total loss

The final loss function is a combination of both cost functions. Minimizing it insures a target image that shares key features of the content image and style features of the style image.<br>
One can prefer style over content or the other way around. This is controlled through <i>a</i> and <i>b</i>, style and content weights.<br>
A high ratio <i>a/b</i> emphasizes on style.

In [8]:
def total_loss(target, style, content, a, b):
    return a*content_loss(content, target) + b*style_loss(style, target)

### Let's specify layers of interests

In [9]:
style_layers = ['block1_conv2', 'block2_conv2', 'block3_conv2', 'block4_conv2', 'block5_conv2']
content_layer = 'block4_conv2'
# A = [Model(inputs=base_model.input, outputs=base_model.get_layer(layer).output).predict(a) for layer in style_layers]

#### The style image <b><i>a</i></b> is passed through the network and its style representation <b><i>A</i></b> of all layers included are computed and stored

### Remember how we defined an input tensor and fed it to our model:

 > input_tensor = kb.concatenate([target, content, style], axis=0) <br>

Now we will extract some features as follow (as named in the paper):<br>
- <b>P</b>: content features from the content image
- <b>A</b>: style features from the style image
- <b>G</b>: Target image's style features
- <b>F</b>: Target image's content features

In [10]:
# del w_content, w_style, loss_content, loss_style, loss_tot
# del A, G

In [11]:
# remember input_tensor at position 1 is the content image
P = model.get_layer(content_layer).output[1, :, :, :] # content
F = model.get_layer(content_layer).output[0, :, :, :] # target

# content and style weights
w_content = 0.0001
w_style = 0.9

# we have enough info to compute our losses
loss_content, loss_style, loss_tot = kb.variable(0.), kb.variable(0.), kb.variable(0.)

loss_content = w_content * content_loss(P, F)
# style image at position 2
for layer in style_layers:
    A = model.get_layer(layer).output[2, :, :, :] # style
    G = model.get_layer(layer).output[0, :, :, :] # target
    loss_style = loss_style + (w_style / len(style_layers)) * style_loss(A, G)

loss_tot = loss_content + loss_style

In [12]:
gradients = kb.gradients(loss_tot, target)[0]
loss_and_gradients = kb.function([target], [loss_tot, gradients])
# apply loss_tot and gradients tensors to target tensor

In [13]:
class Evaluator(object):
    
    def __init__(self):
        self.loss_value = None
        self.grads_values = None
    
    def loss(self, x):
        assert self.loss_value is None
        x = x.reshape((1, im_shape[0], im_shape[1], im_shape[2]))
        outs = loss_and_gradients([x])
    
        loss_value = outs[0]
        grad_values = outs[1].flatten().astype('float64')
        self.loss_value = loss_value
        self.grad_values = grad_values
        return self.loss_value
    
    def grads(self, x):
        assert self.loss_value is not None
        grad_values = np.copy(self.grad_values)
        self.loss_value = None
        self.grad_values = None
        return grad_values

evaluator = Evaluator()

In [14]:
from imageio import imwrite

In [15]:
def deprocess_image(x):
    x[:, :, 0] += 103.939
    x[:, :, 1] += 116.779
    x[:, :, 2] += 123.68
    x = x[:, :, ::-1]
    x = np.clip(x, 0, 255).astype('uint8')
    return x

In [None]:
from scipy.optimize import fmin_l_bfgs_b
# from scipy.misc import imsave
import time

result_prefix = 'my_result'
iterations = 20

content_path = 'best-artworks-of-all-time/images/images/Vincent_van_Gogh/Vincent_van_Gogh_27.jpg'
content_image = image.load_img(content_path, target_size=(im_shape[0], im_shape[1]))
# original image
x = image.img_to_array(content_image)
x = np.expand_dims(content, axis=0)

# x = preprocess_image(target_image_path)
x = x.flatten()
for i in range(iterations):
    print('Start of iteration', i)
    start_time = time.time()
    
    
    x, min_val, info = fmin_l_bfgs_b(evaluator.loss,
                                     x,
                                     fprime=evaluator.grads,
                                     maxfun=20)
    print('Current loss value:', min_val)
    img = x.copy().reshape(im_shape)  
    img = deprocess_image(img)
    fname = result_prefix + '_at_iteration_%d.png' % i
    imwrite(fname, img)
    print('Image saved as', fname)
    end_time = time.time()
    print('Iteration %d completed in %ds' % (i, end_time - start_time))

Start of iteration 0
Current loss value: 6017288000.0
Image saved as my_result_at_iteration_0.png
Iteration 0 completed in 216s
Start of iteration 1
Current loss value: 3144883700.0
Image saved as my_result_at_iteration_1.png
Iteration 1 completed in 212s
Start of iteration 2
Current loss value: 2243999000.0
Image saved as my_result_at_iteration_2.png
Iteration 2 completed in 212s
Start of iteration 3
Current loss value: 1769144400.0
Image saved as my_result_at_iteration_3.png
Iteration 3 completed in 212s
Start of iteration 4
Current loss value: 1462278400.0
Image saved as my_result_at_iteration_4.png
Iteration 4 completed in 206s
Start of iteration 5
