In [2]:
from __future__ import division
import numpy as np
import tensorflow as tf
#import matplotlib.pyplot as plt
from glob import glob
import time
import os

from ops import *
from utils import *
# Expose only the GPU with index 1 to this process. This overrides any
# CUDA_VISIBLE_DEVICES value already present in the environment.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Session options passed to tf.Session(config=config) by the training cell.
config =  tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving it up front.
config.gpu_options.allow_growth=True
# Disabled alternative: cap the process at a fixed fraction of GPU memory.
#config.gpu_options.per_process_gpu_memory_fraction = 0.6
#tf.Session(config=config))


Today we are going to train a GAN to generate faces, and then have some fun playing with it. Generative adversarial networks (GANs) are deep neural net architectures comprised of two nets, pitting one against the other (thus the “adversarial”). One neural network, called the generator, generates new faces, while the other, the discriminator, decides whether each face it reviews belongs to the actual training dataset or not.

First, download the aligned celebrity faces from this <a href="https://yadi.sk/d/xjuClJJH3MAVXh">link</a> and extract them into a folder next to this IPython notebook.

# Implementation of GAN

The constants below depend on your dataset and your choice of architecture.

In [3]:
# Start from an empty graph so that re-running the notebook from this cell does
# not pile new ops on top of a previously built model.
tf.reset_default_graph()

z_dim = 100        # width of the latent vector fed to the generator
image_size = 64    # height and width of the (square) training images
batch_size = 64    # rows per training batch; also baked into the generator's deconv output shapes
sample_size = 64   # number of latent vectors drawn when sampling faces
# NOTE(review): `owres` looks like a truncated `lowres`; it is not read anywhere
# in this notebook, so the name is kept as-is -- confirm before renaming.
owres = 8
gf_dim = 64        # base channel multiplier used by the generator
df_dim = 64        # base channel multiplier used by the discriminator
gfc_dim = 1024     # not read by the cells of this notebook
dfc_dim = 1024     # not read by the cells of this notebook
c_dim = 3          # number of image channels
lam = 0.1          # not read by the cells of this notebook

image_shape = [image_size, image_size, c_dim]

# Batch-norm layers are created once here and looked up by index inside
# discriminator() / generator().
d_bns = [batch_norm(name='d_bn{}'.format(i)) for i in range(4)]

# floor(log2(image_size)), computed with exact integer arithmetic. The previous
# math.log(...) / math.log(2) form relied on `math` leaking in through a star
# import and truncated a floating-point ratio.
log_size = image_size.bit_length() - 1
g_bns = [batch_norm(name='g_bn{}'.format(i)) for i in range(log_size)]


Let's define GAN. To do it, we need to define generator, discriminator and loss functions.

Here are some tips on the architecture of the generator:
1. The deeper the layer, the fewer filters it uses.
2. Apply deconvolutions-relu layers to achieve input image shape.
3. Use batch normalization before nonlinearity for speed and stability of learning.
4. Use tanh activation at the end of network (in this case images should be scaled to [-1, 1])
5. To force generator not to collapse and produce different outputs initialize bias with zero (see linear layer).

Other useful tips: https://github.com/soumith/ganhacks. See the example architecture below. You may also use the layers defined in ops.py. <b> Please use layer names starting with "g\_" for the generator and "d_" for the discriminator.</b>

<img src="https://raw.githubusercontent.com/carpedm20/DCGAN-tensorflow/master/DCGAN.png">

## Writing generator function (1 point)

In [10]:
def generator(z):
    """Build the generator graph that maps latent vectors to images.

    The latent input is projected by a linear layer and reshaped to a 4x4
    feature map with ``gf_dim * 8`` channels. It then goes through a chain of
    ``deconv2d`` layers whose requested output size doubles (8, 16, ...) while
    the channel multiplier halves, until ``image_size`` is reached. The last
    layer emits ``c_dim`` channels and is squashed with ``tanh``.

    Reads the module-level ``is_training`` placeholder, ``g_bns``, ``gf_dim``,
    ``batch_size``, ``image_size`` and ``c_dim``.

    Args:
        z: Latent tensor handed to the first linear layer (the notebook feeds
            a ``[None, z_dim]`` placeholder). Every deconv output shape
            hard-codes ``batch_size``, so presumably exactly ``batch_size``
            rows must be fed -- confirm against ``deconv2d`` in ops.py.

    Returns:
        ``tanh`` of the last deconvolution, whose requested output shape is
        ``[batch_size, size, size, c_dim]``.
    """
    with tf.variable_scope("generator", reuse=tf.AUTO_REUSE):
        # Only the projected activations are needed; weights and bias are ignored.
        z_, _, _ = linear(z, gf_dim*8*4*4, 'g_h0_lin', with_w=True)

        hs = [None]
        hs[0] = tf.reshape(z_, [-1, 4, 4, gf_dim * 8])
        hs[0] = tf.nn.relu(g_bns[0](hs[0], is_training))

        i = 1  # Index of the layer being built.
        depth_mul = 8  # Channel multiplier; halves as the spatial size grows.
        size = 8  # Requested spatial size; doubles every layer.

        while size < image_size:
            hs.append(None)
            name = 'g_h{}'.format(i)
            hs[i], _, _ = deconv2d(hs[i-1],
                [batch_size, size, size, gf_dim*depth_mul], name=name, with_w=True)
            hs[i] = tf.nn.relu(g_bns[i](hs[i], is_training))

            i += 1
            depth_mul //= 2
            size *= 2

        # Final layer: no batch norm, and the channel count follows c_dim so the
        # output matches the channel count of the real-image placeholder.
        hs.append(None)
        name = 'g_h{}'.format(i)
        hs[i], _, _ = deconv2d(hs[i - 1],
            [batch_size, size, size, c_dim], name=name, with_w=True)

        return tf.nn.tanh(hs[i])

Now let's define the discriminator. The discriminator takes a 3D tensor as input and outputs one number: the probability that the input is a real image.

Some advice for discriminator's architecture:
1. Use batch normalization between convolutions and nonlinearities.
2. Use leaky relu with the leak about 0.2.
3. The deeper the layer, the more filters you can use.

If you use batch normalization, please define every layer in its own scope and pass the is_training parameter to it. Alternatively, you may use the batch normalization class from ops.py. Do not forget to flatten the tensor after the convolution blocks.

## Writing discriminator function (1 point)

In [11]:
def discriminator(image, reuse=False):
    """Build the discriminator graph that scores a batch of images.

    Four ``conv2d`` layers with leaky-ReLU activations (batch norm on all but
    the first) are followed by a single linear unit on the flattened features.

    Reads the module-level ``is_training`` placeholder, ``d_bns`` and ``df_dim``.

    Args:
        image: Image batch tensor to score.
        reuse: When true, the variable scope is explicitly switched to reuse
            mode. The scope is opened with ``tf.AUTO_REUSE`` either way.

    Returns:
        A pair ``(probability, logits)`` where ``probability`` is the sigmoid
        of ``logits``.
    """
    with tf.variable_scope("discriminator", reuse=tf.AUTO_REUSE) as scope:
        if reuse:
            scope.reuse_variables()

        h0 = lrelu(conv2d(image, df_dim, name='d_h0_conv'))
        h1 = lrelu(d_bns[0](conv2d(h0, df_dim*2, name='d_h1_conv'), is_training))
        h2 = lrelu(d_bns[1](conv2d(h1, df_dim*4, name='d_h2_conv'), is_training))
        h3 = lrelu(d_bns[2](conv2d(h2, df_dim*8, name='d_h3_conv'), is_training))

        # Derive the flattened width from the static shape instead of
        # hard-coding 8192. With a fixed width, a different image_size or df_dim
        # would silently fold feature elements into the batch axis.
        flat_dim = 1
        for dim in h3.get_shape().as_list()[1:]:
            flat_dim *= dim
        h4 = linear(tf.reshape(h3, [-1, flat_dim]), 1, 'd_h4_lin')

        return tf.nn.sigmoid(h4), h4


Now let's define generator and discriminator.

In [12]:
# Boolean switch handed to every batch-norm layer; generator() and
# discriminator() read this module-level placeholder directly.
is_training = tf.placeholder(tf.bool, name='is_training')
# Batch of real images: [batch, image_size, image_size, c_dim].
images = tf.placeholder(
    tf.float32, [None] + image_shape, name='real_images')

# Latent vectors fed to the generator.
z = tf.placeholder(tf.float32, [None, z_dim], name='z')
z_sum = tf.summary.histogram("z", z)

# Generated images.
G = generator(z)

# Discriminator output on real images: (sigmoid probability, raw logits).
D, D_logits = discriminator(images)

# Discriminator output on generated images, built in the same variable scope
# as the call above.
D_, D_logits_ = discriminator(G, reuse=True)

# TensorBoard summaries for both discriminator outputs and the generated images.
d_sum = tf.summary.histogram("d", D)
d__sum = tf.summary.histogram("d_", D_)
G_sum = tf.summary.image("G", G)


Write the definitions of the loss functions according to the formulas:
$$ D\_loss = \frac{-1}{m} \sum_{i=1}^{m}[\log{D(x_i)} + \log{(1 - D(G(z_i)))}]$$
$$ G\_loss = \frac{1}{m} \sum_{i=1}^{m} \log{(1 - D(G(z_i)))}$$

Or for better learning you may try other loss for generator:
$$ G\_loss = \frac{-1}{m} \sum_{i=1}^{m} \log{(D(G(z_i)))}$$

## Writing loss functions (2 points)

In [13]:
"""
        You code goes here. Define discriminator and generator losses
"""
def _mean_sigmoid_xent(logits, labels):
    """Return the batch mean of the sigmoid cross-entropy of `logits` vs `labels`."""
    per_example = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                          labels=labels)
    return tf.reduce_mean(per_example)


# Discriminator: real images are labelled 1, generated images 0.
d_loss_real = _mean_sigmoid_xent(D_logits, tf.ones_like(D))
d_loss_fake = _mean_sigmoid_xent(D_logits_, tf.zeros_like(D_))
# Generator: the alternative loss from the text above, -mean(log D(G(z))),
# i.e. generated images scored against the label 1.
g_loss = _mean_sigmoid_xent(D_logits_, tf.ones_like(D_))

d_loss = d_loss_real + d_loss_fake

# Scalar summaries for TensorBoard.
d_loss_real_sum = tf.summary.scalar("d_loss_real", d_loss_real)
d_loss_fake_sum = tf.summary.scalar("d_loss_fake", d_loss_fake)
g_loss_sum = tf.summary.scalar("g_loss", g_loss)
d_loss_sum = tf.summary.scalar("d_loss", d_loss)

# Split the trainable variables between the two networks using the "d_" / "g_"
# naming convention (a plain substring match on the variable name).
t_vars = tf.trainable_variables()
d_vars = []
g_vars = []
for var in t_vars:
    if 'd_' in var.name:
        d_vars.append(var)
    if 'g_' in var.name:
        g_vars.append(var)

# Keep only the most recent checkpoint on disk.
saver = tf.train.Saver(max_to_keep=1)


Create optimizers. We use different optimizers for the discriminator and the generator, so we need separate prefixes for the discriminator and generator variables (g_ for generator, d_ for discriminator).

In [14]:
# Adam hyper-parameters shared by both optimizers.
learning_rate = 0.0002
beta1 = 0.5

# Each optimizer only updates its own network's variables.
d_optim = tf.train.AdamOptimizer(learning_rate, beta1=beta1).minimize(
    d_loss, var_list=d_vars)
g_optim = tf.train.AdamOptimizer(learning_rate, beta1=beta1).minimize(
    g_loss, var_list=g_vars)

Load data:

In [17]:
import cv2
def center_crop(x, crop_h, crop_w=None):
    """Crop a centred window twice the target size and shrink it to the target.

    The original images are 218*178. Cropping the target size directly at the
    centre would keep only a small part of the face, so a window of
    ``2*crop_h`` x ``2*crop_w`` is cut around the centre and then resized down
    to ``crop_h`` x ``crop_w``.

    Args:
        x: Image array whose first two axes are (height, width).
        crop_h: Height of the returned image.
        crop_w: Width of the returned image; defaults to ``crop_h``.

    Returns:
        The resized crop with ``crop_h`` rows and ``crop_w`` columns.
    """
    if crop_w is None:
        crop_w = crop_h
    h, w = x.shape[:2]
    # Clamp at 0: for an image smaller than the 2x window a negative origin
    # would wrap around and select a strip at the far edge instead.
    top = max(h//2 - crop_h, 0)
    left = max(w//2 - crop_w, 0)
    window = x[top:top+crop_h*2, left:left+crop_w*2]
    # cv2.resize takes dsize as (width, height), not (height, width).
    return cv2.resize(window, (crop_w, crop_h))

def transform(image, npx=64, is_crop=True):
    """Optionally centre-crop `image` to `npx` pixels and rescale it to [-1, 1].

    Args:
        image: Pixel data with values on the 0..255 scale.
        npx: Width/height in pixels passed to ``center_crop`` when cropping.
        is_crop: When false the image is only rescaled, not cropped.

    Returns:
        A NumPy array equal to ``pixels / 127.5 - 1``.
    """
    pixels = center_crop(image, npx) if is_crop else image
    return np.array(pixels)/127.5 - 1.
 
def get_image(image_path, image_size, is_crop=True):
    """Read the image at `image_path` and pass it through ``transform``.

    Args:
        image_path: Path handed to ``imread``.
        image_size: Target size forwarded to ``transform`` as ``npx``.
        is_crop: Forwarded to ``transform``.

    Returns:
        Whatever ``transform`` returns for the loaded image.
    """
    raw_image = imread(image_path)
    return transform(raw_image, image_size, is_crop)

DATA_PATH = '/data/luodan_data/img_align_celeba/' # Path to the dataset with celebA faces
data = glob(os.path.join(DATA_PATH, "*.jpg"))

# One row per numeric image id. Row `imgid` holds file <imgid>.jpg, hence the
# extra row.
# NOTE(review): if the ids start at 1, row 0 is never written and stays
# all-zero -- confirm whether that blank sample should be dropped.
N = len(data)+1

# The array is deliberately NOT called `images`: that name is the real-image
# placeholder of the graph, and rebinding and deleting it here would break
# every later cell that feeds `images`.
face_array = np.zeros((N, image_size, image_size, 3), dtype=np.float32)

for filepath in data:
    # "<id>.jpg" -> id, independent of how DATA_PATH is written.
    imgid = int(os.path.splitext(os.path.basename(filepath))[0])
    face_array[imgid] = get_image(filepath, image_size)

np.save('images.npy', face_array)
# Free the in-memory copy; the next cell reloads the array from disk.
del face_array

In [18]:
# Disabled alternative: train directly from a list of PNG file paths.
#data = glob(os.path.join(DATA_PATH, "*.png"))
#assert(len(data) > 0), "Length of training data should be more than zero"

# Reload the preprocessed faces written by the previous cell. From here on
# `data` is the image array, no longer the list of file paths.
data = np.load('images.npy')
# Shuffle the samples in place along the first axis. No random seed is set in
# this notebook, so the order differs between runs.
np.random.shuffle(data)

In [19]:
# Sanity check: (num_rows, image_size, image_size, 3), as allocated when images.npy was built.
data.shape

(202600, 64, 64, 3)

Functions for training and evaluations.

In [20]:
def save(checkpoint_dir, step):
    """Write a checkpoint of the current session into `checkpoint_dir`.

    Uses the module-level ``saver`` and ``sess``. The directory is created if
    it does not exist yet.

    Args:
        checkpoint_dir: Directory receiving the checkpoint files.
        step: Passed to the saver as ``global_step``.
    """
    already_there = os.path.exists(checkpoint_dir)
    if not already_there:
        os.makedirs(checkpoint_dir)

    checkpoint_prefix = os.path.join(checkpoint_dir, 'FVK_DCGAN_TF')
    saver.save(sess, checkpoint_prefix, global_step=step)

def load(checkpoint_dir):
    """Restore the latest checkpoint found in `checkpoint_dir` into ``sess``.

    Uses the module-level ``saver`` and ``sess``.

    Args:
        checkpoint_dir: Directory to search for checkpoint state.

    Returns:
        True when a checkpoint was found and restored, False otherwise.
    """
    print(" [*] Reading checkpoints...")

    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
        return False

    saver.restore(sess, ckpt.model_checkpoint_path)
    return True


## Training GAN (1 point + 2 for good results)

In [23]:
epoch = 2  # number of passes over the dataset
checkpoint_dir = "checkpoint"
sample_size = 64
sample_dir = 'samples'

for out_dir in (checkpoint_dir, sample_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

with tf.Session(config=config) as sess:
    # No fallback to the deprecated initializer: a bare `except:` here would
    # hide genuine initialisation errors.
    tf.global_variables_initializer().run()

    # Merged summaries get their own names so that `d_sum` (the histogram from
    # the graph cell) is not rebound and the cell can be re-run safely.
    g_sum_merged = tf.summary.merge(
        [z_sum, d__sum, G_sum, d_loss_fake_sum, g_loss_sum])
    d_sum_merged = tf.summary.merge(
        [z_sum, d_sum, d_loss_real_sum, d_loss_sum])
    writer = tf.summary.FileWriter("./logs", sess.graph)

    # Fixed inputs for the (currently disabled) periodic sampling block below.
    sample_z = np.random.normal(loc=0., scale=1., size=(sample_size, z_dim))
    sample_images = data[0:sample_size]

    counter = 1
    start_time = time.time()
    # Incomplete trailing batches are skipped.
    batch_idxs = data.shape[0] // batch_size

#     if load(checkpoint_dir):
#         print("OLD MODEL")
#     else:
#         print("NEW MODEL")

    try:
        # `epoch_idx` keeps the loop from overwriting the `epoch` count above.
        for epoch_idx in range(epoch):
            for idx in range(batch_idxs):
                batch_images = data[idx*batch_size:(idx+1)*batch_size]
                batch_z = np.random.normal(loc=0., scale=1., size=[batch_size, z_dim]) \
                            .astype(np.float32)

                d_feed = {images: batch_images, z: batch_z, is_training: True}
                g_feed = {z: batch_z, is_training: True}

                # Update D network
                sess.run(d_optim, feed_dict=d_feed)

                # Update G network
                sess.run(g_optim, feed_dict=g_feed)

                # Run g_optim twice to make sure that d_loss does not go to zero
                sess.run(g_optim, feed_dict=g_feed)

                counter += 1
                if np.mod(counter, 500) == 2:
                    # Losses are only evaluated when they are reported, in a
                    # single pass with batch norm in inference mode.
                    errD_fake, errD_real, errG = sess.run(
                        [d_loss_fake, d_loss_real, g_loss],
                        feed_dict={images: batch_images, z: batch_z, is_training: False})
                    print("Epoch: [{:2d}] [{:4d}/{:4d}] time: {:4.4f}, d_loss: {:.8f}, g_loss: {:.8f}".format(
                        epoch_idx, idx, batch_idxs, time.time() - start_time, errD_fake+errD_real, errG))

                    # Log summaries. Fetch the op itself, not a one-element
                    # list: add_summary expects a single serialized summary.
                    summary_str = sess.run(d_sum_merged, feed_dict=d_feed)
                    writer.add_summary(summary_str, counter)

                    summary_str = sess.run(g_sum_merged, feed_dict=g_feed)
                    writer.add_summary(summary_str, counter)

                if np.mod(counter, 5000) == 2:
                    save(checkpoint_dir, counter)

#                 if counter == 1  or np.mod(counter, 100) == 1:
#                     samples, d_loss_s, g_loss_s = sess.run(
#                         [G, d_loss, g_loss],
#                         feed_dict={z: sample_z, images: sample_images, is_training: False}
#                     )
#                     save_images(samples, [8, 8],
#                                 './samples/train_{:02d}_{:04d}.png'.format(epoch_idx, idx))
#                     print("[Sample] d_loss: {:.8f}, g_loss: {:.8f}".format(d_loss_s, g_loss_s))

        # Checkpoint the final weights so the cells below restore the fully
        # trained model and not the last periodic snapshot.
        save(checkpoint_dir, counter)
    finally:
        # Flush pending events and release the log file even when interrupted.
        writer.close()


NameError: name 'xrange' is not defined

If you generated something that looks like a face - it's cool! Add 2 points to your mark.

In [13]:
%%HTML
<video width="512" height="512" autoplay>
   <source src="fit.mp4" type="video/mp4">
</video> 
<!-- EXECUTE THIS CELL TO WATCH THE VIDEO -->

## Face interpolation (1 point)

Let's interpolate between faces: generate two vectors $z_1$ and $z_2$ and get a batch of vectors of the form $\alpha\cdot z_1 + (1- \alpha)\cdot  z_2, \alpha \in [0,1].$ Generate faces on them and look at results. The generator displays pictures in the range from -1 to 1, so use the inverse transform function from the file utils.py.

In [96]:
checkpoint_dir = "checkpoint"

# Same session options as the training cell (GPU memory grows on demand).
with tf.Session(config=config) as sess:
    tf.global_variables_initializer().run()

    # Without trained weights the generator would only emit noise, so fail
    # loudly instead of silently sampling from a freshly initialised network.
    if not load(checkpoint_dir):
        raise RuntimeError(
            "No checkpoint found in '{}'; train the model first.".format(checkpoint_dir))

    # The generator's output shapes are built with batch_size, and sample_size
    # equals batch_size here.
    sample_size = 64
    z1 = np.random.normal(loc=0., scale=1., size=(sample_size, z_dim))
    z2 = np.random.normal(loc=0., scale=1., size=(sample_size, z_dim))

    # 30 frames moving each of the 64 latent pairs from z2 (alpha=0) to z1 (alpha=1).
    for i, alpha in enumerate(np.linspace(0, 1, 30)):
        sample_z = alpha*z1 + (1-alpha)*z2

        # Only the generator output is needed; the losses (and therefore the
        # real-image input) are not evaluated.
        samples = sess.run(G, feed_dict={z: sample_z, is_training: False})
        save_images(samples, [8, 8], './mixed%02d.png' % i)



 [*] Reading checkpoints...


In [None]:
! ffmpeg -framerate 1  -i mixed%02d.png -c:v libx264 -r 30 -pix_fmt yuv420p mixed.mp4

In [98]:
%%HTML
<video width="512" height="512" autoplay>
   <source src="mixed.mp4" type="video/mp4">
</video> 
<!-- EXECUTE THIS CELL TO WATCH THE VIDEO -->

## Adding a smile (1 point + 1 point for good results)

Let's make a face smile. Find several vectors z for which the generator produces smiling faces, and several for which it does not. Five vectors in each group should be enough (but the more, the better).

Calculate the "smile vector" as the mean of the z vectors that generate smiling faces minus the mean of the z vectors that generate non-smiling faces.

Look at the result of applying the smile vector: compare the results of generation before and after the addition of the smile vector.

In [18]:
# Same session options as the training cell (GPU memory grows on demand).
with tf.Session(config=config) as sess:
    tf.global_variables_initializer().run()

    # Without trained weights the generator would only emit noise.
    if not load(checkpoint_dir):
        raise RuntimeError(
            "No checkpoint found in '{}'; train the model first.".format(checkpoint_dir))

    sample_size = 64
    #sample_z = np.random.uniform(-1,1, size=(sample_size, z_dim))
    # Kept as a global: the following cells index into `sample_z` to build the
    # smile vector.
    sample_z = np.random.normal(loc=0, scale=1.0, size=(sample_size, z_dim))

    # Only the generator output is needed. The previous version also fetched
    # the losses and fed them `data[1100:sample_size]`, which is an empty slice.
    samples = sess.run(G, feed_dict={z: sample_z, is_training: False})
    save_images(samples, [4,16], 'people.png')

 [*] Reading checkpoints...


In [19]:
%%HTML
<img src='people.png'>


In [70]:
# Hand-picked tile indices from people.png, which was saved as a 4 x 16 grid.
# An index is written as row * 16 + column (presumably row-major -- confirm
# against save_images in utils.py).
def _tile(row, col, cols=16):
    """Return the flat index of the tile at (`row`, `col`) in a `cols`-wide grid."""
    return row * cols + col


smile_id = [1, 6, 12, 13, _tile(3, 0), _tile(3, 15)]
no_smile_id = [2, _tile(1, 11), 14, _tile(1, 0), _tile(1, 14)]
man_id = [3, _tile(3, 4), _tile(3, 6), _tile(3, 13), _tile(3, 5)]

In [90]:
# Mean latent vector of each hand-picked group.
wo_smile = sample_z[smile_id].mean(axis=0)         # smiling woman
wo_no_smile = sample_z[no_smile_id].mean(axis=0)   # not smiling woman
man_no_smile = sample_z[man_id].mean(axis=0)       # not smiling man

# Smiling man = not smiling man + (smiling woman - not smiling woman).
man_smile = wo_smile - wo_no_smile + man_no_smile

# One copy of the smiling-man vector per sample, each with small uniform noise
# so that the generated faces differ slightly.
jitter = np.random.uniform(-.01, 0.01, size=(sample_size, z_dim))
man_smile_wn = np.zeros((sample_size, z_dim)) + man_smile + jitter


In [91]:
# Same session options as the training cell (GPU memory grows on demand).
with tf.Session(config=config) as sess:
    tf.global_variables_initializer().run()

    # Without trained weights the generator would only emit noise.
    if not load(checkpoint_dir):
        raise RuntimeError(
            "No checkpoint found in '{}'; train the model first.".format(checkpoint_dir))

    sample_size = 64

    # Only the generator output is needed; the losses (and therefore the
    # real-image input) are not evaluated.
    samples = sess.run(G, feed_dict={z: man_smile_wn, is_training: False})
    save_images(samples, [4, 16], 'man_smile.png')


 [*] Reading checkpoints...


In [95]:
%%HTML
<img src='man_smile.png'>


If the faces look really cool, add a bonus 1 point to your score.