# M2177.003100 Deep Learning <br> Final Project: Text to Image Synthesis (Tensorflow)

Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them. 

**To understand this work, please read the provided PPT file carefully.**

**Note**: certain details are missing or ambiguous on purpose, in order to test your knowledge on the related materials. However, if you really feel that something essential is missing and cannot proceed to the next step, then contact the teaching staff with clear description of your problem.

### Submitting your work:
<font color=red>**DO NOT clear the output of the training process**</font> so that TAs can grade both your code and results.  
**The TA will set the config file to 'eval_birds.yml' when evaluating the code on the 'hidden test dataset'. Thus, please make sure that your code can generate the data needed to measure the inception score and R-precision on the 'hidden test dataset'.**

## 1. Load datasets
The Birds dataset will be downloaded automatically if it is not located in the *data* directory. <br>

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os, nltk
from miscc.config import cfg, cfg_from_file
import pprint
import datetime
import dateutil.tz
import numpy as np
import scipy
from utils.data_utils import CUBDataset
from utils.loss import cosine_similarity
import pandas as pd
from scipy.io import loadmat
import re
import string
import random
import time

#################################################
# DO NOT CHANGE 
from utils.model import CNN_ENCODER, RNN_ENCODER, GENERATOR, DISCRIMINATOR
#################################################

%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Select the run configuration: use 'train_birds.yml' for training and
# 'eval_birds.yml' for evaluation (the TAs switch to the latter when grading).
cfg_from_file('cfg/train_birds.yml') # eval_birds.yml

# Echo the effective configuration so it is recorded in the notebook output.
print('Using config:')
pprint.pprint(cfg)

# Expose only the GPU named in the config to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU_ID

# Build a timestamped output directory name of the form
# sample/<dataset>_<config>_<YYYY_mm_dd_HH_MM_SS>; the directory itself is not created here.
now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
output_dir = 'sample/%s_%s_%s' % (cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)

Using config:
{'BATCH_SIZE': 64,
 'CHECKPOINT_DIR': './checkpoint',
 'CHECKPOINT_NAME': 'model.ckpt',
 'CNN': {'EMBEDDING_DIM': 0, 'H_DIM': 0},
 'CONFIG_NAME': 'text-to-image',
 'CUDA': False,
 'DATASET_NAME': 'birds',
 'DATA_DIR': 'data/birds',
 'EMBEDDING_TYPE': 'cnn-rnn',
 'GAN': {'B_ATTENTION': False,
         'B_CONDITION': False,
         'B_DCGAN': False,
         'CONDITION_DIM': 0,
         'DF_DIM': 0,
         'EMBEDDING_DIM': 0,
         'GF_DIM': 0,
         'R_NUM': 0,
         'Z_DIM': 512},
 'GPU_ID': '0',
 'IMAGE_SIZE': 256,
 'NUM_BATCH_FOR_TEST': 0,
 'RANDOM_SEED': 0,
 'RNN': {'EMBEDDING_DIM': 0,
         'H_DIM': 0,
         'TYPE': '',
         'VOCAB_SIZE': 0,
         'WORD_EMBEDDING_DIM': 0},
 'R_PRECISION_DIR': './evaluation',
 'R_PRECISION_FILE': 'r_precision.npz',
 'R_PRECISION_FILE_HIDDEN': 'r_precision_hidden.npz',
 'TEST': {'B_EXAMPLE': False,
          'GENERATED_HIDDEN_TEST_IMAGES': './evaluation/generated_images_hidden',
          'GENERATED_TEST_IMAGES'

  yaml_cfg = edict(yaml.load(f))


In [3]:
# Build the train and test splits of the birds dataset from cfg.DATA_DIR.
train_dataset = CUBDataset(cfg.DATA_DIR, split='train')
test_dataset = CUBDataset(cfg.DATA_DIR, split='test')

# Where each split lives on disk.
print(f'\ntrain data directory:\n{train_dataset.split_dir}')
print(f'test data directory:\n{test_dataset.split_dir}\n')

# Number of image filenames per split.
print(f'# of train filenames:{train_dataset.filenames.shape}')
print(f'# of test filenames:{test_dataset.filenames.shape}\n')

# First filename of each split.
# NOTE(review): the second label says "valid image" but prints the test split.
print(f'example of filename of train image:{train_dataset.filenames[0]}')
print(f'example of filename of valid image:{test_dataset.filenames[0]}\n')

# First caption of each split together with its id encoding
# (first line: train split, second line: test split; both share the same label).
print(f'example of caption and its ids:\n{train_dataset.captions[0]}\n{train_dataset.captions_ids[0]}\n')
print(f'example of caption and its ids:\n{test_dataset.captions[0]}\n{test_dataset.captions_ids[0]}\n')

# Number of raw captions per split.
print(f'# of train captions:{np.asarray(train_dataset.captions).shape}')
print(f'# of test captions:{np.asarray(test_dataset.captions).shape}\n')

# Shape of the id-encoded captions per split.
print(f'# of train caption ids:{np.asarray(train_dataset.captions_ids).shape}')
print(f'# of test caption ids:{np.asarray(test_dataset.captions_ids).shape}\n')

# Shape of the image arrays per split.
print(f'# of train images:{train_dataset.images.shape}')
print(f'# of test images:{test_dataset.images.shape}\n')

self.current_dir:
/home/chszerg/final-project-deep-learning-19-tf

self.data_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds

self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011.tgz

Dataset already exists
self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011/images

Load from:  data/birds/captions.pickle
self.current_dir:
/home/chszerg/final-project-deep-learning-19-tf

self.data_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds

self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011.tgz

Dataset already exists
self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011/images

Load from:  data/birds/captions.pickle

train data directory:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/train
test data directory:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/test

# of train filenames:(8855,)
# of test f

In [4]:
# Pull the image arrays and the id-encoded captions out of the dataset objects
# so the rest of the notebook can work with plain NumPy arrays.
train_images, test_images = train_dataset.images, test_dataset.images
train_captions, test_captions = (
    np.asarray(dataset.captions_ids) for dataset in (train_dataset, test_dataset)
)

# One shape per line: train images, test images, train captions, test captions.
print(train_images.shape, test_images.shape, train_captions.shape, test_captions.shape, sep='\n')

(8855, 256, 256, 3)
(2933, 256, 256, 3)
(88550, 20)
(29330, 20)


In [5]:
from skimage.transform import resize

# Downscale every training image to 64x64x3 and check that none was dropped.
train_images_64 = np.asarray([resize(image, (64, 64, 3)) for image in train_images])
print(train_images_64.shape)
assert train_images_64.shape[0] == train_images.shape[0]

# Same treatment for the test images.
test_images_64 = np.asarray([resize(image, (64, 64, 3)) for image in test_images])
print(test_images_64.shape)
assert test_images_64.shape[0] == test_images.shape[0]

(8855, 64, 64, 3)
(2933, 64, 64, 3)


In [6]:
# From here on the notebook works with the 64x64 images only.
train_images, test_images = train_images_64, test_images_64

# Bookkeeping used when sampling batches: caption index // n_captions_per_image
# is used later as the index of the matching image.
n_captions_per_image = 10
n_captions_train, n_images_train = len(train_captions), len(train_images)

In [7]:
import scipy.misc
import threading
import scipy.ndimage as ndi
from skimage import transform
from skimage import exposure
import skimage
from nltk.tokenize import RegexpTokenizer

def sent2ID(sample_sentence):
    """Convert one caption string into its id-encoded form.

    The sentence is lower-cased, split into ``\\w+`` tokens, stripped of
    non-ASCII characters, and every token known to ``train_dataset.wordtoix``
    is mapped to its id (unknown tokens are dropped). The id list is then
    passed through ``train_dataset.get_caption``.

    Args:
        sample_sentence: Non-empty caption text.

    Returns:
        A one-element list holding ``np.squeeze(x, axis=1)`` where ``x`` is the
        first value returned by ``train_dataset.get_caption``
        (presumably a fixed-length id vector -- TODO confirm against CUBDataset).

    Raises:
        ValueError: If ``sample_sentence`` is empty. The previous version called
            ``exit()`` here, which terminates the whole kernel instead of
            reporting the problem.
    """
    if len(sample_sentence) == 0:
        raise ValueError("sent2ID() requires a non-empty caption string")

    # Pairs of U+FFFD are decoding artefacts in the caption files; treat them as a space.
    cleaned = sample_sentence.replace("\ufffd\ufffd", " ")
    raw_tokens = RegexpTokenizer(r'\w+').tokenize(cleaned.lower())

    # Drop non-ASCII characters; tokens that become empty are discarded below.
    ascii_tokens = (tok.encode('ascii', 'ignore').decode('ascii') for tok in raw_tokens)
    word_ids = [
        train_dataset.wordtoix[tok]
        for tok in ascii_tokens
        if len(tok) > 0 and tok in train_dataset.wordtoix
    ]

    x, x_len = train_dataset.get_caption(word_ids)
    return [np.squeeze(x, axis=1)]

def ID2sent(sample_caption):
    """Map a sequence of word ids back to a list of words, skipping padding.

    Args:
        sample_caption: Iterable of word ids.

    Returns:
        List of ``train_dataset.ixtoword[ID]`` for every id that differs from
        ``train_dataset.ixtoword['<PAD>']``.
    """
    # NOTE(review): the padding value is looked up in `ixtoword` with a string
    # key; if that mapping is keyed by integer ids this raises KeyError and
    # `wordtoix['<PAD>']` was probably intended -- confirm against CUBDataset.
    return [
        train_dataset.ixtoword[ID]
        for ID in sample_caption
        if ID != train_dataset.ixtoword['<PAD>']
    ]

def get_random_int(min=0, max=10, number=5):
    """Draw `number` random integers, each uniformly from the closed range [min, max].

    Args:
        min: Lower bound (inclusive).
        max: Upper bound (inclusive).
        number: How many integers to draw.

    Returns:
        A list of `number` ints produced by `random.randint`.
    """
    draws = []
    for _ in range(number):
        draws.append(random.randint(min, max))
    return draws

def merge(images, size):
    """Tile a batch of RGB images into one grid image, filling row by row.

    Args:
        images: Array indexed as (n, height, width, 3).
        size: Two-element sequence (rows, cols) describing the grid.

    Returns:
        Float array of shape (height * rows, width * cols, 3); cells without
        an image stay zero.
    """
    tile_h, tile_w = images.shape[1], images.shape[2]
    canvas = np.zeros((tile_h * size[0], tile_w * size[1], 3))
    for idx, image in enumerate(images):
        # Row-major placement: idx -> (grid row, grid column).
        row, col = divmod(idx, size[1])
        top, left = row * tile_h, col * tile_w
        canvas[top:top + tile_h, left:left + tile_w, :] = image
    return canvas

def imsave(images, size, path):
    """Tile `images` into a `size` grid with `merge` and write it to `path`.

    Uses `scipy.misc.imsave`, which this notebook's own output reports as
    deprecated in SciPy 1.0.0 and scheduled for removal in 1.2.0.

    Returns:
        Whatever `scipy.misc.imsave` returns.
    """
    writer = scipy.misc.imsave
    grid = merge(images, size)
    return writer(path, grid)

def save_images(images, size, image_path):
    """Write a batch of images as one tiled grid image.

    Thin alias for `imsave` that only renames the path argument.

    Args:
        images: Batch of images to tile.
        size: Grid layout as (rows, cols).
        image_path: Destination file path.
    """
    return imsave(images=images, size=size, path=image_path)

def threading_data(data=None, fn=None, **kwargs):
    """Apply `fn` to every element of `data`, using one thread per element.

    Args:
        data: Indexable collection with a length; element i is passed to `fn`.
        fn: Callable invoked as `fn(data[i], **kwargs)`.
        **kwargs: Extra keyword arguments forwarded to every `fn` call.

    Returns:
        `np.asarray` of the per-element results, in input order.
    """
    outputs = [None] * len(data)

    def _run(slot, item):
        # Each worker writes only to its own slot, so input order is preserved.
        outputs[slot] = fn(item, **kwargs)

    workers = [
        threading.Thread(name='threading_and_return', target=_run, args=(idx, data[idx]))
        for idx in range(len(data))
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return np.asarray(outputs)

def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', cval=0., order=1):
    """Apply a 2-D affine transform to every channel of an image.

    Args:
        x: Image array with one channel axis and two spatial axes.
        transform_matrix: 3x3 homogeneous matrix; its upper-left 2x2 block is
            used as the affine matrix and its first two entries of the last
            column as the offset.
        channel_index: Position of the channel axis in `x` (negative values
            are supported).
        fill_mode: Passed to `scipy.ndimage.affine_transform` as `mode`.
        cval: Fill value forwarded to `affine_transform`.
        order: Spline interpolation order forwarded to `affine_transform`.

    Returns:
        The transformed image with the same axis layout as `x`.
    """
    # Bring the channel axis to the front so each 2-D plane is transformed on its own.
    planes = np.moveaxis(x, channel_index, 0)
    final_affine_matrix = transform_matrix[:2, :2]
    final_offset = transform_matrix[:2, 2]
    # Call the public `ndi.affine_transform`; the `ndi.interpolation`
    # sub-namespace used previously is deprecated.
    channel_images = [
        ndi.affine_transform(plane, final_affine_matrix, final_offset,
                             order=order, mode=fill_mode, cval=cval)
        for plane in planes
    ]
    stacked = np.stack(channel_images, axis=0)
    # moveaxis restores the original layout for negative indices too, where the
    # former rollaxis(x, 0, channel_index + 1) left the channels in front.
    return np.moveaxis(stacked, 0, channel_index)

def transform_matrix_offset_center(matrix, x, y):
    """Re-centre a 3x3 homogeneous transform around the point (x/2 + 0.5, y/2 + 0.5).

    Args:
        matrix: 3x3 homogeneous transform matrix.
        x: Extent along the first coordinate.
        y: Extent along the second coordinate.

    Returns:
        The 3x3 product ``offset @ matrix @ reset``, where ``reset`` translates
        by minus the centre and ``offset`` translates back.
    """
    center_x = float(x) / 2 + 0.5
    center_y = float(y) / 2 + 0.5
    to_center = np.array([[1, 0, center_x], [0, 1, center_y], [0, 0, 1]])
    from_center = np.array([[1, 0, -center_x], [0, 1, -center_y], [0, 0, 1]])
    return (to_center @ matrix) @ from_center

def rotation(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index=2,
                    fill_mode='nearest', cval=0.):
    """Rotate an image about its centre.

    Args:
        x: Image array.
        rg: Rotation angle in degrees; when `is_random` is true the angle is
            drawn uniformly from [-rg, rg) instead.
        is_random: Whether to sample the angle.
        row_index: Axis of `x` holding rows.
        col_index: Axis of `x` holding columns.
        channel_index: Axis of `x` holding channels.
        fill_mode: Border handling forwarded to `apply_transform`.
        cval: Fill value forwarded to `apply_transform`.

    Returns:
        The rotated image as produced by `apply_transform`.
    """
    degrees = np.random.uniform(-rg, rg) if is_random else rg
    theta = np.pi / 180 * degrees
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation_matrix = np.array([[cos_t, -sin_t, 0],
                                [sin_t, cos_t, 0],
                                [0, 0, 1]])
    n_rows, n_cols = x.shape[row_index], x.shape[col_index]
    # Rotate around the image centre rather than the array origin.
    centered = transform_matrix_offset_center(rotation_matrix, n_rows, n_cols)
    return apply_transform(x, centered, channel_index, fill_mode, cval)

def crop(x, wrg, hrg, is_random=False, row_index=0, col_index=1, channel_index=2):
    """Cut an (hrg, wrg) window out of an image, centred or at a random position.

    Args:
        x: Image array; rows and columns must be its first two axes because
            the window is taken with ``x[rows, cols]``.
        wrg: Width of the crop window.
        hrg: Height of the crop window.
        is_random: If true, pick the window position at random; otherwise
            take the centre crop.
        row_index: Axis used to read the image height.
        col_index: Axis used to read the image width.
        channel_index: Unused; kept for signature compatibility.

    Returns:
        The cropped view of `x` with `hrg` rows and `wrg` columns.

    Raises:
        AssertionError: If the window is not strictly smaller than the image.
    """
    h, w = x.shape[row_index], x.shape[col_index]
    assert (h > hrg) and (w > wrg), "The size of cropping should smaller than the original image"
    if is_random:
        # `uniform(...) - 1` is exactly -1 when the draw is 0.0; a negative
        # offset makes the slice start at the end of the axis and return an
        # empty or truncated crop, so clamp the offset at 0.
        h_offset = max(0, int(np.random.uniform(0, h - hrg) - 1))
        w_offset = max(0, int(np.random.uniform(0, w - wrg) - 1))
    else:
        h_offset = int(np.floor((h - hrg) / 2.))
        w_offset = int(np.floor((w - wrg) / 2.))
    return x[h_offset: h_offset + hrg, w_offset: w_offset + wrg]

def flip_axis(x, axis, is_random=False):
    """Reverse an array along `axis`, always or with a coin flip.

    Args:
        x: Array-like input.
        axis: Axis to reverse.
        is_random: If true, flip only when a draw from uniform(-1, 1) is
            positive; otherwise `x` is returned untouched.

    Returns:
        The flipped array, or `x` itself when the random draw says not to flip.
    """
    if is_random and not np.random.uniform(-1, 1) > 0:
        return x
    # Swap the target axis to the front, reverse it, then swap it back.
    flipped = np.asarray(x).swapaxes(axis, 0)
    flipped = flipped[::-1, ...]
    return flipped.swapaxes(0, axis)

def imresize(x, size=[100, 100], interp='bilinear', mode=None):
    """Resize a single-channel or RGB image with `scipy.misc.imresize`.

    `scipy.misc.imresize` is reported by this notebook's own output as
    deprecated in SciPy 1.0.0 and scheduled for removal in 1.3.0.

    Args:
        x: Image whose last axis has length 1 or 3.
        size: Target size forwarded to `scipy.misc.imresize`.
        interp: Interpolation name forwarded to `scipy.misc.imresize`.
        mode: Forwarded to `scipy.misc.imresize`.

    Returns:
        The resized image; single-channel input keeps its trailing axis.

    Raises:
        Exception: If the last axis is neither 1 nor 3.
    """
    n_channels = x.shape[-1]
    if n_channels not in (1, 3):
        raise Exception("Unsupported channel %d" % x.shape[-1])
    if n_channels == 3:
        return scipy.misc.imresize(x, size, interp=interp, mode=mode)
    # Single channel: resize the 2-D plane, then put the channel axis back.
    plane = scipy.misc.imresize(x[:, :, 0], size, interp=interp, mode=mode)
    return plane[:, :, np.newaxis]

def prepro_img(x, mode=None):
    """Augment and rescale one image for training; pass it through otherwise.

    With ``mode == 'train'`` the image is randomly flipped along axis 1,
    randomly rotated by up to 16 degrees, resized to 79x79, randomly cropped
    to 64x64 and finally mapped with ``x / 127.5 - 1``.

    Args:
        x: Input image.
        mode: ``'train'`` enables the pipeline; any other value returns `x` as is.

    Returns:
        The augmented image, or `x` unchanged when not in train mode.
    """
    if mode != 'train':
        return x
    x = flip_axis(x, axis=1, is_random=True)
    x = rotation(x, rg=16, is_random=True, fill_mode='nearest')
    # Resize slightly larger than the target so the random crop has room to move.
    x = imresize(x, size=[64 + 15, 64 + 15], interp='bilinear', mode=None)
    x = crop(x, wrg=64, hrg=64, is_random=True)
    return x / (255. / 2.) - 1.

def combine_and_save_image_sets(image_sets, directory):
    """Save side-by-side comparison strips, one file per image index.

    For every index i, the i-th image of each set is laid out left to right,
    each followed by a 5-pixel-wide black separator, and the strip is written
    to ``<directory>/combined_<i>.jpg``.

    Args:
        image_sets: Sequence of image collections; the first one determines
            how many strips are written.
        directory: Existing directory that receives the JPEG files.
    """
    for i in range(len(image_sets[0])):
        strip = []
        for image_set in image_sets:
            tile = image_set[i]
            separator = np.zeros((tile.shape[0], 5, 3))
            strip.extend([tile, separator])
        combined = np.concatenate(strip, axis = 1)
        target = os.path.join(directory, 'combined_{}.jpg'.format(i))
        scipy.misc.imsave(target, combined)

def save(saver, sess, logdir, step):
    """Write a checkpoint named 'model.ckpt' into `logdir`, creating it if needed.

    Args:
        saver: Object exposing ``save(sess, path, global_step=...)``.
        sess: Session handed through to ``saver.save``.
        logdir: Directory that receives the checkpoint files.
        step: Value passed to ``saver.save`` as ``global_step``.
    """
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    saver.save(sess, os.path.join(logdir, 'model.ckpt'), global_step=step)
    print('The checkpoint has been created.')

def load(saver, sess, ckpt_path):
    """Restore variables from `ckpt_path` into `sess` and report it.

    Args:
        saver: Object exposing ``restore(sess, path)``.
        sess: Session handed through to ``saver.restore``.
        ckpt_path: Checkpoint path to restore from.
    """
    saver.restore(sess, ckpt_path)
    message = "Restored model parameters from {}".format(ckpt_path)
    print(message)

In [8]:
# Where the per-epoch sample grids and the periodic checkpoints are written.
train_samples_dir = 'train_samples_last_gan'
if not os.path.exists(train_samples_dir):
    os.makedirs(train_samples_dir)
checkpoint_dir = './checkpoint_last_gan'

# Sizes shared by the model-building and training cells.
z_dim = 512
image_size = 64
c_dim = 3
batch_size = 64
ni = 8
sample_size = batch_size

# Fixed noise batch so the sample grids of different epochs are comparable.
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, z_dim)).astype(np.float32)

# Four reference captions, cycled twice; each one fills int(sample_size/ni)
# consecutive slots of the sample batch.
sample_sentence = [
    caption_text
    for caption_text in (
        "a black bird with oily black feathers and rounded black beak.",
        "a medium sized black bird, with a white belly, and webbed feet.",
        "this is a white bird with black webbed feet and a black beak.",
        "a small dully colored bird that has a grey head and nape, an oatmeal colored breast, belly and yellow and oatmeal-grey colored wings and tail.",
    ) * 2
    for _ in range(int(sample_size/ni))
]

# Replace every caption string with its id encoding, then pack into one matrix.
for i, sent in enumerate(sample_sentence):
    sample_sentence[i] = sent2ID(sent)
sample_sentence = np.reshape(np.asarray(sample_sentence), (sample_size, 20))
print(sample_sentence.shape)

(64, 20)


In [9]:
class Text2Img:
    """TF1 static graph for caption-conditioned image generation.

    Building an instance adds to the default graph:

    * placeholders for real/mismatched images, real/mismatched captions and noise;
    * a CNN/RNN matching loss (`rnn_loss`) with margin `alpha` on cosine similarities;
    * a conditional GAN with hinge discriminator loss (`d_loss`) and generator
      loss `g_loss = -mean(D(fake))`;
    * an inference-mode generator (`net_g`) sharing the training weights;
    * the three training ops `rnn_optim`, `d_optim` and `g_optim`.

    The placeholder shapes are derived only from this object's own attributes,
    so the class no longer depends on a notebook-level `image_size` variable.
    """
    def __init__(self):
        # --- Hyper-parameters ---
        self.lr = 2e-4
        self.z_dim = 512
        self.image_size = 64
        self.c_dim = 3
        self.batch_size = 64
        self.alpha = 0.2  # margin used in the image/caption ranking loss below

        # --- Placeholders ---
        self.t_real_image = tf.placeholder('float32', [self.batch_size, self.image_size, self.image_size, self.c_dim], name = 'real_image')
        self.t_wrong_image = tf.placeholder('float32', [self.batch_size, self.image_size, self.image_size, self.c_dim], name = 'wrong_image')
        self.t_real_caption = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='real_caption_input')
        self.t_wrong_caption = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='wrong_caption_input')
        self.t_z = tf.placeholder(tf.float32, [self.batch_size, self.z_dim], name='z_noise')

        # --- Training phase: CNN/RNN embedding matching ---
        net_cnn = CNN_ENCODER(self.t_real_image, is_training=True, reuse=False)
        x = net_cnn.outputs
        v = RNN_ENCODER(self.t_real_caption, is_training=True, reuse=False).outputs
        x_w = CNN_ENCODER(self.t_wrong_image, is_training=True, reuse=True).outputs
        v_w = RNN_ENCODER(self.t_wrong_caption, is_training=True, reuse=True).outputs
        # A matching (image, caption) pair must beat both a mismatched caption
        # and a mismatched image by at least `alpha` in cosine similarity.
        self.rnn_loss = tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x, v) + cosine_similarity(x, v_w))) + \
                    tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x, v) + cosine_similarity(x_w, v)))

        # --- Training phase: GAN ---
        self.net_rnn = RNN_ENCODER(self.t_real_caption, is_training=False, reuse=True)
        net_fake_image = GENERATOR(self.t_z, self.net_rnn.outputs, is_training=True, reuse=False)
        net_disc_fake = DISCRIMINATOR(net_fake_image.outputs, self.net_rnn.outputs, is_training=True, reuse=False)
        disc_fake_logits = net_disc_fake.logits
        net_disc_real = DISCRIMINATOR(self.t_real_image, self.net_rnn.outputs, is_training=True, reuse=True)
        disc_real_logits = net_disc_real.logits
        net_disc_mismatch = DISCRIMINATOR(self.t_real_image, RNN_ENCODER(self.t_wrong_caption, is_training=False, reuse=True).outputs,
                                        is_training=True, reuse=True)
        disc_mismatch_logits = net_disc_mismatch.logits
        # Hinge losses: real pairs are pushed above +1, mismatched and fake pairs below -1.
        # (An earlier sigmoid cross-entropy variant of these four losses was
        # kept here as an inert string literal; it was never executed.)
        d_loss1 = tf.reduce_mean(tf.nn.relu(1.0 - disc_real_logits))
        d_loss2 = tf.reduce_mean(tf.nn.relu(1.0 + disc_mismatch_logits))
        d_loss3 = tf.reduce_mean(tf.nn.relu(1.0 + disc_fake_logits))
        self.d_loss = d_loss1 + (d_loss2 + d_loss3) * 0.5
        self.g_loss = -tf.reduce_mean(disc_fake_logits)

        # --- Testing phase: generator in inference mode, sharing the trained weights ---
        self.net_g = GENERATOR(self.t_z, RNN_ENCODER(self.t_real_caption, is_training=False, reuse=True).outputs,
                            is_training=False, reuse=True)

        # --- Optimizers ---
        # Variables and update ops are grouped by a substring of their names
        # (presumably the scope names used inside utils.model -- TODO confirm).
        rnn_vars = [var for var in tf.trainable_variables() if 'rnnencoder' in var.name]
        cnn_vars = [var for var in tf.trainable_variables() if 'cnnencoder' in var.name]
        d_vars = [var for var in tf.trainable_variables() if 'discriminator' in var.name]
        g_vars = [var for var in tf.trainable_variables() if 'generator' in var.name]
        update_ops_CNN = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'cnnencoder' in var.name]
        update_ops_D = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'discriminator' in var.name]
        update_ops_G = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'generator' in var.name]
        with tf.variable_scope('learning_rate'):
            self.lr_v = tf.Variable(self.lr, trainable=False)
        with tf.control_dependencies(update_ops_CNN):
            # Encoder gradients are clipped to a global norm of 10 before being applied.
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.rnn_loss, rnn_vars + cnn_vars), 10)
            optimizer = tf.train.AdamOptimizer(self.lr_v, beta1=0.5)
            self.rnn_optim = optimizer.apply_gradients(zip(grads, rnn_vars + cnn_vars))
        with tf.control_dependencies(update_ops_D):
            self.d_optim = tf.train.AdamOptimizer(2e-4, beta1=0.0, beta2=0.9).minimize(self.d_loss, var_list=d_vars)
        with tf.control_dependencies(update_ops_G):
            self.g_optim = tf.train.AdamOptimizer(2e-4, beta1=0.0, beta2=0.9).minimize(self.g_loss, var_list=g_vars)

In [None]:
# Build a fresh graph and a session that grows GPU memory on demand.
tf.reset_default_graph()
model = Text2Img()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
sess = tf.Session(config=config)
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

# Resume from the latest periodic checkpoint if there is one. The epoch encoded
# in the checkpoint name ('model.ckpt-<epoch>') decides where training continues,
# so a resumed run does not fall back into the encoder pre-training phase.
start_epoch = 0
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
    loader = tf.train.Saver(var_list=tf.global_variables())
    load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    load(loader, sess, ckpt.model_checkpoint_path)
    start_epoch = load_step + 1
else:
    print('no checkpoints find.')

n_epoch = 2000
rnn_pretrain_epochs = 100   # the CNN/RNN encoders are only trained during these epochs
sample_every = 1            # write a sample grid every N epochs
checkpoint_every = 50       # write a periodic checkpoint every N epochs
n_batch_epoch = int(n_images_train / batch_size)

epoch = start_epoch - 1     # step used by the final save if no epoch is left to run
for epoch in range(start_epoch, n_epoch + 1):
    start_time = time.time()
    for step in range(n_batch_epoch):
        step_time = time.time()
        # Matching pairs: caption index // n_captions_per_image is the image index.
        idexs = get_random_int(min=0, max=n_captions_train-1, number=batch_size)
        b_real_caption = train_captions[idexs]
        b_real_images = train_images[np.floor(np.asarray(idexs).astype('float') / n_captions_per_image).astype('int')]
        # Mismatched captions and images are drawn independently at random.
        idexs = get_random_int(min=0, max=n_captions_train-1, number=batch_size)
        b_wrong_caption = train_captions[idexs]
        idexs2 = get_random_int(min=0, max=n_images_train-1, number=batch_size)
        b_wrong_images = train_images[idexs2]
        b_z = np.random.normal(loc=0.0, scale=1.0, size=(batch_size, z_dim)).astype(np.float32)
        b_real_images = threading_data(b_real_images, prepro_img, mode='train')
        b_wrong_images = threading_data(b_wrong_images, prepro_img, mode='train')
        if epoch < rnn_pretrain_epochs:
            errRNN, _ = sess.run([model.rnn_loss, model.rnn_optim], feed_dict={
                                            model.t_real_image : b_real_images,
                                            model.t_wrong_image : b_wrong_images,
                                            model.t_real_caption : b_real_caption,
                                            model.t_wrong_caption : b_wrong_caption})
        else:
            errRNN = 0
        errD, _ = sess.run([model.d_loss, model.d_optim], feed_dict={
                            model.t_real_image : b_real_images,
                            model.t_wrong_caption : b_wrong_caption,
                            model.t_real_caption : b_real_caption,
                            model.t_z : b_z})
        errG, _ = sess.run([model.g_loss, model.g_optim], feed_dict={
                            model.t_real_caption : b_real_caption,
                            model.t_z : b_z})
    # Losses and timing of the last step of the epoch.
    print("Epoch: [%d/%d] time: %4.4fs, d_loss: %.8f, g_loss: %.8f, rnn_loss: %.8f" \
                        % (epoch, n_epoch, time.time() - step_time, errD, errG, errRNN))
    if (epoch + 1) % sample_every == 0:
        print(" ** Epoch %d took %fs" % (epoch, time.time()-start_time))
        img_gen, rnn_out = sess.run([model.net_g.outputs, model.net_rnn.outputs], feed_dict={
                                        model.t_real_caption : sample_sentence,
                                        model.t_z : sample_seed})
        # Write into the directory created in the setup cell instead of a second hard-coded path.
        save_images(img_gen, [ni, ni], os.path.join(train_samples_dir, 'train_{:02d}.png'.format(epoch)))
    if (epoch != 0) and (epoch % checkpoint_every) == 0:
        save(saver, sess, checkpoint_dir, epoch)
        print("[*] Save checkpoints SUCCESS!")

# Final checkpoint in the location named by the config; create the directory first.
os.makedirs(cfg.CHECKPOINT_DIR, exist_ok=True)
checkpoint_path = os.path.join(cfg.CHECKPOINT_DIR, cfg.CHECKPOINT_NAME)
# NOTE(review): with global_step set, the files are written as
# '<CHECKPOINT_NAME>-<epoch>', while the evaluation functions restore from the
# path without a step suffix -- confirm the two agree before submitting.
saver.save(sess, checkpoint_path, global_step=epoch)
print('The checkpoint has been created.')

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
no checkpoints find.


`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.3.0.
Use Pillow instead: ``numpy.array(Image.fromarray(arr).resize())``.


Epoch: [0/2000] time: 0.6448s, d_loss: 2.41165090, g_loss: -0.52209276, rnn_loss: 0.29912937
 ** Epoch 0 took 107.678850s


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


Epoch: [1/2000] time: 0.6398s, d_loss: 1.82504320, g_loss: 0.91173375, rnn_loss: 0.27283305
 ** Epoch 1 took 87.974666s
Epoch: [2/2000] time: 0.6528s, d_loss: 1.97624636, g_loss: 0.79536945, rnn_loss: 0.29435828
 ** Epoch 2 took 89.082275s
Epoch: [3/2000] time: 0.6424s, d_loss: 1.90790009, g_loss: 0.36982119, rnn_loss: 0.29710805
 ** Epoch 3 took 88.648509s
Epoch: [4/2000] time: 0.6362s, d_loss: 1.95481372, g_loss: 0.23233420, rnn_loss: 0.22478952
 ** Epoch 4 took 88.208620s
Epoch: [5/2000] time: 0.6499s, d_loss: 1.90160251, g_loss: 0.23809411, rnn_loss: 0.21761549
 ** Epoch 5 took 87.960535s
Epoch: [6/2000] time: 0.6494s, d_loss: 1.78657043, g_loss: 0.30586457, rnn_loss: 0.21422032
 ** Epoch 6 took 88.150791s
Epoch: [7/2000] time: 0.6319s, d_loss: 2.15114570, g_loss: -0.17194062, rnn_loss: 0.25098729
 ** Epoch 7 took 88.000345s
Epoch: [8/2000] time: 0.6380s, d_loss: 1.58100545, g_loss: -1.11637771, rnn_loss: 0.18536358
 ** Epoch 8 took 87.879308s
Epoch: [9/2000] time: 0.6336s, d_loss:

Epoch: [68/2000] time: 0.6318s, d_loss: 0.93331707, g_loss: 1.12794232, rnn_loss: 0.16681527
 ** Epoch 68 took 87.164877s
Epoch: [69/2000] time: 0.6296s, d_loss: 1.10689425, g_loss: -0.01933331, rnn_loss: 0.13701554
 ** Epoch 69 took 87.621413s
Epoch: [70/2000] time: 0.6292s, d_loss: 1.65472376, g_loss: -0.17475992, rnn_loss: 0.15100332
 ** Epoch 70 took 87.144378s
Epoch: [71/2000] time: 0.6332s, d_loss: 0.92351770, g_loss: 0.34861565, rnn_loss: 0.13049668
 ** Epoch 71 took 87.138511s
Epoch: [72/2000] time: 0.6297s, d_loss: 1.03632307, g_loss: -0.38667688, rnn_loss: 0.15453096
 ** Epoch 72 took 87.228329s
Epoch: [73/2000] time: 0.6331s, d_loss: 0.89032412, g_loss: -0.05895571, rnn_loss: 0.10791834
 ** Epoch 73 took 87.732697s
Epoch: [74/2000] time: 0.6383s, d_loss: 0.92143804, g_loss: 1.35954428, rnn_loss: 0.13127509
 ** Epoch 74 took 87.344467s
Epoch: [75/2000] time: 0.6311s, d_loss: 1.08213806, g_loss: 0.95438057, rnn_loss: 0.11386327
 ** Epoch 75 took 87.400697s
Epoch: [76/2000] tim

Epoch: [134/2000] time: 0.5898s, d_loss: 1.74463665, g_loss: 0.06658374, rnn_loss: 0.00000000
 ** Epoch 134 took 81.641809s
Epoch: [135/2000] time: 0.5904s, d_loss: 1.10826671, g_loss: 1.79035294, rnn_loss: 0.00000000
 ** Epoch 135 took 81.758289s
Epoch: [136/2000] time: 0.5904s, d_loss: 0.82929718, g_loss: 1.83862329, rnn_loss: 0.00000000
 ** Epoch 136 took 81.536198s
Epoch: [137/2000] time: 0.5903s, d_loss: 0.97374743, g_loss: 1.68270350, rnn_loss: 0.00000000
 ** Epoch 137 took 81.662028s
Epoch: [138/2000] time: 0.5935s, d_loss: 0.47249654, g_loss: 0.47300038, rnn_loss: 0.00000000
 ** Epoch 138 took 81.509124s
Epoch: [139/2000] time: 0.5934s, d_loss: 0.83905131, g_loss: 1.81350148, rnn_loss: 0.00000000
 ** Epoch 139 took 81.787178s
Epoch: [140/2000] time: 0.5893s, d_loss: 0.36184788, g_loss: 0.69829857, rnn_loss: 0.00000000
 ** Epoch 140 took 81.413826s
Epoch: [141/2000] time: 0.5906s, d_loss: 0.62442529, g_loss: 0.61822486, rnn_loss: 0.00000000
 ** Epoch 141 took 81.735027s
Epoch: [

Epoch: [200/2000] time: 0.5952s, d_loss: 0.33186769, g_loss: 1.23734665, rnn_loss: 0.00000000
 ** Epoch 200 took 82.245413s
The checkpoint has been created.
[*] Save checkpoints SUCCESS!
Epoch: [201/2000] time: 0.5898s, d_loss: 0.18040176, g_loss: 2.53436160, rnn_loss: 0.00000000
 ** Epoch 201 took 81.941207s
Epoch: [202/2000] time: 0.5949s, d_loss: 0.61766064, g_loss: 2.68062115, rnn_loss: 0.00000000
 ** Epoch 202 took 82.187438s
Epoch: [203/2000] time: 0.5929s, d_loss: 1.10656798, g_loss: 1.81166744, rnn_loss: 0.00000000
 ** Epoch 203 took 82.194505s
Epoch: [204/2000] time: 0.5962s, d_loss: 0.73760515, g_loss: 3.64925241, rnn_loss: 0.00000000
 ** Epoch 204 took 82.697806s
Epoch: [205/2000] time: 0.5958s, d_loss: 0.75053614, g_loss: 1.84230316, rnn_loss: 0.00000000
 ** Epoch 205 took 82.191838s
Epoch: [206/2000] time: 0.5946s, d_loss: 0.15967757, g_loss: 2.28901958, rnn_loss: 0.00000000
 ** Epoch 206 took 81.978326s
Epoch: [207/2000] time: 0.6000s, d_loss: 0.91317111, g_loss: 2.613904

Epoch: [266/2000] time: 0.6076s, d_loss: 0.62846118, g_loss: 0.26538214, rnn_loss: 0.00000000
 ** Epoch 266 took 82.667799s
Epoch: [267/2000] time: 0.6014s, d_loss: 0.77930200, g_loss: 3.47288418, rnn_loss: 0.00000000
 ** Epoch 267 took 82.965747s
Epoch: [268/2000] time: 0.5922s, d_loss: 0.33471602, g_loss: 0.93282878, rnn_loss: 0.00000000
 ** Epoch 268 took 82.314738s
Epoch: [269/2000] time: 0.5955s, d_loss: 0.74607128, g_loss: 4.35686159, rnn_loss: 0.00000000
 ** Epoch 269 took 82.259747s
Epoch: [270/2000] time: 0.5963s, d_loss: 0.84189522, g_loss: 0.73834956, rnn_loss: 0.00000000
 ** Epoch 270 took 82.334436s
Epoch: [271/2000] time: 0.5884s, d_loss: 0.66490871, g_loss: 3.78460550, rnn_loss: 0.00000000
 ** Epoch 271 took 82.772235s
Epoch: [272/2000] time: 0.5974s, d_loss: 0.43351683, g_loss: 2.59260941, rnn_loss: 0.00000000
 ** Epoch 272 took 82.316700s
Epoch: [273/2000] time: 0.6035s, d_loss: 0.22562337, g_loss: 2.77558899, rnn_loss: 0.00000000
 ** Epoch 273 took 82.183624s
Epoch: [

Epoch: [332/2000] time: 0.5957s, d_loss: 0.78358150, g_loss: 0.42208704, rnn_loss: 0.00000000
 ** Epoch 332 took 82.501358s
Epoch: [333/2000] time: 0.5928s, d_loss: 0.11717428, g_loss: 3.26254511, rnn_loss: 0.00000000
 ** Epoch 333 took 82.367194s
Epoch: [334/2000] time: 0.5995s, d_loss: 0.26039308, g_loss: 2.74592733, rnn_loss: 0.00000000
 ** Epoch 334 took 82.347531s
Epoch: [335/2000] time: 0.6017s, d_loss: 1.07354796, g_loss: 2.72179556, rnn_loss: 0.00000000
 ** Epoch 335 took 82.646213s
Epoch: [336/2000] time: 0.5925s, d_loss: 0.11353476, g_loss: 3.19148397, rnn_loss: 0.00000000
 ** Epoch 336 took 82.351674s
Epoch: [337/2000] time: 0.5984s, d_loss: 0.43115282, g_loss: 4.46650410, rnn_loss: 0.00000000
 ** Epoch 337 took 82.295585s
Epoch: [338/2000] time: 0.5996s, d_loss: 0.88394755, g_loss: 3.29506373, rnn_loss: 0.00000000
 ** Epoch 338 took 81.867448s
Epoch: [339/2000] time: 0.5979s, d_loss: 1.08222270, g_loss: 3.36529684, rnn_loss: 0.00000000
 ** Epoch 339 took 82.682969s
Epoch: [

Epoch: [398/2000] time: 0.5915s, d_loss: 0.20178831, g_loss: 0.42894322, rnn_loss: 0.00000000
 ** Epoch 398 took 81.832989s
Epoch: [399/2000] time: 0.6052s, d_loss: 0.57507801, g_loss: 4.08513308, rnn_loss: 0.00000000
 ** Epoch 399 took 82.202709s
Epoch: [400/2000] time: 0.6074s, d_loss: 0.59442824, g_loss: 4.08554173, rnn_loss: 0.00000000
 ** Epoch 400 took 82.092720s
The checkpoint has been created.
[*] Save checkpoints SUCCESS!
Epoch: [401/2000] time: 0.5956s, d_loss: 0.58105898, g_loss: -0.27271831, rnn_loss: 0.00000000
 ** Epoch 401 took 81.984270s
Epoch: [402/2000] time: 0.5950s, d_loss: 0.36699152, g_loss: 4.09683132, rnn_loss: 0.00000000
 ** Epoch 402 took 81.948226s
Epoch: [403/2000] time: 0.6141s, d_loss: 0.03241655, g_loss: 1.96162391, rnn_loss: 0.00000000
 ** Epoch 403 took 82.220506s
Epoch: [404/2000] time: 0.5936s, d_loss: 0.40260017, g_loss: 4.47894573, rnn_loss: 0.00000000
 ** Epoch 404 took 82.135648s
Epoch: [405/2000] time: 0.5941s, d_loss: 0.84495294, g_loss: 3.82357

## 3. Evaluation metrics

In [None]:
def generate_r_precision_data():
    """Compute and store the features needed for the R-precision metric.

    For every full batch of test captions this runs the caption encoder on the
    true captions, generates images from them, runs the image encoder on the
    generated images, and runs the caption encoder on each of the
    ``cfg.WRONG_CAPTION`` mismatched captions. The three feature arrays are
    saved as ``true_cnn``, ``true_rnn`` and ``wrong_rnn`` in
    ``cfg.R_PRECISION_DIR/cfg.R_PRECISION_FILE``.

    Captions that do not fill a complete batch at the end are skipped.

    NOTE(review): `rnn_encoder`, `generator`, `cnn_encoder`, `t_real_caption`,
    `t_z` and `t_real_image` are not defined anywhere in the visible part of
    this notebook; they must be bound to the trained graph before calling this.
    """
    caption_ids = np.reshape(np.asarray(test_dataset.captions_ids), (-1, cfg.TEXT.WORDS_NUM))
    captions_ids_wrong = np.reshape(test_dataset.random_wrong_captions(), (-1, cfg.WRONG_CAPTION, cfg.TEXT.WORDS_NUM))

    n_caption_test = len(caption_ids)
    num_batches = n_caption_test // cfg.BATCH_SIZE

    true_cnn_features = np.zeros((num_batches, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)
    true_rnn_features = np.zeros((num_batches, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)
    wrong_rnn_features = np.zeros((num_batches, cfg.WRONG_CAPTION, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)

    # The context manager closes the session (and frees its resources) on exit.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        # load the trained checkpoint
        checkpoint_dir = cfg.CHECKPOINT_DIR
        if checkpoint_dir is not None:
            loader = tf.train.Saver(var_list=tf.global_variables())
            # `cfg.CHECKPOINT_NAME`, as in generate_inception_score_data; the
            # bare name `CHECKPOINT_NAME` raised NameError.
            ckpt_path = os.path.join(cfg.CHECKPOINT_DIR, cfg.CHECKPOINT_NAME)
            loader.restore(sess, ckpt_path)
            print("Restored model parameters from {}".format(ckpt_path))
        else:
            print('no checkpoints find.')

        for i in range(num_batches):
            batch = slice(i * cfg.BATCH_SIZE, (i + 1) * cfg.BATCH_SIZE)
            test_cap = caption_ids[batch]

            z = np.random.normal(loc=0.0, scale=1.0, size=(cfg.BATCH_SIZE, cfg.GAN.Z_DIM)).astype(np.float32)

            rnn_features = sess.run(rnn_encoder.outputs, feed_dict={t_real_caption: test_cap})
            gen = sess.run(generator.outputs, feed_dict={t_real_caption: test_cap, t_z: z})
            cnn_features = sess.run(cnn_encoder.outputs, feed_dict={t_real_image: gen})

            true_cnn_features[i] = cnn_features
            true_rnn_features[i] = rnn_features

            # The batch of mismatched captions is the same for every inner
            # iteration, so slice it once.
            wrong_cap = captions_ids_wrong[batch]
            for per_wrong_caption in range(cfg.WRONG_CAPTION):
                rnn_features = sess.run(rnn_encoder.outputs, feed_dict={t_real_caption: wrong_cap[:, per_wrong_caption]})
                wrong_rnn_features[i, per_wrong_caption] = rnn_features

    # if exists, remove the existing file first
    try:
        os.remove(os.path.join(cfg.R_PRECISION_DIR, cfg.R_PRECISION_FILE))
    except OSError:
        pass
    np.savez(os.path.join(cfg.R_PRECISION_DIR, cfg.R_PRECISION_FILE), true_cnn=true_cnn_features, true_rnn=true_rnn_features,
             wrong_rnn=wrong_rnn_features)

In [None]:
def generate_inception_score_data():
    """Generate one image per test caption and save them for the inception score.

    Test captions are grouped as (image, caption-per-image, words). For every
    full batch of images and every caption slot, a batch of images is generated
    and each one is written to
    ``cfg.TEST.GENERATED_TEST_IMAGES/<filename>_<caption slot>.png``, creating
    the per-class sub-directory (first path component of the filename) on demand.

    Images that do not fill a complete batch at the end are skipped.

    NOTE(review): `generator`, `t_real_caption` and `t_z` are not defined
    anywhere in the visible part of this notebook; they must be bound to the
    trained graph before calling this.
    """
    caption_ids = np.reshape(np.asarray(test_dataset.captions_ids),
                             (-1, cfg.TEXT.CAPTIONS_PER_IMAGE, cfg.TEXT.WORDS_NUM))

    n_caption_test = len(caption_ids)
    num_batches = n_caption_test // cfg.BATCH_SIZE

    # The context manager closes the session (and frees its resources) on exit.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        checkpoint_dir = cfg.CHECKPOINT_DIR
        if checkpoint_dir is not None:
            loader = tf.train.Saver(var_list=tf.global_variables())
            ckpt_path = os.path.join(cfg.CHECKPOINT_DIR, cfg.CHECKPOINT_NAME)
            loader.restore(sess, ckpt_path)
            print("Restored model parameters from {}".format(ckpt_path))
        else:
            print('no checkpoints find.')

        for i in range(num_batches):
            batch = slice(i * cfg.BATCH_SIZE, (i + 1) * cfg.BATCH_SIZE)
            # The filenames of a batch do not depend on the caption slot.
            test_directory = test_dataset.filenames[batch]

            for per_caption in range(cfg.TEXT.CAPTIONS_PER_IMAGE):
                test_cap = caption_ids[batch, per_caption]

                z = np.random.normal(loc=0.0, scale=1.0, size=(cfg.BATCH_SIZE, cfg.GAN.Z_DIM)).astype(np.float32)
                gen = sess.run(generator.outputs, feed_dict={t_real_caption: test_cap, t_z: z})

                for j in range(cfg.BATCH_SIZE):
                    class_dir = os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, test_directory[j].split('/')[0])
                    # makedirs also creates a missing output root, where os.mkdir would fail.
                    os.makedirs(class_dir, exist_ok=True)

                    scipy.misc.imsave(os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, test_directory[j] + '_{}.png'.format(per_caption)), gen[j])

In [None]:
# Write the R-precision feature file to cfg.R_PRECISION_DIR/cfg.R_PRECISION_FILE.
generate_r_precision_data()

In [None]:
# Write the generated test images under cfg.TEST.GENERATED_TEST_IMAGES.
generate_inception_score_data()

## 4. Measure Inception score and R-precision of given test dataset

After setting the config file to 'eval_birds.yml' and running 'generate_inception_score_data()' and 'generate_r_precision_data()', the images synthesized from the given captions and the set of image and caption features should be saved inside the 'evaluation' folder, specifically in 'evaluation/generated_images/..' and as 'evaluation/r_precision.npz' respectively.

**Then, go to the 'evaluation' folder and run 'inception_score.ipynb' and 'r_precision.ipynb' to measure the inception score and the R-precision score.**