# M2177.003100 Deep Learning <br> Final Project: Text to Image Synthesis (TensorFlow)

Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them. 

**To understand this work, please read the provided PPT file carefully.**

**Note**: certain details are missing or ambiguous on purpose, in order to test your knowledge of the related materials. However, if you really feel that something essential is missing and you cannot proceed to the next step, then contact the teaching staff with a clear description of your problem.

### Submitting your work:
<font color=red>**DO NOT clear the output of the training process**</font> so that TAs can grade both your code and results.  
**The TA will set a config file as 'eval_birds.yml' when evaluating the code using 'hidden test dataset'. Thus, please make sure that your code can generate proper data to measure inception score and R-precision of 'hidden test dataset'.**

## 1. Load datasets
The Birds dataset will be downloaded automatically if it is not located in the *data* directory. <br>

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os, nltk
from miscc.config import cfg, cfg_from_file
import pprint
import datetime
import dateutil.tz
import numpy as np
import scipy
from utils.data_utils import CUBDataset
from utils.loss import cosine_similarity
import pandas as pd
from scipy.io import loadmat
import re
import string
import random
import time

#################################################
# DO NOT CHANGE 
from utils.model import CNN_ENCODER, RNN_ENCODER, GENERATOR, DISCRIMINATOR
#################################################

%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the experiment configuration into the shared `cfg` object.
# Use 'train_birds.yml' for training and 'eval_birds.yml' for evaluation.
cfg_from_file('cfg/train_birds.yml') # eval_birds.yml

# Echo the effective configuration so it is recorded in the notebook output.
print('Using config:')
pprint.pprint(cfg)

# Restrict the visible GPUs to the one named in the config.
os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU_ID

# Build a timestamped output directory name (local time zone) for this run.
# NOTE(review): `output_dir` is only computed here; nothing in this section creates or uses it.
now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
output_dir = 'sample/%s_%s_%s' % (cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)

Using config:
{'BATCH_SIZE': 64,
 'CHECKPOINT_DIR': './checkpoint',
 'CHECKPOINT_NAME': 'model.ckpt',
 'CNN': {'EMBEDDING_DIM': 0, 'H_DIM': 0},
 'CONFIG_NAME': 'text-to-image',
 'CUDA': False,
 'DATASET_NAME': 'birds',
 'DATA_DIR': 'data/birds',
 'EMBEDDING_TYPE': 'cnn-rnn',
 'GAN': {'B_ATTENTION': False,
         'B_CONDITION': False,
         'B_DCGAN': False,
         'CONDITION_DIM': 0,
         'DF_DIM': 0,
         'EMBEDDING_DIM': 0,
         'GF_DIM': 0,
         'R_NUM': 0,
         'Z_DIM': 512},
 'GPU_ID': '0',
 'IMAGE_SIZE': 256,
 'NUM_BATCH_FOR_TEST': 0,
 'RANDOM_SEED': 0,
 'RNN': {'EMBEDDING_DIM': 0,
         'H_DIM': 0,
         'TYPE': '',
         'VOCAB_SIZE': 0,
         'WORD_EMBEDDING_DIM': 0},
 'R_PRECISION_DIR': './evaluation',
 'R_PRECISION_FILE': 'r_precision.npz',
 'R_PRECISION_FILE_HIDDEN': 'r_precision_hidden.npz',
 'TEST': {'B_EXAMPLE': False,
          'GENERATED_HIDDEN_TEST_IMAGES': './evaluation/generated_images_hidden',
          'GENERATED_TEST_IMAGES'

  yaml_cfg = edict(yaml.load(f))


In [3]:
# Build the train and test splits from the configured data directory.
train_dataset = CUBDataset(cfg.DATA_DIR, split='train')
test_dataset = CUBDataset(cfg.DATA_DIR, split='test')

# The remainder of this cell only prints a summary of what was loaded.
print(f'\ntrain data directory:\n{train_dataset.split_dir}')
print(f'test data directory:\n{test_dataset.split_dir}\n')

print(f'# of train filenames:{train_dataset.filenames.shape}')
print(f'# of test filenames:{test_dataset.filenames.shape}\n')

# NOTE(review): the second message says "valid image" but prints a test-split filename.
print(f'example of filename of train image:{train_dataset.filenames[0]}')
print(f'example of filename of valid image:{test_dataset.filenames[0]}\n')

# Show one caption next to its word-id encoding for each split.
print(f'example of caption and its ids:\n{train_dataset.captions[0]}\n{train_dataset.captions_ids[0]}\n')
print(f'example of caption and its ids:\n{test_dataset.captions[0]}\n{test_dataset.captions_ids[0]}\n')

print(f'# of train captions:{np.asarray(train_dataset.captions).shape}')
print(f'# of test captions:{np.asarray(test_dataset.captions).shape}\n')

print(f'# of train caption ids:{np.asarray(train_dataset.captions_ids).shape}')
print(f'# of test caption ids:{np.asarray(test_dataset.captions_ids).shape}\n')

print(f'# of train images:{train_dataset.images.shape}')
print(f'# of test images:{test_dataset.images.shape}\n')

self.current_dir:
/home/chszerg/final-project-deep-learning-19-tf

self.data_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds

self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011.tgz

Dataset already exists
self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011/images

Load from:  data/birds/captions.pickle
self.current_dir:
/home/chszerg/final-project-deep-learning-19-tf

self.data_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds

self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011.tgz

Dataset already exists
self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011/images

Load from:  data/birds/captions.pickle

train data directory:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/train
test data directory:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/test

# of train filenames:(8855,)
# of test f

In [4]:
# Pull the image arrays and caption-id arrays out of the dataset objects so the
# training loop can index them directly.
train_images = train_dataset.images
test_images = test_dataset.images
train_captions = np.asarray(train_dataset.captions_ids)
test_captions = np.asarray(test_dataset.captions_ids)
# Sanity check of the array shapes.
print(train_images.shape)
print(test_images.shape)
print(train_captions.shape)
print(test_captions.shape)

(8855, 256, 256, 3)
(2933, 256, 256, 3)
(88550, 20)
(29330, 20)


In [5]:
# Dataset sizes used by the training loop to sample batches.
n_captions_train = len(train_captions)
# The training loop maps caption index i to image index i // n_captions_per_image.
n_captions_per_image = 10
n_images_train = len(train_images)

In [6]:
import scipy.misc
import threading
import scipy.ndimage as ndi
from skimage import transform
from skimage import exposure
import skimage
from nltk.tokenize import RegexpTokenizer

def sent2ID(sample_sentence):
    """Convert a raw caption string into the word-id form produced by the dataset.

    The sentence is lower-cased, split into word-character tokens, stripped of
    non-ASCII characters, and every token found in ``train_dataset.wordtoix``
    is replaced by its id (tokens not in the vocabulary are dropped). The id
    list is then passed through ``train_dataset.get_caption``.

    Args:
        sample_sentence: caption text; must be non-empty.

    Returns:
        A one-element list holding the array returned by ``get_caption`` with
        axis 1 squeezed out.

    Raises:
        ValueError: if ``sample_sentence`` is empty.
    """
    if len(sample_sentence) == 0:
        # Raise a normal error instead of calling exit(), which would terminate
        # the notebook kernel rather than report the bad input.
        raise ValueError("sample_sentence must be a non-empty string")

    cap = sample_sentence.replace("\ufffd\ufffd", " ")
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(cap.lower())

    # Drop non-ASCII characters; skip tokens that become empty as a result.
    tokens_new = []
    for t in tokens:
        t = t.encode('ascii', 'ignore').decode('ascii')
        if len(t) > 0:
            tokens_new.append(t)

    # Keep only in-vocabulary words, mapped to their ids.
    rev = [train_dataset.wordtoix[w] for w in tokens_new if w in train_dataset.wordtoix]

    # The second return value of get_caption is not needed here.
    x, _ = train_dataset.get_caption(rev)
    return [np.squeeze(x, axis=1)]

def ID2sent(sample_caption):
    """Translate a sequence of word ids back into a list of words.

    Ids equal to ``train_dataset.ixtoword['<PAD>']`` are skipped; every other
    id is looked up in ``train_dataset.ixtoword``.

    NOTE(review): the padding lookup indexes ``ixtoword`` with the string
    '<PAD>' although the mapping is otherwise indexed by id here; it may have
    been meant to be ``wordtoix['<PAD>']`` -- confirm against the dataset class.
    """
    return [
        train_dataset.ixtoword[word_id]
        for word_id in sample_caption
        if word_id != train_dataset.ixtoword['<PAD>']
    ]

def get_random_int(min=0, max=10, number=5):
    """Return a list of ``number`` random integers drawn from [min, max] (both inclusive)."""
    draws = []
    for _ in range(number):
        draws.append(random.randint(min, max))
    return draws

def merge(images, size):
    """Tile a batch of images into one grid image.

    Args:
        images: array indexed as (image, row, column, channel).
        size: (grid_rows, grid_cols); images are placed row-major.

    Returns:
        A float array of shape (h * size[0], w * size[1], 3); cells without an
        image stay zero.
    """
    tile_h, tile_w = images.shape[1], images.shape[2]
    n_rows, n_cols = size[0], size[1]
    canvas = np.zeros((tile_h * n_rows, tile_w * n_cols, 3))
    for position, tile in enumerate(images):
        grid_row, grid_col = divmod(position, n_cols)
        top = grid_row * tile_h
        left = grid_col * tile_w
        canvas[top:top + tile_h, left:left + tile_w, :] = tile
    return canvas

def imsave(images, size, path):
    """Merge ``images`` into a ``size`` grid and write the result to ``path``.

    Note: relies on ``scipy.misc.imsave``, which the run log of this notebook
    reports as deprecated since SciPy 1.0.0.
    """
    grid = merge(images, size)
    return scipy.misc.imsave(path, grid)

def save_images(images, size, image_path):
    """Save a batch of images as a single grid image; thin wrapper around ``imsave``."""
    result = imsave(images, size, image_path)
    return result

def threading_data(data=None, fn=None, **kwargs):
    """Apply ``fn`` to every element of ``data`` using one thread per element.

    Args:
        data: indexable collection of inputs.
        fn: callable invoked as ``fn(item, **kwargs)``.
        **kwargs: extra keyword arguments forwarded to every call.

    Returns:
        ``np.asarray`` of the per-element results, in input order.
    """
    n_items = len(data)
    outputs = [None] * n_items

    def worker(slot, item):
        # Each thread writes only to its own slot, so result order matches input order.
        outputs[slot] = fn(item, **kwargs)

    workers = []
    for slot in range(n_items):
        thread = threading.Thread(
            name='threading_and_return',
            target=worker,
            args=(slot, data[slot]),
        )
        thread.start()
        workers.append(thread)

    # Wait for every worker before collecting results.
    for thread in workers:
        thread.join()
    return np.asarray(outputs)

def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', cval=0., order=1):
    """Apply a 2-D affine transform to every channel of an image.

    Args:
        x: image array whose channel axis is ``channel_index``.
        transform_matrix: 3x3 homogeneous matrix; the upper-left 2x2 block is
            used as the affine matrix and the first two entries of the last
            column as the offset.
        channel_index: position of the channel axis in ``x``.
        fill_mode: boundary mode forwarded to ``scipy.ndimage.affine_transform``.
        cval: fill value forwarded to ``scipy.ndimage.affine_transform``.
        order: spline interpolation order.

    Returns:
        The transformed image with the same axis layout as ``x``.
    """
    # Bring channels to the front so each 2-D channel can be transformed on its own.
    channels_first = np.moveaxis(x, channel_index, 0)
    final_affine_matrix = transform_matrix[:2, :2]
    final_offset = transform_matrix[:2, 2]
    # Call affine_transform from scipy.ndimage directly: the
    # scipy.ndimage.interpolation namespace is deprecated.
    channel_images = [
        ndi.affine_transform(x_channel, final_affine_matrix, final_offset,
                             order=order, mode=fill_mode, cval=cval)
        for x_channel in channels_first
    ]
    stacked = np.stack(channel_images, axis=0)
    # Put the channel axis back where it was.
    return np.moveaxis(stacked, 0, channel_index)

def transform_matrix_offset_center(matrix, x, y):
    """Re-centre a 3x3 transform so it acts about the point (x/2 + 0.5, y/2 + 0.5).

    Computes ``offset @ matrix @ reset``, where ``reset`` translates by the
    negated centre and ``offset`` translates back.
    """
    centre_x = float(x) / 2 + 0.5
    centre_y = float(y) / 2 + 0.5
    to_centre = np.array([[1, 0, centre_x],
                          [0, 1, centre_y],
                          [0, 0, 1]])
    from_centre = np.array([[1, 0, -centre_x],
                            [0, 1, -centre_y],
                            [0, 0, 1]])
    shifted = np.dot(to_centre, matrix)
    return np.dot(shifted, from_centre)

def rotation(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index=2,
                    fill_mode='nearest', cval=0.):
    """Rotate an image about its centre.

    Args:
        x: image array.
        rg: rotation angle in degrees; when ``is_random`` is true the angle is
            drawn uniformly from [-rg, rg) instead.
        is_random: whether to sample the angle.
        row_index, col_index, channel_index: axis positions in ``x``.
        fill_mode, cval: boundary handling forwarded to ``apply_transform``.

    Returns:
        The rotated image.
    """
    angle_deg = np.random.uniform(-rg, rg) if is_random else rg
    theta = np.pi / 180 * angle_deg
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation_matrix = np.array([[cos_t, -sin_t, 0],
                                [sin_t, cos_t, 0],
                                [0, 0, 1]])
    n_rows = x.shape[row_index]
    n_cols = x.shape[col_index]
    centred = transform_matrix_offset_center(rotation_matrix, n_rows, n_cols)
    return apply_transform(x, centred, channel_index, fill_mode, cval)

def crop(x, wrg, hrg, is_random=False, row_index=0, col_index=1, channel_index=2):
    """Crop an ``hrg`` x ``wrg`` window out of an image.

    Args:
        x: image array.
        wrg: width of the crop.
        hrg: height of the crop.
        is_random: if true the window position is sampled uniformly over all
            valid positions; otherwise the window is centred.
        row_index, col_index: axis positions of rows and columns in ``x``.
        channel_index: unused; kept for interface compatibility.

    Returns:
        The cropped view ``x[h0:h0 + hrg, w0:w0 + wrg]``.

    Raises:
        AssertionError: if the crop is not strictly smaller than the image.
    """
    h, w = x.shape[row_index], x.shape[col_index]
    assert (h > hrg) and (w > wrg), "The size of cropping should smaller than the original image"
    if is_random:
        # Sample an integer offset in [0, h - hrg] / [0, w - wrg]. The previous
        # int(uniform(0, h - hrg) - 1) could yield -1 (an empty or misplaced
        # crop) and never reached the last valid offsets.
        h_offset = np.random.randint(0, h - hrg + 1)
        w_offset = np.random.randint(0, w - wrg + 1)
    else:
        h_offset = int(np.floor((h - hrg) / 2.))
        w_offset = int(np.floor((w - wrg) / 2.))
    return x[h_offset: h_offset + hrg, w_offset: w_offset + wrg]

def flip_axis(x, axis, is_random=False):
    """Reverse ``x`` along ``axis``.

    With ``is_random`` set, a value is drawn from uniform(-1, 1) and the flip
    is applied only when it is positive; otherwise ``x`` is returned unchanged.
    Without ``is_random`` the flip is always applied.
    """
    if is_random and not (np.random.uniform(-1, 1) > 0):
        # Coin flip said "no": hand back the input untouched.
        return x
    # Swap the target axis to the front, reverse it, and swap it back.
    fronted = np.asarray(x).swapaxes(axis, 0)
    reversed_front = fronted[::-1, ...]
    return reversed_front.swapaxes(0, axis)

def imresize(x, size=[100, 100], interp='bilinear', mode=None):
    """Resize a 1- or 3-channel image with ``scipy.misc.imresize``.

    Single-channel input is resized as a 2-D array and returned with the
    trailing channel axis restored. Any other channel count raises.

    Note: the run log of this notebook reports ``scipy.misc.imresize`` as
    deprecated since SciPy 1.0.0.
    """
    n_channels = x.shape[-1]
    if n_channels == 3:
        return scipy.misc.imresize(x, size, interp=interp, mode=mode)
    if n_channels == 1:
        resized = scipy.misc.imresize(x[:, :, 0], size, interp=interp, mode=mode)
        return resized[:, :, np.newaxis]
    raise Exception("Unsupported channel %d" % x.shape[-1])

def prepro_img(x, mode=None):
    """Augment and normalise one image for training.

    For ``mode == 'train'`` the image is randomly flipped along axis 1,
    randomly rotated by up to 16 degrees, resized to 316x316, randomly cropped
    to 256x256 and rescaled with ``x / 127.5 - 1``. For any other mode the
    input is returned unchanged.
    """
    if mode != 'train':
        return x
    augmented = flip_axis(x, axis=1, is_random=True)
    augmented = rotation(augmented, rg=16, is_random=True, fill_mode='nearest')
    # Upscale first so the random crop below can shift the window.
    augmented = imresize(augmented, size=[256 + 60, 256 + 60], interp='bilinear', mode=None)
    augmented = crop(augmented, wrg=256, hrg=256, is_random=True)
    scaled = augmented / (255. / 2.)
    return scaled - 1.

def combine_and_save_image_sets(image_sets, directory):
    """Write one side-by-side comparison image per index.

    For index ``i`` the i-th image of every set is concatenated horizontally,
    each followed by a 5-pixel-wide black spacer, and saved as
    ``combined_<i>.jpg`` in ``directory``.
    """
    n_images = len(image_sets[0])
    for idx in range(n_images):
        strip = []
        for image_set in image_sets:
            tile = image_set[idx]
            spacer = np.zeros((tile.shape[0], 5, 3))
            strip.extend([tile, spacer])
        combined = np.concatenate(strip, axis=1)
        target = os.path.join(directory, 'combined_{}.jpg'.format(idx))
        scipy.misc.imsave(target, combined)

def save(saver, sess, logdir, step):
    """Write a checkpoint named ``model.ckpt`` under ``logdir`` for ``step``.

    Creates ``logdir`` if it does not exist, then calls
    ``saver.save(sess, <logdir>/model.ckpt, global_step=step)``.
    """
    target = os.path.join(logdir, 'model.ckpt')
    directory_missing = not os.path.exists(logdir)
    if directory_missing:
        os.makedirs(logdir)
    saver.save(sess, target, global_step=step)
    print('The checkpoint has been created.')

def load(saver, sess, ckpt_path):
    """Restore the session's variables from ``ckpt_path`` and report it."""
    saver.restore(sess, ckpt_path)
    message = "Restored model parameters from {}".format(ckpt_path)
    print(message)

In [7]:
# Directory that receives the per-epoch sample grids written during training.
train_samples_dir = 'train_samples_1217_b'
if os.path.exists(train_samples_dir) == False:
    os.makedirs(train_samples_dir)

# Training hyper-parameters read by the training loop.
# NOTE(review): Text2Img.__init__ hard-codes the same values; keep both in sync.
lr = 2e-4
lr_decay = 0.5      # multiplicative decay applied every `decay_every` epochs
decay_every = 80
beta1 = 0.5
checkpoint_dir = './checkpoint_1217_b'
z_dim = 512
image_size = 256
c_dim = 3
batch_size = 16
ni = 4  # number of distinct sample captions; also the grid side used when saving samples

# Fixed noise and captions so that sample images are comparable across epochs.
sample_size = batch_size
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, z_dim)).astype(np.float32)
# Each of the `ni` captions is repeated sample_size/ni times.
sample_sentence = ["a black bird with oily black feathers and rounded black beak."] * int(sample_size/ni) + \
                  ["a medium sized black bird, with a white belly, and webbed feet."] * int(sample_size/ni) + \
                  ["this is a white bird with black webbed feet and a black beak."] * int(sample_size/ni) + \
                  ["a small dully colored bird that has a grey head and nape, an oatmeal colored breast, belly and yellow and oatmeal-grey colored wings and tail."] * int(sample_size/ni)
# Replace every caption string by its word-id encoding.
for i, sent in enumerate(sample_sentence):
    sample_sentence[i] = sent2ID(sent)
sample_sentence = np.asarray(sample_sentence)
# Flatten to one row of 20 ids per sample.
sample_sentence = np.reshape(sample_sentence, (sample_size, 20))
print(sample_sentence.shape)

(16, 20)


In [8]:
class Text2Img:
    """TensorFlow 1.x graph for text-conditioned image generation.

    Building an instance creates, in the current default graph:

    * placeholders for real/wrong images, real/wrong captions and noise;
    * ``rnn_loss`` -- a margin loss on cosine similarity between the CNN
      encoding of the real image and the RNN encoding of its caption, against
      a wrong caption and a wrong image;
    * ``d_loss`` / ``g_loss`` -- sigmoid cross-entropy losses for the
      discriminator and the generator;
    * ``net_g`` -- the generator in inference mode;
    * ``rnn_optim`` / ``d_optim`` / ``g_optim`` -- Adam training ops sharing
      the non-trainable learning-rate variable ``lr_v``.
    """

    def __init__(self):
        """Build placeholders, networks, losses and optimizers."""
        # --- Information (hyper-parameters) ---
        self.lr = 2e-4
        self.lr_decay = 0.5
        self.decay_every = 80
        self.beta1 = 0.5
        self.z_dim = 512
        self.image_size = 256
        self.c_dim = 3
        self.batch_size = 16
        self.alpha = 0.2  # margin of the image/text matching loss

        # --- Place holders ---
        # Shapes come from the instance attributes rather than notebook-level
        # globals, so the class does not depend on another cell's variables.
        image_shape = [self.batch_size, self.image_size, self.image_size, self.c_dim]
        self.t_real_image = tf.placeholder('float32', image_shape, name = 'real_image')
        self.t_wrong_image = tf.placeholder('float32', image_shape, name = 'wrong_image')
        self.t_real_caption = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='real_caption_input')
        self.t_wrong_caption = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='wrong_caption_input')
        self.t_z = tf.placeholder(tf.float32, [self.batch_size, self.z_dim], name='z_noise')

        # --- Training phase: CNN - RNN mapping ---
        # The first call of each encoder creates the variables (reuse=False);
        # the following calls share them.
        net_cnn = CNN_ENCODER(self.t_real_image, is_training=True, reuse=False)
        x = net_cnn.outputs
        v = RNN_ENCODER(self.t_real_caption, is_training=True, reuse=False).outputs
        x_w = CNN_ENCODER(self.t_wrong_image, is_training=True, reuse=True).outputs
        v_w = RNN_ENCODER(self.t_wrong_caption, is_training=True, reuse=True).outputs
        # Penalise a mismatched pair that scores within `alpha` of the matching
        # pair: once with a wrong caption, once with a wrong image.
        self.rnn_loss = tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x, v) + cosine_similarity(x, v_w))) + \
                    tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x, v) + cosine_similarity(x_w, v)))

        # --- Training phase: GAN ---
        # The text encoder is used in inference mode when conditioning the GAN.
        self.net_rnn = RNN_ENCODER(self.t_real_caption, is_training=False, reuse=True)
        net_fake_image = GENERATOR(self.t_z, self.net_rnn.outputs, is_training=True, reuse=False)
        net_disc_fake = DISCRIMINATOR(net_fake_image.outputs, self.net_rnn.outputs, is_training=True, reuse=False)
        disc_fake_logits = net_disc_fake.logits
        net_disc_real = DISCRIMINATOR(self.t_real_image, self.net_rnn.outputs, is_training=True, reuse=True)
        disc_real_logits = net_disc_real.logits
        net_disc_mismatch = DISCRIMINATOR(self.t_real_image, RNN_ENCODER(self.t_wrong_caption, is_training=False, reuse=True).outputs,
                                        is_training=True, reuse=True)
        disc_mismatch_logits = net_disc_mismatch.logits
        # Discriminator targets: (real image, real caption) -> 1,
        # (real image, wrong caption) -> 0, (fake image, real caption) -> 0.
        d_loss1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_real_logits,     labels=tf.ones_like(disc_real_logits),      name='d1'))
        d_loss2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_mismatch_logits, labels=tf.zeros_like(disc_mismatch_logits), name='d2'))
        d_loss3 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_fake_logits,     labels=tf.zeros_like(disc_fake_logits),     name='d3'))
        self.d_loss = d_loss1 + (d_loss2 + d_loss3) * 0.5
        # The generator is trained to make the discriminator output 1 on fakes.
        self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_fake_logits, labels=tf.ones_like(disc_fake_logits), name='g'))

        # --- Testing phase ---
        self.net_g = GENERATOR(self.t_z, RNN_ENCODER(self.t_real_caption, is_training=False, reuse=True).outputs,
                            is_training=False, reuse=True)

        # --- Training ops ---
        # Variables and update ops are partitioned by a substring of their name.
        rnn_vars = [var for var in tf.trainable_variables() if 'rnnencoder' in var.name]
        cnn_vars = [var for var in tf.trainable_variables() if 'cnnencoder' in var.name]
        d_vars = [var for var in tf.trainable_variables() if 'discriminator' in var.name]
        g_vars = [var for var in tf.trainable_variables() if 'generator' in var.name]
        update_ops_CNN = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'cnnencoder' in var.name]
        update_ops_D = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'discriminator' in var.name]
        update_ops_G = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'generator' in var.name]
        with tf.variable_scope('learning_rate'):
            # Non-trainable so the training loop can reassign it for decay.
            self.lr_v = tf.Variable(self.lr, trainable=False)
        with tf.control_dependencies(update_ops_CNN):
            # Encoder gradients are clipped to a global norm of 10.
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.rnn_loss, rnn_vars + cnn_vars), 10)
            optimizer = tf.train.AdamOptimizer(self.lr_v, beta1=self.beta1)
            self.rnn_optim = optimizer.apply_gradients(zip(grads, rnn_vars + cnn_vars))
        with tf.control_dependencies(update_ops_D):
            self.d_optim = tf.train.AdamOptimizer(self.lr_v, beta1=self.beta1).minimize(self.d_loss, var_list=d_vars)
        with tf.control_dependencies(update_ops_G):
            self.g_optim = tf.train.AdamOptimizer(self.lr_v, beta1=self.beta1).minimize(self.g_loss, var_list=g_vars)

In [None]:
# Start from a clean graph so that re-running this cell does not collide with
# variables left over from an earlier run.
tf.reset_default_graph()
model = Text2Img()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
sess = tf.Session(config=config)
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

# Restore the latest checkpoint from `checkpoint_dir`, if there is one.
# NOTE(review): `load_step` is parsed but the epoch loop below still starts at
# 0, so a restored run repeats the full schedule -- confirm this is intended.
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
    loader = tf.train.Saver(var_list=tf.global_variables())
    load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    load(loader, sess, ckpt.model_checkpoint_path)
else:
    print('no checkpoints find.')

n_epoch = 300
n_batch_epoch = int(n_images_train / batch_size)
# Log roughly ten times per epoch; never let the interval drop to zero, which
# would make the modulo below raise ZeroDivisionError on very small datasets.
log_every = max(1, n_batch_epoch // 10)
for epoch in range(n_epoch):
    start_time = time.time()
    # Step-wise learning-rate decay: multiply by lr_decay every decay_every epochs.
    if epoch !=0 and (epoch % decay_every == 0):
        new_lr_decay = lr_decay ** (epoch // decay_every)
        sess.run(tf.assign(model.lr_v, lr * new_lr_decay))
        log = " ** new learning rate: %f" % (lr * new_lr_decay)
        print(log)
    elif epoch == 0:
        log = " ** init lr: %f  decay_every_epoch: %d, lr_decay: %f" % (lr, decay_every, lr_decay)
        print(log)
    for step in range(n_batch_epoch):
        step_time = time.time()
        # Matching pairs: sample captions, then take the image each caption belongs to.
        idexs = get_random_int(min=0, max=n_captions_train-1, number=batch_size)
        b_real_caption = train_captions[idexs]
        b_real_images = train_images[np.floor(np.asarray(idexs).astype('float') / n_captions_per_image).astype('int')]
        # "Wrong" captions and images are drawn independently at random.
        idexs = get_random_int(min=0, max=n_captions_train-1, number=batch_size)
        b_wrong_caption = train_captions[idexs]
        idexs2 = get_random_int(min=0, max=n_images_train-1, number=batch_size)
        b_wrong_images = train_images[idexs2]
        b_z = np.random.normal(loc=0.0, scale=1.0, size=(batch_size, z_dim)).astype(np.float32)
        # Augment and normalise the image batches, one thread per image.
        b_real_images = threading_data(b_real_images, prepro_img, mode='train')
        b_wrong_images = threading_data(b_wrong_images, prepro_img, mode='train')
        # The image/text encoders are only updated during the first 160 epochs.
        if epoch < 160:
            errRNN, _ = sess.run([model.rnn_loss, model.rnn_optim], feed_dict={
                                            model.t_real_image : b_real_images,
                                            model.t_wrong_image : b_wrong_images,
                                            model.t_real_caption : b_real_caption,
                                            model.t_wrong_caption : b_wrong_caption})
        else:
            errRNN = 0
        # One discriminator update followed by one generator update.
        errD, _ = sess.run([model.d_loss, model.d_optim], feed_dict={
                            model.t_real_image : b_real_images,
                            model.t_wrong_caption : b_wrong_caption,
                            model.t_real_caption : b_real_caption,
                            model.t_z : b_z})
        errG, _ = sess.run([model.g_loss, model.g_optim], feed_dict={
                            model.t_real_caption : b_real_caption,
                            model.t_z : b_z})
        if (step + 1) % log_every == 0:
            print("step: [%d/%d] time: %4.4fs, d_loss: %.8f, g_loss: %.8f, rnn_loss: %.8f" \
                                % (step, n_batch_epoch, time.time() - step_time, errD, errG, errRNN))
    if (epoch + 1) % 1 == 0:
        print(" ** Epoch %d took %fs" % (epoch, time.time()-start_time))
        # Render the fixed sample captions/noise to track progress across epochs.
        img_gen, rnn_out = sess.run([model.net_g.outputs, model.net_rnn.outputs], feed_dict={
                                        model.t_real_caption : sample_sentence,
                                        model.t_z : sample_seed})
        # Write into the directory created earlier instead of repeating its name.
        save_images(img_gen, [ni, ni], os.path.join(train_samples_dir, 'train_{:02d}.png'.format(epoch)))
    if (epoch != 0) and (epoch % 20) == 0:
        save(saver, sess, checkpoint_dir, epoch)
        print("[*] Save checkpoints SUCCESS!")
# Final checkpoint goes to the location named in the config. Create the
# directory first: unlike the save() helper, saver.save does not create it, and
# a failure here would lose the whole training run.
os.makedirs(cfg.CHECKPOINT_DIR, exist_ok=True)
checkpoint_path = os.path.join(cfg.CHECKPOINT_DIR, cfg.CHECKPOINT_NAME)
saver.save(sess, checkpoint_path, global_step=epoch)
print('The checkpoint has been created.')

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
no checkpoints find.
 ** init lr: 0.000200  decay_every_epoch: 80, lr_decay: 0.500000


`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.3.0.
Use Pillow instead: ``numpy.array(Image.fromarray(arr).resize())``.


step: [54/553] time: 1.0883s, d_loss: 1.38705206, g_loss: 2.33708286, rnn_loss: 0.39775246
step: [109/553] time: 1.1793s, d_loss: 1.40180385, g_loss: 2.37959051, rnn_loss: 0.41016191
step: [164/553] time: 1.1602s, d_loss: 1.22278309, g_loss: 3.04673719, rnn_loss: 0.41547209
step: [219/553] time: 1.1197s, d_loss: 1.32913947, g_loss: 1.37585688, rnn_loss: 0.34253895
step: [274/553] time: 1.1239s, d_loss: 1.48694444, g_loss: 0.89430153, rnn_loss: 0.37854373
step: [329/553] time: 1.1006s, d_loss: 1.39764833, g_loss: 1.29283631, rnn_loss: 0.25592062
step: [384/553] time: 1.0748s, d_loss: 1.52042508, g_loss: 0.85741961, rnn_loss: 0.30551744
step: [439/553] time: 1.1590s, d_loss: 1.42676139, g_loss: 1.05569386, rnn_loss: 0.35273051
step: [494/553] time: 1.0931s, d_loss: 1.49748445, g_loss: 3.45876074, rnn_loss: 0.31753987
step: [549/553] time: 1.0401s, d_loss: 1.23707390, g_loss: 2.00257349, rnn_loss: 0.31394351
 ** Epoch 0 took 651.239169s


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


step: [54/553] time: 1.1203s, d_loss: 1.59152234, g_loss: 1.62561071, rnn_loss: 0.32082531
step: [109/553] time: 1.0630s, d_loss: 1.45342231, g_loss: 1.13815093, rnn_loss: 0.22740696
step: [164/553] time: 1.1392s, d_loss: 1.33295417, g_loss: 2.22067022, rnn_loss: 0.36073905
step: [219/553] time: 1.1728s, d_loss: 1.31358576, g_loss: 1.69726622, rnn_loss: 0.42920411
step: [274/553] time: 1.1479s, d_loss: 1.62550175, g_loss: 1.50648594, rnn_loss: 0.41849706
step: [329/553] time: 1.1384s, d_loss: 1.09215355, g_loss: 2.29267073, rnn_loss: 0.31957024
step: [384/553] time: 1.1188s, d_loss: 1.17970443, g_loss: 1.81818390, rnn_loss: 0.28850502
step: [439/553] time: 1.0654s, d_loss: 1.65111971, g_loss: 1.80784965, rnn_loss: 0.32400593
step: [494/553] time: 1.0639s, d_loss: 1.01173282, g_loss: 2.97288942, rnn_loss: 0.33005697
step: [549/553] time: 1.1143s, d_loss: 1.03128195, g_loss: 3.07012033, rnn_loss: 0.26643181
 ** Epoch 1 took 623.828323s
step: [54/553] time: 1.1081s, d_loss: 1.23430276, g_

step: [439/553] time: 1.0997s, d_loss: 0.70003021, g_loss: 1.00389922, rnn_loss: 0.16693904
step: [494/553] time: 1.1639s, d_loss: 1.17514050, g_loss: 1.34944046, rnn_loss: 0.24238808
step: [549/553] time: 1.1266s, d_loss: 0.49326220, g_loss: 3.26184893, rnn_loss: 0.19770797
 ** Epoch 9 took 625.747466s
step: [54/553] time: 1.1142s, d_loss: 1.87345552, g_loss: 1.53210413, rnn_loss: 0.15464528
step: [109/553] time: 1.1793s, d_loss: 0.78972495, g_loss: 2.02489138, rnn_loss: 0.18320416
step: [164/553] time: 1.1570s, d_loss: 1.39493406, g_loss: 3.08536673, rnn_loss: 0.22798181
step: [219/553] time: 1.1143s, d_loss: 0.55716228, g_loss: 2.88692713, rnn_loss: 0.25208831
step: [274/553] time: 1.1639s, d_loss: 1.41824663, g_loss: 0.76048839, rnn_loss: 0.19994226
step: [329/553] time: 1.2401s, d_loss: 0.69169760, g_loss: 2.70644093, rnn_loss: 0.24406528
step: [384/553] time: 1.1880s, d_loss: 0.28116792, g_loss: 2.10066581, rnn_loss: 0.21540102
step: [439/553] time: 1.0952s, d_loss: 0.84401226, g

step: [274/553] time: 1.0933s, d_loss: 0.50810707, g_loss: 2.83283257, rnn_loss: 0.12709168
step: [329/553] time: 1.1882s, d_loss: 0.78469259, g_loss: 0.89090025, rnn_loss: 0.19501498
step: [384/553] time: 1.1374s, d_loss: 0.67215109, g_loss: 2.44412279, rnn_loss: 0.19060840
step: [439/553] time: 1.0843s, d_loss: 0.14133199, g_loss: 4.73547697, rnn_loss: 0.09170498
step: [494/553] time: 1.1566s, d_loss: 2.03409266, g_loss: 0.24171549, rnn_loss: 0.13137811
step: [549/553] time: 1.0980s, d_loss: 0.37840790, g_loss: 4.09710979, rnn_loss: 0.11918177
 ** Epoch 18 took 614.188199s
step: [54/553] time: 1.0824s, d_loss: 0.21985233, g_loss: 2.19679451, rnn_loss: 0.11280283
step: [109/553] time: 1.0811s, d_loss: 0.34114468, g_loss: 2.68562841, rnn_loss: 0.07294194
step: [164/553] time: 1.1632s, d_loss: 1.35287404, g_loss: 1.89048016, rnn_loss: 0.15313247
step: [219/553] time: 1.0907s, d_loss: 0.72429633, g_loss: 3.77879310, rnn_loss: 0.19281697
step: [274/553] time: 1.0957s, d_loss: 0.25220945, 

 ** Epoch 26 took 622.200602s
step: [54/553] time: 1.1401s, d_loss: 0.36114478, g_loss: 3.30445766, rnn_loss: 0.17201045
step: [109/553] time: 1.1532s, d_loss: 0.47916013, g_loss: 4.15983486, rnn_loss: 0.06942438
step: [164/553] time: 1.1128s, d_loss: 0.61543113, g_loss: 4.76545715, rnn_loss: 0.20744178
step: [219/553] time: 1.1247s, d_loss: 0.30364072, g_loss: 2.36711383, rnn_loss: 0.08983407
step: [274/553] time: 1.1161s, d_loss: 0.62864381, g_loss: 4.54120493, rnn_loss: 0.15475062
step: [329/553] time: 1.1702s, d_loss: 0.26278439, g_loss: 4.85251808, rnn_loss: 0.17785177
step: [384/553] time: 1.0476s, d_loss: 0.46184981, g_loss: 4.90346909, rnn_loss: 0.10858870
step: [439/553] time: 1.1450s, d_loss: 0.16138335, g_loss: 3.59154844, rnn_loss: 0.08557019
step: [494/553] time: 1.0655s, d_loss: 0.25810504, g_loss: 6.28621674, rnn_loss: 0.07678013
step: [549/553] time: 1.1093s, d_loss: 0.63640499, g_loss: 3.61862803, rnn_loss: 0.23555005
 ** Epoch 27 took 620.423441s
step: [54/553] time: 

step: [439/553] time: 1.1465s, d_loss: 0.46757692, g_loss: 2.94859076, rnn_loss: 0.13107812
step: [494/553] time: 1.0591s, d_loss: 0.99767947, g_loss: 3.87669706, rnn_loss: 0.12469649
step: [549/553] time: 1.1115s, d_loss: 0.37915301, g_loss: 5.26974392, rnn_loss: 0.20506689
 ** Epoch 35 took 623.194785s
step: [54/553] time: 1.1304s, d_loss: 0.15011901, g_loss: 3.76051664, rnn_loss: 0.08691207
step: [109/553] time: 1.1548s, d_loss: 0.15914884, g_loss: 4.62452221, rnn_loss: 0.22891343
step: [164/553] time: 1.1735s, d_loss: 0.19211984, g_loss: 5.55153370, rnn_loss: 0.25615343
step: [219/553] time: 1.1447s, d_loss: 0.18458055, g_loss: 3.55968380, rnn_loss: 0.14217053
step: [274/553] time: 1.1262s, d_loss: 0.15518099, g_loss: 3.46976733, rnn_loss: 0.09222329
step: [329/553] time: 1.0909s, d_loss: 0.11025024, g_loss: 5.35719776, rnn_loss: 0.13234779
step: [384/553] time: 1.1122s, d_loss: 0.13502529, g_loss: 4.45042086, rnn_loss: 0.16232164
step: [439/553] time: 1.1389s, d_loss: 0.43768555, 

step: [219/553] time: 1.0990s, d_loss: 1.49093318, g_loss: 5.81655455, rnn_loss: 0.18731730
step: [274/553] time: 1.1578s, d_loss: 0.12075916, g_loss: 2.80299997, rnn_loss: 0.22481544
step: [329/553] time: 1.0754s, d_loss: 0.27003527, g_loss: 3.74594259, rnn_loss: 0.11344445
step: [384/553] time: 1.1293s, d_loss: 0.26935437, g_loss: 5.62710810, rnn_loss: 0.14604051
step: [439/553] time: 1.0369s, d_loss: 0.60750091, g_loss: 4.14641857, rnn_loss: 0.17096931
step: [494/553] time: 1.1371s, d_loss: 0.10606676, g_loss: 4.02872276, rnn_loss: 0.13905196
step: [549/553] time: 1.0502s, d_loss: 0.46994638, g_loss: 5.14419842, rnn_loss: 0.26606834
 ** Epoch 44 took 610.059262s
step: [54/553] time: 1.0912s, d_loss: 0.53564399, g_loss: 6.27490044, rnn_loss: 0.13789123
step: [109/553] time: 1.1025s, d_loss: 0.18390572, g_loss: 2.80620241, rnn_loss: 0.16462666
step: [164/553] time: 1.1064s, d_loss: 0.58504754, g_loss: 5.16242313, rnn_loss: 0.17378914
step: [219/553] time: 1.0428s, d_loss: 0.25307170, 

 ** Epoch 52 took 614.755042s
step: [54/553] time: 1.1400s, d_loss: 0.12795842, g_loss: 2.00323296, rnn_loss: 0.17842087
step: [109/553] time: 1.1137s, d_loss: 0.11075994, g_loss: 2.92646718, rnn_loss: 0.16196865
step: [164/553] time: 1.1491s, d_loss: 0.26028189, g_loss: 7.75607967, rnn_loss: 0.10364932
step: [219/553] time: 1.2100s, d_loss: 0.13190362, g_loss: 5.01979256, rnn_loss: 0.03807469
step: [274/553] time: 1.1253s, d_loss: 0.37158963, g_loss: 6.13444567, rnn_loss: 0.13675097
step: [329/553] time: 1.0972s, d_loss: 0.11060577, g_loss: 3.27454519, rnn_loss: 0.17352235
step: [384/553] time: 1.1111s, d_loss: 0.01052710, g_loss: 8.65362167, rnn_loss: 0.10426150
step: [439/553] time: 1.1317s, d_loss: 0.06399741, g_loss: 3.26033354, rnn_loss: 0.21085273
step: [494/553] time: 1.1462s, d_loss: 0.30294192, g_loss: 5.87197399, rnn_loss: 0.13479081
step: [549/553] time: 1.1583s, d_loss: 0.09920631, g_loss: 3.61588407, rnn_loss: 0.03312340
 ** Epoch 53 took 611.568812s
step: [54/553] time: 

step: [384/553] time: 1.1210s, d_loss: 0.12887579, g_loss: 9.22751427, rnn_loss: 0.14420411
step: [439/553] time: 1.1267s, d_loss: 0.14085704, g_loss: 3.97485375, rnn_loss: 0.04304415
step: [494/553] time: 1.1230s, d_loss: 0.08559012, g_loss: 4.90691662, rnn_loss: 0.04822606
step: [549/553] time: 1.0572s, d_loss: 0.17557904, g_loss: 4.27266407, rnn_loss: 0.10463867
 ** Epoch 61 took 613.942306s
step: [54/553] time: 1.1384s, d_loss: 0.50990605, g_loss: 3.25266552, rnn_loss: 0.10464712
step: [109/553] time: 1.1119s, d_loss: 0.12600224, g_loss: 4.75193882, rnn_loss: 0.16195667
step: [164/553] time: 1.0879s, d_loss: 0.29363936, g_loss: 2.97237062, rnn_loss: 0.08551756
step: [219/553] time: 1.0846s, d_loss: 0.22568159, g_loss: 2.53418732, rnn_loss: 0.11687734
step: [274/553] time: 1.0767s, d_loss: 0.32009849, g_loss: 5.75840044, rnn_loss: 0.14245069
step: [329/553] time: 1.0987s, d_loss: 0.05218803, g_loss: 4.77447510, rnn_loss: 0.07700042
step: [384/553] time: 1.1387s, d_loss: 0.03741790, 

step: [219/553] time: 1.0363s, d_loss: 0.35391682, g_loss: 8.67548561, rnn_loss: 0.12321776
step: [274/553] time: 1.1053s, d_loss: 0.30954057, g_loss: 4.11697006, rnn_loss: 0.22572039
step: [329/553] time: 1.0623s, d_loss: 0.18921337, g_loss: 5.48306847, rnn_loss: 0.05057829
step: [384/553] time: 1.0861s, d_loss: 0.79551917, g_loss: 1.44367707, rnn_loss: 0.19589119
step: [439/553] time: 1.0659s, d_loss: 0.03164611, g_loss: 9.48096085, rnn_loss: 0.14052048
step: [494/553] time: 1.1653s, d_loss: 0.18663324, g_loss: 3.78264546, rnn_loss: 0.06226755
step: [549/553] time: 1.1451s, d_loss: 1.53490341, g_loss: 6.16805840, rnn_loss: 0.15233725
 ** Epoch 70 took 606.718659s
step: [54/553] time: 1.0784s, d_loss: 0.08043879, g_loss: 5.60106277, rnn_loss: 0.04939846
step: [109/553] time: 1.1557s, d_loss: 0.38688889, g_loss: 5.16584015, rnn_loss: 0.17420584
step: [164/553] time: 1.0973s, d_loss: 0.14847413, g_loss: 2.93566942, rnn_loss: 0.18805940
step: [219/553] time: 1.1023s, d_loss: 0.44590604, 

 ** Epoch 78 took 603.795231s
step: [54/553] time: 1.0961s, d_loss: 0.02776862, g_loss: 4.52806187, rnn_loss: 0.06675075
step: [109/553] time: 1.1004s, d_loss: 0.04729906, g_loss: 4.59502792, rnn_loss: 0.03766588
step: [164/553] time: 1.0338s, d_loss: 0.21768245, g_loss: 4.93410826, rnn_loss: 0.15894610
step: [219/553] time: 1.1098s, d_loss: 1.52419519, g_loss: 0.12348305, rnn_loss: 0.14648047
step: [274/553] time: 1.0837s, d_loss: 0.16750506, g_loss: 9.52077961, rnn_loss: 0.06385315
step: [329/553] time: 1.1859s, d_loss: 0.02729553, g_loss: 6.61132145, rnn_loss: 0.10553037
step: [384/553] time: 1.0277s, d_loss: 0.18684706, g_loss: 4.34644938, rnn_loss: 0.24784532
step: [439/553] time: 1.1583s, d_loss: 0.19538146, g_loss: 2.93559861, rnn_loss: 0.09688410
step: [494/553] time: 0.9841s, d_loss: 0.16673419, g_loss: 5.62372303, rnn_loss: 0.12362865
step: [549/553] time: 1.1125s, d_loss: 0.16693145, g_loss: 10.02689171, rnn_loss: 0.11736253
 ** Epoch 79 took 601.014220s
 ** new learning rat

step: [384/553] time: 1.0587s, d_loss: 0.00509893, g_loss: 6.91237068, rnn_loss: 0.08184418
step: [439/553] time: 1.0932s, d_loss: 0.05897102, g_loss: 4.45598221, rnn_loss: 0.24328756
step: [494/553] time: 1.0729s, d_loss: 0.21522698, g_loss: 5.78236294, rnn_loss: 0.06866859
step: [549/553] time: 1.1550s, d_loss: 0.02195061, g_loss: 5.73575687, rnn_loss: 0.06985673
 ** Epoch 87 took 608.344044s
step: [54/553] time: 1.0151s, d_loss: 0.05601602, g_loss: 3.97740984, rnn_loss: 0.17791638
step: [109/553] time: 1.1137s, d_loss: 0.03513929, g_loss: 5.07405376, rnn_loss: 0.13645625
step: [164/553] time: 1.1206s, d_loss: 0.07348637, g_loss: 7.95953560, rnn_loss: 0.12105651
step: [219/553] time: 1.0827s, d_loss: 0.04065099, g_loss: 2.67323232, rnn_loss: 0.12760487
step: [274/553] time: 1.0609s, d_loss: 0.99633902, g_loss: 3.86695123, rnn_loss: 0.09250172
step: [329/553] time: 1.0530s, d_loss: 0.08814814, g_loss: 5.41026258, rnn_loss: 0.08854595
step: [384/553] time: 1.1226s, d_loss: 0.09345616, 

step: [219/553] time: 1.1249s, d_loss: 0.50285649, g_loss: 7.69290638, rnn_loss: 0.06055969
step: [274/553] time: 1.0945s, d_loss: 0.04007732, g_loss: 3.69050050, rnn_loss: 0.14482872
step: [329/553] time: 1.1393s, d_loss: 0.04886438, g_loss: 4.37130499, rnn_loss: 0.10121408
step: [384/553] time: 1.1807s, d_loss: 1.20846558, g_loss: 1.23762453, rnn_loss: 0.13331589
step: [439/553] time: 1.0650s, d_loss: 0.11469346, g_loss: 3.60234642, rnn_loss: 0.12697160
step: [494/553] time: 1.0767s, d_loss: 0.00491094, g_loss: 6.19402361, rnn_loss: 0.03186202
step: [549/553] time: 1.1126s, d_loss: 0.01497525, g_loss: 5.91199732, rnn_loss: 0.11475118
 ** Epoch 96 took 602.425176s
step: [54/553] time: 1.0721s, d_loss: 0.20289254, g_loss: 3.14801693, rnn_loss: 0.07008798
step: [109/553] time: 1.1426s, d_loss: 0.03195674, g_loss: 5.36317635, rnn_loss: 0.14668928
step: [164/553] time: 1.1059s, d_loss: 0.01125731, g_loss: 5.94313002, rnn_loss: 0.21073838
step: [219/553] time: 1.0517s, d_loss: 0.01747210, 

step: [549/553] time: 1.0972s, d_loss: 0.00514068, g_loss: 6.14648724, rnn_loss: 0.04402585
 ** Epoch 104 took 615.479682s
step: [54/553] time: 1.1684s, d_loss: 0.25047293, g_loss: 3.33241081, rnn_loss: 0.15517293
step: [109/553] time: 1.0680s, d_loss: 0.08894019, g_loss: 6.48262453, rnn_loss: 0.16307811
step: [164/553] time: 1.0726s, d_loss: 0.02085728, g_loss: 5.78406191, rnn_loss: 0.07132156
step: [219/553] time: 1.0870s, d_loss: 0.47591916, g_loss: 6.17015314, rnn_loss: 0.11928892
step: [274/553] time: 1.1302s, d_loss: 0.48210678, g_loss: 2.57144165, rnn_loss: 0.09625901
step: [329/553] time: 1.1167s, d_loss: 1.02762961, g_loss: 5.22513723, rnn_loss: 0.12988979
step: [384/553] time: 1.1277s, d_loss: 0.00948985, g_loss: 5.56094933, rnn_loss: 0.09206101
step: [439/553] time: 1.0726s, d_loss: 0.04513664, g_loss: 6.80781841, rnn_loss: 0.07792510
step: [494/553] time: 1.1121s, d_loss: 0.30272216, g_loss: 4.99048805, rnn_loss: 0.09897691
step: [549/553] time: 1.1460s, d_loss: 0.21670121,

step: [384/553] time: 1.0254s, d_loss: 0.03152180, g_loss: 5.95677567, rnn_loss: 0.15430996
step: [439/553] time: 1.0995s, d_loss: 0.03242642, g_loss: 4.03701067, rnn_loss: 0.07191584
step: [494/553] time: 1.1099s, d_loss: 0.01674936, g_loss: 6.44470882, rnn_loss: 0.13397415
step: [549/553] time: 1.0644s, d_loss: 0.40065217, g_loss: 3.29887581, rnn_loss: 0.09080791
 ** Epoch 113 took 608.424594s
step: [54/553] time: 1.0993s, d_loss: 0.02183127, g_loss: 4.56968307, rnn_loss: 0.02180161
step: [109/553] time: 1.1237s, d_loss: 0.04534716, g_loss: 4.54189920, rnn_loss: 0.13836327
step: [164/553] time: 1.1142s, d_loss: 2.52015209, g_loss: 3.64686394, rnn_loss: 0.18123977
step: [219/553] time: 1.1210s, d_loss: 0.00589263, g_loss: 4.80582428, rnn_loss: 0.10656945
step: [274/553] time: 1.1745s, d_loss: 0.01834426, g_loss: 4.50578880, rnn_loss: 0.11826214
step: [329/553] time: 1.0442s, d_loss: 0.01694335, g_loss: 5.02903652, rnn_loss: 0.11831091
step: [384/553] time: 1.1528s, d_loss: 0.77335531,

step: [164/553] time: 1.0518s, d_loss: 0.03454088, g_loss: 7.97100544, rnn_loss: 0.09468859
step: [219/553] time: 1.0872s, d_loss: 0.00243239, g_loss: 6.61120605, rnn_loss: 0.05067404
step: [274/553] time: 1.0899s, d_loss: 0.10593595, g_loss: 3.50987864, rnn_loss: 0.07041869
step: [329/553] time: 1.0506s, d_loss: 1.72448432, g_loss: 5.18311930, rnn_loss: 0.09210553
step: [384/553] time: 1.0949s, d_loss: 0.07403724, g_loss: 4.60642624, rnn_loss: 0.15223311
step: [439/553] time: 1.0708s, d_loss: 0.10425515, g_loss: 6.15948439, rnn_loss: 0.13751037
step: [494/553] time: 1.1173s, d_loss: 0.16061281, g_loss: 9.25624561, rnn_loss: 0.09250811
step: [549/553] time: 1.0626s, d_loss: 0.34692791, g_loss: 7.71992111, rnn_loss: 0.18587422
 ** Epoch 122 took 613.636101s
step: [54/553] time: 1.0661s, d_loss: 0.52917069, g_loss: 3.18769741, rnn_loss: 0.09928373
step: [109/553] time: 1.0665s, d_loss: 0.01585220, g_loss: 3.08872890, rnn_loss: 0.05719649
step: [164/553] time: 1.1239s, d_loss: 0.04106286,

step: [549/553] time: 1.0363s, d_loss: 0.02546614, g_loss: 4.24951315, rnn_loss: 0.06667876
 ** Epoch 130 took 622.639375s
step: [54/553] time: 1.0944s, d_loss: 0.11362079, g_loss: 6.55506897, rnn_loss: 0.12646469
step: [109/553] time: 1.1609s, d_loss: 0.31458378, g_loss: 7.09548378, rnn_loss: 0.13226137


## 3. Evaluation metric

In [None]:
def generate_r_precision_data():
    """Extract and save the features consumed by the R-precision evaluation.

    For every full batch of test captions this function:
      1. encodes the true captions with the RNN encoder,
      2. generates images from those captions and a fresh normal noise vector,
      3. encodes the generated images with the CNN encoder,
      4. encodes each of the cfg.WRONG_CAPTION mismatched captions with the
         RNN encoder.

    The collected arrays are written with np.savez to
    cfg.R_PRECISION_DIR / cfg.R_PRECISION_FILE under the keys 'true_cnn',
    'true_rnn' (both shaped (num_batches, BATCH_SIZE, EMBEDDING_DIM)) and
    'wrong_rnn' (shaped (num_batches, WRONG_CAPTION, BATCH_SIZE, EMBEDDING_DIM)).

    Relies on notebook-level objects defined in earlier cells: test_dataset,
    rnn_encoder, cnn_encoder, generator, t_real_caption, t_real_image and t_z.

    Note: captions that do not fill a complete batch are skipped, because the
    number of batches is computed with floor division.
    """
    caption_ids = np.reshape(np.asarray(test_dataset.captions_ids), (-1, cfg.TEXT.WORDS_NUM))
    captions_ids_wrong = np.reshape(test_dataset.random_wrong_captions(), (-1, cfg.WRONG_CAPTION, cfg.TEXT.WORDS_NUM))

    n_caption_test = len(caption_ids)
    num_batches = n_caption_test // cfg.BATCH_SIZE

    true_cnn_features = np.zeros((num_batches, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)
    true_rnn_features = np.zeros((num_batches, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)
    wrong_rnn_features = np.zeros((num_batches, cfg.WRONG_CAPTION, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)

    # The context manager closes the session (and frees its resources) even if
    # restoring the checkpoint or running the graph raises.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        # load the trained checkpoint
        checkpoint_dir = cfg.CHECKPOINT_DIR
        if checkpoint_dir is not None:
            loader = tf.train.Saver(var_list=tf.global_variables())
            # Read the checkpoint name from the config, exactly as
            # generate_inception_score_data() does, so both evaluation steps
            # restore the same checkpoint.
            ckpt_path = os.path.join(checkpoint_dir, cfg.CHECKPOINT_NAME)
            loader.restore(sess, ckpt_path)
            print("Restored model parameters from {}".format(ckpt_path))
        else:
            print('no checkpoints find.')

        for i in range(num_batches):
            start = i * cfg.BATCH_SIZE
            end = start + cfg.BATCH_SIZE
            test_cap = caption_ids[start:end]

            z = np.random.normal(loc=0.0, scale=1.0, size=(cfg.BATCH_SIZE, cfg.GAN.Z_DIM)).astype(np.float32)

            rnn_features = sess.run(rnn_encoder.outputs, feed_dict={t_real_caption: test_cap})
            gen = sess.run(generator.outputs, feed_dict={t_real_caption: test_cap, t_z: z})
            cnn_features = sess.run(cnn_encoder.outputs, feed_dict={t_real_image: gen})

            true_cnn_features[i] = cnn_features
            true_rnn_features[i] = rnn_features

            # The batch of mismatched captions is the same for every
            # per_wrong_caption index, so slice it once per batch.
            wrong_cap_batch = captions_ids_wrong[start:end]
            for per_wrong_caption in range(cfg.WRONG_CAPTION):
                wrong_rnn_features[i, per_wrong_caption] = sess.run(
                    rnn_encoder.outputs,
                    feed_dict={t_real_caption: wrong_cap_batch[:, per_wrong_caption]})

    output_path = os.path.join(cfg.R_PRECISION_DIR, cfg.R_PRECISION_FILE)

    # if exists, remove the existing file first
    try:
        os.remove(output_path)
    except OSError:
        # Best effort: the file usually just does not exist yet.
        pass
    np.savez(output_path, true_cnn=true_cnn_features, true_rnn=true_rnn_features,
             wrong_rnn=wrong_rnn_features)

In [None]:
def generate_inception_score_data():
    """Generate and save the images used to compute the inception score.

    For every full batch of test images and every one of the
    cfg.TEXT.CAPTIONS_PER_IMAGE captions, an image is generated from the
    caption and a fresh normal noise vector, then written to
    cfg.TEST.GENERATED_TEST_IMAGES/<filename>_<caption index>.png. The
    sub-directory named by the part of each filename before the first '/' is
    created when missing.

    Relies on notebook-level objects defined in earlier cells: test_dataset,
    generator, t_real_caption and t_z.

    Note: images that do not fill a complete batch are skipped, because the
    number of batches is computed with floor division.
    """
    caption_ids = np.reshape(np.asarray(test_dataset.captions_ids),
                             (-1, cfg.TEXT.CAPTIONS_PER_IMAGE, cfg.TEXT.WORDS_NUM))

    n_caption_test = len(caption_ids)
    num_batches = n_caption_test // cfg.BATCH_SIZE

    # The context manager closes the session (and frees its resources) even if
    # restoring the checkpoint or running the graph raises.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        checkpoint_dir = cfg.CHECKPOINT_DIR
        if checkpoint_dir is not None:
            loader = tf.train.Saver(var_list=tf.global_variables())
            ckpt_path = os.path.join(checkpoint_dir, cfg.CHECKPOINT_NAME)
            loader.restore(sess, ckpt_path)
            print("Restored model parameters from {}".format(ckpt_path))
        else:
            print('no checkpoints find.')

        for i in range(num_batches):
            start = i * cfg.BATCH_SIZE
            end = start + cfg.BATCH_SIZE
            # The filenames of a batch do not depend on the caption index.
            test_directory = test_dataset.filenames[start:end]

            # Create the output sub-directories once per batch. makedirs also
            # creates the cfg.TEST.GENERATED_TEST_IMAGES root if it is missing,
            # which os.mkdir would fail on.
            for class_name in {name.split('/')[0] for name in test_directory}:
                class_dir = os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, class_name)
                if not os.path.exists(class_dir):
                    os.makedirs(class_dir)

            for per_caption in range(cfg.TEXT.CAPTIONS_PER_IMAGE):
                test_cap = caption_ids[start:end, per_caption]

                z = np.random.normal(loc=0.0, scale=1.0, size=(cfg.BATCH_SIZE, cfg.GAN.Z_DIM)).astype(np.float32)
                gen = sess.run(generator.outputs, feed_dict={t_real_caption: test_cap, t_z: z})

                for j in range(cfg.BATCH_SIZE):
                    image_path = os.path.join(cfg.TEST.GENERATED_TEST_IMAGES,
                                              test_directory[j] + '_{}.png'.format(per_caption))
                    scipy.misc.imsave(image_path, gen[j])

In [None]:
# Extract the image/caption features for the R-precision evaluation and save
# them to cfg.R_PRECISION_DIR / cfg.R_PRECISION_FILE.
generate_r_precision_data()

In [None]:
# Generate one image per test caption and save the PNG files under
# cfg.TEST.GENERATED_TEST_IMAGES for the inception-score evaluation.
generate_inception_score_data()

## 4. Measure Inception score and R-precision of given test dataset

After setting the config file to 'eval_birds.yml' and running 'generate_inception_score_data()' and 'generate_r_precision_data()', the images synthesized from the given captions and the set of image and caption features should be saved inside the 'evaluation' folder, specifically in 'evaluation/generated_images/..' and as 'evaluation/r_precision.npz', respectively.

**Then, go to the 'evaluation' folder and run 'inception_score.ipynb' and 'r_precision.ipynb' to measure the inception score and the R-precision score, respectively.**