# M2177.003100 Deep Learning <br> Final Project: Text to Image Synthesis (Tensorflow)

Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them. 

**For understanding of this work, please carefully look at given PPT file.**

**Note**: certain details are missing or ambiguous on purpose, in order to test your knowledge on the related materials. However, if you really feel that something essential is missing and cannot proceed to the next step, then contact the teaching staff with clear description of your problem.

### Submitting your work:
<font color=red>**DO NOT clear the training process**</font> so that TAs can grade both your code and results.  
**The TA will set a config file as 'eval_birds.yml' when evaluating the code using 'hidden test dataset'. Thus, please make sure that your code can generate proper data to measure inception score and R-precision of 'hidden test dataset'.**

## 1. Load datasets
The Birds dataset will be downloaded automatically if it is not located in the *data* directory. <br>

Implemented for: main_1215_b + EDSR_NoiseAugmentation + CNN_ENCODER_256, RNN_ENCODER_256

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os, nltk
from miscc.config import cfg, cfg_from_file
import pprint
import datetime
import dateutil.tz
import numpy as np
import scipy
from utils.data_utils import CUBDataset
from utils.loss import cosine_similarity
import pandas as pd
from scipy.io import loadmat
import re
import string
import random
import time

#################################################
# DO NOT CHANGE 
from utils.model_ import CNN_ENCODER, RNN_ENCODER, GENERATOR, DISCRIMINATOR, CNN_ENCODER_256, RNN_ENCODER_256
#################################################

%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the experiment settings into the shared `cfg` object.
# Use 'train_birds.yml' for training and 'eval_birds.yml' for evaluation.
cfg_from_file('cfg/train_birds.yml') # eval_birds.yml

print('Using config:')
pprint.pprint(cfg)

# Expose only the GPU(s) named in the config to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU_ID

# Compose a timestamped output directory name from the dataset name, the config
# name and the local time. Only the string is built here; nothing is created on disk.
now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
output_dir = 'sample/%s_%s_%s' % (cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)

Using config:
{'BATCH_SIZE': 64,
 'CHECKPOINT_DIR': './checkpoint',
 'CHECKPOINT_NAME': 'model.ckpt',
 'CNN': {'EMBEDDING_DIM': 0, 'H_DIM': 0},
 'CONFIG_NAME': 'text-to-image',
 'CUDA': False,
 'DATASET_NAME': 'birds',
 'DATA_DIR': 'data/birds',
 'EMBEDDING_TYPE': 'cnn-rnn',
 'GAN': {'B_ATTENTION': False,
         'B_CONDITION': False,
         'B_DCGAN': False,
         'CONDITION_DIM': 0,
         'DF_DIM': 0,
         'EMBEDDING_DIM': 0,
         'GF_DIM': 0,
         'R_NUM': 0,
         'Z_DIM': 512},
 'GPU_ID': '0',
 'IMAGE_SIZE': 256,
 'NUM_BATCH_FOR_TEST': 0,
 'RANDOM_SEED': 0,
 'RNN': {'EMBEDDING_DIM': 0,
         'H_DIM': 0,
         'TYPE': '',
         'VOCAB_SIZE': 0,
         'WORD_EMBEDDING_DIM': 0},
 'R_PRECISION_DIR': './evaluation',
 'R_PRECISION_FILE': 'r_precision.npz',
 'R_PRECISION_FILE_HIDDEN': 'r_precision_hidden.npz',
 'TEST': {'B_EXAMPLE': False,
          'GENERATED_HIDDEN_TEST_IMAGES': './evaluation/generated_images_hidden',
          'GENERATED_TEST_IMAGES'

  yaml_cfg = edict(yaml.load(f))


In [3]:
# Instantiate the train and test splits of the CUB dataset from cfg.DATA_DIR.
train_dataset = CUBDataset(cfg.DATA_DIR, split='train')
test_dataset = CUBDataset(cfg.DATA_DIR, split='test')

# The remaining lines only print diagnostics about what was loaded.
print(f'\ntrain data directory:\n{train_dataset.split_dir}')
print(f'test data directory:\n{test_dataset.split_dir}\n')

print(f'# of train filenames:{train_dataset.filenames.shape}')
print(f'# of test filenames:{test_dataset.filenames.shape}\n')

print(f'example of filename of train image:{train_dataset.filenames[0]}')
print(f'example of filename of valid image:{test_dataset.filenames[0]}\n')

# First caption of each split, shown both as stored in `captions` and as its id sequence.
print(f'example of caption and its ids:\n{train_dataset.captions[0]}\n{train_dataset.captions_ids[0]}\n')
print(f'example of caption and its ids:\n{test_dataset.captions[0]}\n{test_dataset.captions_ids[0]}\n')

print(f'# of train captions:{np.asarray(train_dataset.captions).shape}')
print(f'# of test captions:{np.asarray(test_dataset.captions).shape}\n')

print(f'# of train caption ids:{np.asarray(train_dataset.captions_ids).shape}')
print(f'# of test caption ids:{np.asarray(test_dataset.captions_ids).shape}\n')

print(f'# of train images:{train_dataset.images.shape}')
print(f'# of test images:{test_dataset.images.shape}\n')

self.current_dir:
/home/chszerg/final-project-deep-learning-19-tf

self.data_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds

self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011.tgz

Dataset already exists
self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011/images

Load from:  data/birds/captions.pickle
self.current_dir:
/home/chszerg/final-project-deep-learning-19-tf

self.data_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds

self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011.tgz

Dataset already exists
self.image_dir:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/CUB_200_2011/images

Load from:  data/birds/captions.pickle

train data directory:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/train
test data directory:
/home/chszerg/final-project-deep-learning-19-tf/data/birds/test

# of train filenames:(8855,)
# of test f

In [4]:
# Pull the image arrays and the caption-id sequences out of the dataset objects
# so later cells can index them directly; caption ids are converted to ndarrays.
train_images = train_dataset.images
test_images = test_dataset.images
train_captions = np.asarray(train_dataset.captions_ids)
test_captions = np.asarray(test_dataset.captions_ids)
# Sanity check: print the shape of each array.
print(train_images.shape)
print(test_images.shape)
print(train_captions.shape)
print(test_captions.shape)

(8855, 256, 256, 3)
(2933, 256, 256, 3)
(88550, 20)
(29330, 20)


In [5]:
# Build 64x64x3 copies of every train and test image with skimage's resize.
from skimage.transform import resize
train_images_64 = []
for train_image in train_images:
    train_images_64.append(resize(train_image, (64, 64, 3)))
train_images_64 = np.asarray(train_images_64)
print(train_images_64.shape)
# The resized set must contain exactly one entry per source image.
assert train_images_64.shape[0] == train_images.shape[0]
test_images_64 = []
for test_image in test_images:
    test_images_64.append(resize(test_image, (64, 64, 3)))
test_images_64 = np.asarray(test_images_64)
print(test_images_64.shape)
assert test_images_64.shape[0] == test_images.shape[0]

(8855, 64, 64, 3)
(2933, 64, 64, 3)


In [6]:
# Dataset sizes used when sampling training batches.
n_captions_train = len(train_captions)
# Ten captions per image: consistent with the shapes printed earlier
# (88550 caption rows for 8855 training images).
n_captions_per_image = 10
n_images_train = len(train_images)

In [7]:
import scipy.misc
import threading
import scipy.ndimage as ndi
from skimage import transform
from skimage import exposure
import skimage
from nltk.tokenize import RegexpTokenizer

def sent2ID(sample_sentence):
    """Convert one raw caption string into the id array produced by the training dataset.

    The sentence is lower-cased, split into runs of word characters, stripped of
    non-ASCII characters, and each remaining token that is present in
    ``train_dataset.wordtoix`` is replaced by its id (unknown words are dropped).
    The id list is then handed to ``train_dataset.get_caption``.

    Args:
        sample_sentence: Caption text.

    Returns:
        A one-element list holding the first value returned by
        ``train_dataset.get_caption`` with axis 1 squeezed out.

    Raises:
        ValueError: If ``sample_sentence`` is empty. (The previous implementation
            called ``exit()``, which ends the interpreter session instead of
            reporting the problem to the caller.)
    """
    if len(sample_sentence) == 0:
        raise ValueError("sent2ID() requires a non-empty sentence")

    # Pairs of U+FFFD replacement characters are treated as a word separator.
    text = sample_sentence.replace("\ufffd\ufffd", " ")
    tokenizer = RegexpTokenizer(r'\w+')

    tokens = []
    for token in tokenizer.tokenize(text.lower()):
        # Drop any non-ASCII characters; tokens that become empty are skipped.
        token = token.encode('ascii', 'ignore').decode('ascii')
        if len(token) > 0:
            tokens.append(token)

    # Keep only in-vocabulary words, preserving their order.
    word_ids = [train_dataset.wordtoix[w] for w in tokens if w in train_dataset.wordtoix]

    x, x_len = train_dataset.get_caption(word_ids)
    return [np.squeeze(x, axis=1)]

def ID2sent(sample_caption):
    """Translate a sequence of word ids into a list of words, skipping padding.

    Args:
        sample_caption: Iterable of word ids.

    Returns:
        List with ``train_dataset.ixtoword[ID]`` for every id that differs from
        the padding value.
    """
    # NOTE(review): the padding value is read from `ixtoword` using the key
    # '<PAD>', although `ixtoword` is otherwise indexed by id and `wordtoix`
    # by word in this file -- confirm that this key exists in `ixtoword`.
    return [
        train_dataset.ixtoword[ID]
        for ID in sample_caption
        if ID != train_dataset.ixtoword['<PAD>']
    ]

def get_random_int(min=0, max=10, number=5):
    """Draw ``number`` random integers, each uniformly from ``min`` to ``max`` inclusive.

    Args:
        min: Lower bound (inclusive).
        max: Upper bound (inclusive).
        number: How many values to draw.

    Returns:
        List of ``number`` integers drawn with ``random.randint``.
    """
    draws = []
    for _ in range(number):
        draws.append(random.randint(min, max))
    return draws

def merge(images, size):
    """Tile a batch of 3-channel images into one grid image, filling row by row.

    Args:
        images: Array indexed as (image, height, width, channel).
        size: Two-element sequence ``(rows, cols)`` giving the grid layout.

    Returns:
        Float array of shape ``(height * rows, width * cols, 3)``; cells with no
        image stay zero.
    """
    tile_h, tile_w = images.shape[1], images.shape[2]
    n_rows, n_cols = size[0], size[1]
    canvas = np.zeros((tile_h * n_rows, tile_w * n_cols, 3))
    for position, tile in enumerate(images):
        # Row-major placement: the column index advances fastest.
        row, col = divmod(position, n_cols)
        top, left = row * tile_h, col * tile_w
        canvas[top:top + tile_h, left:left + tile_w, :] = tile
    return canvas

def imsave(images, size, path):
    """Tile ``images`` into a grid with ``merge`` and write it with ``scipy.misc.imsave``.

    Args:
        images: Batch of images accepted by ``merge``.
        size: ``(rows, cols)`` grid layout.
        path: Destination file path.

    Returns:
        Whatever ``scipy.misc.imsave`` returns.
    """
    grid = merge(images, size)
    return scipy.misc.imsave(path, grid)

def save_images(images, size, image_path):
    """Save a batch of images as one tiled grid file; thin wrapper around ``imsave``.

    Args:
        images: Batch of images.
        size: ``(rows, cols)`` grid layout.
        image_path: Destination file path.

    Returns:
        The return value of ``imsave``.
    """
    outcome = imsave(images, size, image_path)
    return outcome

def threading_data(data=None, fn=None, **kwargs):
    """Apply ``fn`` to every element of ``data``, using one thread per element.

    Args:
        data: Indexable collection with a length.
        fn: Callable invoked as ``fn(element, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded to every ``fn`` call.

    Returns:
        ``np.asarray`` of the per-element results, in input order.
    """
    outputs = [None] * len(data)

    def worker(slot, item, extra):
        # Each thread writes only its own slot, so output order matches input order.
        outputs[slot] = fn(item, **extra)

    running = []
    for position in range(len(data)):
        thread = threading.Thread(
            name='threading_and_return',
            target=worker,
            args=(position, data[position], kwargs),
        )
        thread.start()
        running.append(thread)

    # Wait for every worker before collecting the results.
    for thread in running:
        thread.join()
    return np.asarray(outputs)

def threading_data_256(data=None, fn=None, **kwargs):
    """Apply ``fn`` to every element of ``data``, using one thread per element.

    This function used to be a line-for-line copy of ``threading_data``; it now
    delegates to it so the two cannot drift apart. The name is kept so existing
    callers continue to work.

    Args:
        data: Indexable collection with a length.
        fn: Callable invoked as ``fn(element, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded to every ``fn`` call.

    Returns:
        ``np.asarray`` of the per-element results, in input order.
    """
    return threading_data(data=data, fn=fn, **kwargs)

def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', cval=0., order=1):
    """Apply a 3x3 homogeneous affine transform to every channel of an image.

    Args:
        x: Image array; the axis given by ``channel_index`` holds the channels.
        transform_matrix: 3x3 matrix. Its upper-left 2x2 block is used as the
            linear part and the first two entries of its last column as the offset.
        channel_index: Position of the channel axis in ``x``.
        fill_mode: Passed as ``mode`` to ``scipy.ndimage.affine_transform``.
        cval: Passed as ``cval`` to ``scipy.ndimage.affine_transform``.
        order: Spline interpolation order passed to ``affine_transform``.

    Returns:
        Transformed array with the channel axis back at ``channel_index``.
    """
    # Move channels to the front so each 2-D plane can be warped independently.
    planes = np.rollaxis(x, channel_index, 0)
    linear_part = transform_matrix[:2, :2]
    offset = transform_matrix[:2, 2]
    # Call the public scipy.ndimage.affine_transform directly; the
    # scipy.ndimage.interpolation namespace used previously is deprecated.
    warped = [
        ndi.affine_transform(plane, linear_part, offset,
                             order=order, mode=fill_mode, cval=cval)
        for plane in planes
    ]
    stacked = np.stack(warped, axis=0)
    # Restore the original channel position.
    return np.rollaxis(stacked, 0, channel_index + 1)

def transform_matrix_offset_center(matrix, x, y):
    """Re-centre a 3x3 transform so it acts about the point (x/2 + 0.5, y/2 + 0.5).

    Args:
        matrix: 3x3 homogeneous transform.
        x: Extent along the first coordinate.
        y: Extent along the second coordinate.

    Returns:
        3x3 array equal to ``offset . matrix . reset`` where ``offset``
        translates by the centre point and ``reset`` translates back by its negative.
    """
    center_x = float(x) / 2 + 0.5
    center_y = float(y) / 2 + 0.5
    to_center = np.array([[1, 0, center_x], [0, 1, center_y], [0, 0, 1]])
    from_center = np.array([[1, 0, -center_x], [0, 1, -center_y], [0, 0, 1]])
    return to_center.dot(matrix).dot(from_center)

def rotation(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index=2,
                    fill_mode='nearest', cval=0.):
    """Rotate an image about its centre.

    Args:
        x: Image array.
        rg: Rotation angle in degrees; with ``is_random`` the angle is drawn
            uniformly from ``[-rg, rg)`` instead.
        is_random: Whether to sample the angle.
        row_index: Axis of ``x`` holding rows.
        col_index: Axis of ``x`` holding columns.
        channel_index: Axis of ``x`` holding channels.
        fill_mode: Boundary mode forwarded to ``apply_transform``.
        cval: Fill value forwarded to ``apply_transform``.

    Returns:
        The rotated image as returned by ``apply_transform``.
    """
    degrees = np.random.uniform(-rg, rg) if is_random else rg
    theta = np.pi / 180 * degrees
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation_matrix = np.array([[cos_t, -sin_t, 0],
                                [sin_t, cos_t, 0],
                                [0, 0, 1]])
    n_rows, n_cols = x.shape[row_index], x.shape[col_index]
    # Shift the rotation so it pivots around the image centre.
    centred = transform_matrix_offset_center(rotation_matrix, n_rows, n_cols)
    return apply_transform(x, centred, channel_index, fill_mode, cval)

def crop(x, wrg, hrg, is_random=False, row_index=0, col_index=1, channel_index=2):
    """Cut an ``hrg`` x ``wrg`` window out of an image.

    Args:
        x: Image array.
        wrg: Width of the crop.
        hrg: Height of the crop.
        is_random: If true, the window offset is derived from random draws;
            otherwise the window is centred.
        row_index: Axis of ``x`` holding rows.
        col_index: Axis of ``x`` holding columns.
        channel_index: Unused by the body; kept for signature compatibility.

    Returns:
        The cropped slice of ``x``.

    Raises:
        AssertionError: If the image is not strictly larger than the crop in
            both dimensions.
    """
    h, w = x.shape[row_index], x.shape[col_index]
    assert (h > hrg) and (w > wrg), "The size of cropping should smaller than the original image"
    if not is_random:
        # Centre crop.
        top = int(np.floor((h - hrg) / 2.))
        left = int(np.floor((w - wrg) / 2.))
        return x[top: top + hrg, left: left + wrg]
    # Random crop: the row offset is drawn first, then the column offset.
    top = int(np.random.uniform(0, h - hrg) - 1)
    left = int(np.random.uniform(0, w - wrg) - 1)
    return x[top: hrg + top, left: wrg + left]

def flip_axis(x, axis, is_random=False):
    """Reverse an array along ``axis``, either always or on a coin flip.

    Args:
        x: Array-like input.
        axis: Axis to reverse.
        is_random: If true, a value is drawn uniformly from ``[-1, 1)`` and the
            flip only happens when it is positive; otherwise ``x`` is returned
            untouched.

    Returns:
        The flipped array, or ``x`` itself when a random draw decides not to flip.
    """
    if is_random and not (np.random.uniform(-1, 1) > 0):
        return x
    # Bring the target axis to the front, reverse it, and move it back.
    swapped = np.asarray(x).swapaxes(axis, 0)
    reversed_front = swapped[::-1, ...]
    return reversed_front.swapaxes(0, axis)

def imresize(x, size=[100, 100], interp='bilinear', mode=None):
    """Resize a 1- or 3-channel image with ``scipy.misc.imresize``.

    Args:
        x: Image array whose last axis has length 1 or 3.
        size: Target size forwarded to ``scipy.misc.imresize``.
        interp: Interpolation name forwarded to ``scipy.misc.imresize``.
        mode: Mode forwarded to ``scipy.misc.imresize``.

    Returns:
        The resized image; single-channel input keeps a trailing axis of length 1.

    Raises:
        Exception: If the last axis has a length other than 1 or 3.
    """
    channels = x.shape[-1]
    if channels == 3:
        return scipy.misc.imresize(x, size, interp=interp, mode=mode)
    if channels == 1:
        # Resize the single plane as 2-D data, then restore the channel axis.
        plane = scipy.misc.imresize(x[:, :, 0], size, interp=interp, mode=mode)
        return plane[:, :, np.newaxis]
    raise Exception("Unsupported channel %d" % x.shape[-1])

def prepro_img(x, mode=None):
    """Augment and rescale one image for 64x64 training; a no-op in any other mode.

    In ``'train'`` mode the image is randomly flipped along axis 1, randomly
    rotated by up to 16 degrees, resized to 79x79, randomly cropped to 64x64 and
    mapped with ``x / 127.5 - 1`` (which sends 0 to -1 and 255 to 1).

    Args:
        x: Input image.
        mode: ``'train'`` enables the pipeline; anything else returns ``x`` unchanged.

    Returns:
        The processed image, or ``x`` itself outside training mode.
    """
    if mode != 'train':
        return x
    flipped = flip_axis(x, axis=1, is_random=True)
    rotated = rotation(flipped, rg=16, is_random=True, fill_mode='nearest')
    enlarged = imresize(rotated, size=[64 + 15, 64 + 15], interp='bilinear', mode=None)
    patch = crop(enlarged, wrg=64, hrg=64, is_random=True)
    scaled = patch / (255. / 2.)
    return scaled - 1.

def prepro_img_256(x, mode=None):
    """Augment and rescale one image for 256x256 training; a no-op in any other mode.

    In ``'train'`` mode the image is randomly flipped along axis 1, randomly
    rotated by up to 16 degrees, resized to 316x316, randomly cropped to 256x256
    and mapped with ``x / 127.5 - 1`` (which sends 0 to -1 and 255 to 1).

    Args:
        x: Input image.
        mode: ``'train'`` enables the pipeline; anything else returns ``x`` unchanged.

    Returns:
        The processed image, or ``x`` itself outside training mode.
    """
    if mode != 'train':
        return x
    flipped = flip_axis(x, axis=1, is_random=True)
    rotated = rotation(flipped, rg=16, is_random=True, fill_mode='nearest')
    enlarged = imresize(rotated, size=[256 + 60, 256 + 60], interp='bilinear', mode=None)
    patch = crop(enlarged, wrg=256, hrg=256, is_random=True)
    scaled = patch / (255. / 2.)
    return scaled - 1.

def combine_and_save_image_sets(image_sets, directory):
    """Write one side-by-side comparison image per index across several image sets.

    For every index of the first set, the image at that index from each set is
    placed left to right, each followed by a 5-pixel-wide black 3-channel
    spacer, and the strip is saved as ``combined_<index>.jpg`` in ``directory``.

    Args:
        image_sets: Sequence of image collections, indexed as ``image_sets[set][i]``.
        directory: Folder that receives the output files.
    """
    for index in range(len(image_sets[0])):
        strip = []
        for image_set in image_sets:
            tile = image_set[index]
            strip.append(tile)
            # Spacer with the same height as the tile it follows.
            strip.append(np.zeros((tile.shape[0], 5, 3)))
        combined = np.concatenate(strip, axis = 1)
        target = os.path.join(directory, 'combined_{}.jpg'.format(index))
        scipy.misc.imsave(target, combined)

def save_encoders_256(saver, sess, logdir, step):
    """Save a checkpoint named ``encoders_256.ckpt`` under ``logdir``.

    Args:
        saver: Object exposing ``save(sess, path, global_step=...)``.
        sess: Session handed through to ``saver.save``.
        logdir: Destination directory; created (with parents) when missing.
        step: Value forwarded as ``global_step``.
    """
    model_name = 'encoders_256.ckpt'
    checkpoint_path = os.path.join(logdir, model_name)
    # exist_ok replaces the separate exists() check, so there is no window
    # between testing for the directory and creating it.
    os.makedirs(logdir, exist_ok=True)
    saver.save(sess, checkpoint_path, global_step=step)
    print('The checkpoint has been created.')
        
def load(saver, sess, ckpt_path):
    """Restore variables from ``ckpt_path`` into ``sess`` and report it on stdout.

    Args:
        saver: Object exposing ``restore(sess, path)``.
        sess: Session receiving the restored values.
        ckpt_path: Path of the checkpoint to restore.
    """
    saver.restore(sess, ckpt_path)
    message = "Restored model parameters from {}".format(ckpt_path)
    print(message)

In [8]:
import tensorflow.contrib.slim as slim

def resBlock(x, channels=64, kernel_size=[3,3], scale=1):
    """Residual block: conv -> ReLU -> conv, scaled and added back onto the input.

    Args:
        x: Input tensor.
        channels: Number of output channels of both convolutions.
        kernel_size: Kernel size of both convolutions.
        scale: Factor applied to the residual branch before the addition.

    Returns:
        ``x`` plus the scaled residual branch.
    """
    residual = slim.conv2d(x, channels, kernel_size, activation_fn=None)
    residual = tf.nn.relu(residual)
    residual = slim.conv2d(residual, channels, kernel_size, activation_fn=None)
    residual = residual * scale
    return x + residual

def upsample(x, scale=2, features=64, activation=tf.nn.relu):
    """Upscale a feature map by 2, 3 or 4 with convolutions followed by ``PS``.

    A first 3x3 convolution produces ``features`` channels. Scales 2 and 3 then
    use one convolution to ``3 * scale**2`` channels followed by one ``PS`` call;
    scale 4 uses two rounds of a 12-channel convolution followed by ``PS`` with
    factor 2.

    Args:
        x: Input tensor.
        scale: Upscaling factor; must be 2, 3 or 4.
        features: Channel count of the first convolution.
        activation: Activation function passed to every convolution.

    Returns:
        The upscaled tensor.
    """
    assert scale in [2,3,4]
    x = slim.conv2d(x, features, [3,3], activation_fn=activation)
    if scale == 4:
        # x4 is realised as two successive x2 stages.
        factor, stages, ps_features = 2, 2, 3*(2**2)
    else:
        factor = 2 if scale == 2 else 3
        stages, ps_features = 1, 3*(scale**2)
    for _ in range(stages):
        x = slim.conv2d(x, ps_features, [3,3], activation_fn=activation)
        x = PS(x, factor, color=True)
    return x

def _phase_shift(I, r):
    """Rearrange one feature map's channels into a single-channel map that is r times larger.

    Args:
        I: 4-D tensor whose static shape is unpacked as (batch, a, b, c). The
            reshape below only works when c equals r * r.
        r: Integer upscaling factor.

    Returns:
        Tensor reshaped to (batch, a * r, b * r, 1).
    """
    # a, b (and c) come from the static shape; the batch size is then re-read
    # from the dynamic shape, overriding the static value.
    bsize, a, b, c = I.get_shape().as_list()
    bsize = tf.shape(I)[0]
    # Split the channel axis into an r x r block per spatial position.
    X = tf.reshape(I, (bsize, a, b, r, r))
    X = tf.transpose(X, (0, 1, 2, 4, 3))
    # Fold the `a` axis: split into `a` slices and concatenate them along axis 2.
    X = tf.split(X, a, 1)
    X = tf.concat([tf.squeeze(x, axis=1) for x in X],2)
    # Fold the `b` axis the same way.
    X = tf.split(X, b, 1)
    X = tf.concat([tf.squeeze(x, axis=1) for x in X],2)
    return tf.reshape(X, (bsize, a*r, b*r, 1))

def PS(X, r, color=False):
    """Apply ``_phase_shift`` to a tensor, optionally per colour group.

    Args:
        X: Input tensor.
        r: Upscaling factor forwarded to ``_phase_shift``.
        color: When true, ``X`` is split into 3 equal groups along axis 3, each
            group is shifted separately and the results are concatenated on axis 3.

    Returns:
        The rearranged tensor.
    """
    if not color:
        return _phase_shift(X, r)
    channel_groups = tf.split(X, 3, 3)
    shifted = [_phase_shift(group, r) for group in channel_groups]
    return tf.concat(shifted, 3)

def log10(x):
    """Base-10 logarithm of a tensor, computed as ``ln(x) / ln(10)``.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same dtype as ``tf.log(x)``.
    """
    ln_x = tf.log(x)
    ln_10 = tf.log(tf.constant(10, dtype=ln_x.dtype))
    return ln_x / ln_10

Loading Text2Img and EDSR

In [9]:
# Fixed inputs used to render preview images: one noise vector per sample and
# eight captions, each repeated sample_size / ni times.
z_dim = 512
batch_size = 64
sample_size = batch_size
ni = 8
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, z_dim)).astype(np.float32)
sample_sentence = ["a black bird with oily black feathers and rounded black beak."] * int(sample_size/ni) + \
                  ["a medium sized black bird, with a white belly, and webbed feet."] * int(sample_size/ni) + \
                  ["this is a white bird with black webbed feet and a black beak."] * int(sample_size/ni) + \
                  ["a small dully colored bird that has a grey head and nape, an oatmeal colored breast, belly and yellow and oatmeal-grey colored wings and tail."] * int(sample_size/ni) + \
                  ["this bird has a yellow throat, breast and belly, with a black band at the throat, and black crown, wings and tail."] * int(sample_size/ni) + \
                  ["this small bird is soft green all over, with a light eyering, short wings and moderate tail."] * int(sample_size/ni) + \
                  ["small bird with crown and throat is yellow, outer and inner rectrices are grey, beak is small, black and pointed."] * int(sample_size/ni) + \
                  ["a brown bird with striped wings, a large head, a long pointy beak, a long tail, and narrow legs."] * int(sample_size/ni)
# Replace every caption string, in place, with its id representation from sent2ID.
for i, sent in enumerate(sample_sentence):
    sample_sentence[i] = sent2ID(sent)
sample_sentence = np.asarray(sample_sentence)
# Flatten to one row of 20 ids per sample.
sample_sentence = np.reshape(sample_sentence, (sample_size, 20))
print(sample_sentence.shape)

(64, 20)


In [10]:
class Text2Img:
    """Graph for the 64x64 text-to-image model.

    Constructing an instance adds to the current default graph: the input
    placeholders, a joint image/text embedding loss, conditional GAN losses with
    their optimizers, and an inference-mode generator. Only the hyper-parameters,
    the placeholders and ``net_g`` are stored on ``self``; the loss tensors and
    the optimizer ops are local to ``__init__``.
    """
    def __init__(self):
        """ Information """
        # Hyper-parameters, all hard-coded.
        self.lr = 0.0002
        self.lr_decay = 0.5      
        self.decay_every = 100  
        self.beta1 = 0.5
        self.checkpoint_dir = './checkpoint'
        self.z_dim = 512
        self.image_size = 64
        self.c_dim = 3
        self.batch_size = 64
        self.alpha = 0.2
        
        """ Place Holders """
        # Fixed batch size; caption length is left unspecified (None).
        self.t_real_image = tf.placeholder('float32', [self.batch_size, self.image_size, self.image_size, 3], name = 'real_image')
        self.t_wrong_image = tf.placeholder('float32', [self.batch_size ,self.image_size, self.image_size, 3], name = 'wrong_image')
        self.t_real_caption = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='real_caption_input')
        self.t_wrong_caption = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='wrong_caption_input')
        self.t_z = tf.placeholder(tf.float32, [self.batch_size, self.z_dim], name='z_noise')
        
        """ Training Phase - CNN - RNN mapping """
        # The first encoder calls create the variables (reuse=False); every later call reuses them.
        net_cnn = CNN_ENCODER(self.t_real_image, is_training=True, reuse=False)
        x = net_cnn.outputs
        v = RNN_ENCODER(self.t_real_caption, is_training=True, reuse=False).outputs
        x_w = CNN_ENCODER(self.t_wrong_image, is_training=True, reuse=True).outputs
        v_w = RNN_ENCODER(self.t_wrong_caption, is_training=True, reuse=True).outputs
        # Sum of two hinge terms with margin alpha: the matching pair (x, v) is
        # compared against (x, wrong caption) and against (wrong image, v).
        rnn_loss = tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x, v) + cosine_similarity(x, v_w))) + \
                    tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x, v) + cosine_similarity(x_w, v)))
        
        """ Training Phase - GAN """
        # The text encoder is used with is_training=False for the GAN branch.
        net_rnn = RNN_ENCODER(self.t_real_caption, is_training=False, reuse=True)
        net_fake_image = GENERATOR(self.t_z, net_rnn.outputs, is_training=True, reuse=False)
        net_disc_fake = DISCRIMINATOR(net_fake_image.outputs, net_rnn.outputs, is_training=True, reuse=False)
        disc_fake_logits = net_disc_fake.logits
        net_disc_real = DISCRIMINATOR(self.t_real_image, net_rnn.outputs, is_training=True, reuse=True)
        disc_real_logits = net_disc_real.logits
        net_disc_mismatch = DISCRIMINATOR(self.t_real_image, RNN_ENCODER(self.t_wrong_caption, is_training=False, reuse=True).outputs,
                                        is_training=True, reuse=True)
        disc_mismatch_logits = net_disc_mismatch.logits
        # Discriminator targets: real image + matching text -> 1;
        # real image + wrong text -> 0; generated image + matching text -> 0.
        d_loss1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_real_logits,     labels=tf.ones_like(disc_real_logits),      name='d1'))
        d_loss2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_mismatch_logits, labels=tf.zeros_like(disc_mismatch_logits), name='d2'))
        d_loss3 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_fake_logits,     labels=tf.zeros_like(disc_fake_logits),     name='d3'))
        # The two "negative" terms are averaged before being added to the real term.
        d_loss = d_loss1 + (d_loss2 + d_loss3) * 0.5
        # Generator target: the discriminator should output 1 for generated images.
        g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_fake_logits, labels=tf.ones_like(disc_fake_logits), name='g'))

        """ Testing Phase """
        # Inference generator sharing the training variables (is_training=False, reuse=True).
        self.net_g = GENERATOR(self.t_z, RNN_ENCODER(self.t_real_caption, is_training=False, reuse=True).outputs,
                            is_training=False, reuse=True)
        
        """ Training """
        # Variables and update ops are partitioned by substring match on their names.
        rnn_vars = [var for var in tf.trainable_variables() if 'rnnencoder' in var.name]
        cnn_vars = [var for var in tf.trainable_variables() if 'cnnencoder' in var.name]
        d_vars = [var for var in tf.trainable_variables() if 'discriminator' in var.name]
        g_vars = [var for var in tf.trainable_variables() if 'generator' in var.name]
        # presumably batch-norm moving-average updates registered by the networks -- TODO confirm
        update_ops_CNN = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'cnnencoder' in var.name]
        update_ops_D = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'discriminator' in var.name]
        update_ops_G = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'generator' in var.name]
        with tf.variable_scope('learning_rate'):
            # Non-trainable variable holding the learning rate shared by all three optimizers.
            lr_v = tf.Variable(self.lr, trainable=False)
        with tf.control_dependencies(update_ops_CNN):
            # Encoder gradients are clipped to a global norm of 10 before being applied.
            grads, _ = tf.clip_by_global_norm(tf.gradients(rnn_loss, rnn_vars + cnn_vars), 10)
            optimizer = tf.train.AdamOptimizer(lr_v, beta1=self.beta1)
            rnn_optim = optimizer.apply_gradients(zip(grads, rnn_vars + cnn_vars))
        with tf.control_dependencies(update_ops_D):
            d_optim = tf.train.AdamOptimizer(lr_v, beta1=self.beta1).minimize(d_loss, var_list=d_vars)
        with tf.control_dependencies(update_ops_G):
            g_optim = tf.train.AdamOptimizer(lr_v, beta1=self.beta1).minimize(g_loss, var_list=g_vars)

In [11]:
class EDSR(object):
    """Residual super-resolution network built in the current default graph.

    The graph is: 3x3 conv -> ``num_layers`` residual blocks (residual scaled by
    0.1) -> 3x3 conv -> skip connection from the first conv -> ``upsample`` ->
    tanh. It is trained with a mean absolute difference loss and Adam.

    Attributes set on the instance: ``img_size``, ``scale``, ``output_channels``,
    ``lr`` (non-trainable variable), ``input`` and ``target`` placeholders,
    ``out`` (network output), ``loss`` and ``train_op``.
    """
    def __init__(self, img_size=64, num_layers=32, feature_size=256, scale=4, output_channels=3, lr=1e-4):
        """ Information """
        self.img_size = img_size
        self.scale = scale
        self.output_channels = output_channels
        self.lr = tf.Variable(lr, trainable=False)
        
        """ Place Holders"""
        # Batch dimension is left open; the target is `scale` times larger per side.
        self.input = x = tf.placeholder(tf.float32, [None, img_size, img_size, output_channels])
        self.target = y = tf.placeholder(tf.float32,[None, img_size*scale, img_size*scale, output_channels])
        image_input = x
        image_target = y
        
        """ Graph """
        x = slim.conv2d(image_input, feature_size, [3,3])
        # Kept for the long skip connection added after the residual stack.
        conv_1 = x
        scaling_factor = 0.1
        for i in range(num_layers):
            x = resBlock(x, feature_size, scale=scaling_factor)
        x = slim.conv2d(x, feature_size, [3,3])
        x += conv_1
        # Upsampling convolutions run without an activation (activation=None).
        x = upsample(x, scale, feature_size, None)
        self.out = output = tf.nn.tanh(x)
        
        """ Loss and Optimizer """
        self.loss = loss = tf.reduce_mean(tf.losses.absolute_difference(image_target, output))
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.minimize(self.loss)

In [12]:
# Start from an empty default graph and prepare a session configuration that
# enables GPU memory growth and soft device placement.
tf.reset_default_graph()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True

In [15]:
# Build each model in its own graph so their variables cannot collide.
text2img_graph = tf.Graph()
with text2img_graph.as_default():
    text2img = Text2Img()

edsr_graph = tf.Graph()
with edsr_graph.as_default():
    edsr = EDSR()

# Pass the session configuration prepared in the previous cell; without it the
# allow_growth / allow_soft_placement settings never take effect.
text2img_sess = tf.Session(graph=text2img_graph, config=config)
edsr_sess = tf.Session(graph=edsr_graph, config=config)

# Restore the 64x64 text-to-image weights.
text2img_checkpoint_dir = './64_gen_checkpoint'
with text2img_sess.as_default():
    with text2img_graph.as_default():
        tf.global_variables_initializer().run()
        text2img_saver = tf.train.Saver(tf.global_variables())
        text2img_ckpt = tf.train.get_checkpoint_state(text2img_checkpoint_dir)
        # get_checkpoint_state yields nothing usable when the directory holds no
        # checkpoint; fail with a clear message instead of an AttributeError.
        if not (text2img_ckpt and text2img_ckpt.model_checkpoint_path):
            raise FileNotFoundError('No checkpoint found in %s' % text2img_checkpoint_dir)
        text2img_saver.restore(text2img_sess, text2img_ckpt.model_checkpoint_path)

# Restore the super-resolution weights.
edsr_checkpoint_dir = './EDSR_checkpoint'
with edsr_sess.as_default():
    with edsr_graph.as_default():
        tf.global_variables_initializer().run()
        edsr_saver = tf.train.Saver(tf.global_variables())
        edsr_ckpt = tf.train.get_checkpoint_state(edsr_checkpoint_dir)
        if not (edsr_ckpt and edsr_ckpt.model_checkpoint_path):
            raise FileNotFoundError('No checkpoint found in %s' % edsr_checkpoint_dir)
        edsr_saver.restore(edsr_sess, edsr_ckpt.model_checkpoint_path)

INFO:tensorflow:Restoring parameters from ./64_gen_checkpoint/model.ckpt-950
INFO:tensorflow:Restoring parameters from ./EDSR_checkpoint/model.ckpt-216


In [16]:
# Generate one 64x64 image per sample caption / noise vector with the restored generator.
sample_images = text2img_sess.run([text2img.net_g.outputs],
                                  feed_dict={text2img.t_real_caption : sample_sentence, text2img.t_z : sample_seed})
sample_images = np.asarray(sample_images)
# run() was given a one-element fetch list, so drop the leading axis via reshape.
sample_images = np.reshape(sample_images, [64, 64, 64, 3])
print(sample_images.shape)
# Save the batch as an 8x8 grid.
save_images(sample_images, [8, 8], 'temp/temp1.png')

(64, 64, 64, 3)


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


In [17]:
# Upscale the generated 64x64 samples to 256x256 with the restored EDSR network.
output_images = edsr_sess.run([edsr.out], feed_dict={edsr.input: sample_images})
output_images = np.asarray(output_images)
output_images.shape
# run() was given a one-element fetch list, so drop the leading axis via reshape.
output_images = np.reshape(output_images, [64, 256, 256, 3])
print(output_images.shape)
# Save the batch as an 8x8 grid.
save_images(output_images, [8, 8], 'temp/temp2.png')

(64, 256, 256, 3)


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


Train CNN_ENCODER_256, RNN_ENCODER_256

In [None]:
class ENCODERS_256:
    """Graph for training the 256x256 image encoder and its matching text encoder.

    Constructing an instance adds to the current default graph: the input
    placeholders, the two encoders, a margin-based matching loss
    (``rnn_loss_256``), a learning-rate variable (``lr_v_256``) and the training
    op (``rnn_optim_256``).
    """
    def __init__(self):
        """ Information """
        self.batch_size = 64
        self.alpha = 0.2
        self.lr = 2e-4
        self.beta1 = 0.5
        
        """ Place Holders """
        # Fixed batch size; caption length is left unspecified (None).
        self.real_image_256 = tf.placeholder('float32', [self.batch_size, 256, 256, 3], name = 'real_image')
        self.wrong_image_256 = tf.placeholder('float32', [self.batch_size, 256, 256, 3], name = 'fake_image')
        self.real_caption_256 = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='real_caption_input')
        self.wrong_caption_256 = tf.placeholder(dtype=tf.int64, shape=[self.batch_size, None], name='wrong_caption_input')
        # Not consumed by any op built in this class.
        self.t_z_256 = tf.placeholder(tf.float32, [self.batch_size, 512], name='z_noise')
        
        """ Training Phase - CNN - RNN mapping """
        # The first encoder calls create the variables (reuse=False); the "wrong" branches reuse them.
        net_cnn_256 = CNN_ENCODER_256(self.real_image_256, is_training=True, reuse=False)
        x_256 = net_cnn_256.outputs
        v_256 = RNN_ENCODER_256(self.real_caption_256, is_training=True, reuse=False).outputs
        x_w_256 = CNN_ENCODER_256(self.wrong_image_256, is_training=True, reuse=True).outputs
        v_w_256 = RNN_ENCODER_256(self.wrong_caption_256, is_training=True, reuse=True).outputs
        # Sum of two hinge terms with margin alpha: the matching pair is compared
        # against (image, wrong caption) and against (wrong image, caption).
        self.rnn_loss_256 = tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x_256, v_256) + cosine_similarity(x_256, v_w_256))) + \
                        tf.reduce_mean(tf.maximum(0., self.alpha - cosine_similarity(x_256, v_256) + cosine_similarity(x_w_256, v_256)))
        
        """ Training """
        # Variables and update ops are selected by substring match on their names.
        # NOTE(review): assumes the encoders' variable names contain 'rnn_256' / 'cnn_256' -- confirm in utils.model_.
        rnn_vars_256 = [var for var in tf.trainable_variables() if 'rnn_256' in var.name]
        cnn_vars_256 = [var for var in tf.trainable_variables() if 'cnn_256' in var.name]
        update_ops_CNN_256 = [var for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if 'cnn_256' in var.name]
        with tf.variable_scope('learning_rate'):
            # Non-trainable variable so the training loop can assign a decayed rate.
            self.lr_v_256 = tf.Variable(self.lr, trainable=False)
        with tf.control_dependencies(update_ops_CNN_256):
            # Gradients are clipped to a global norm of 10 before being applied.
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.rnn_loss_256, rnn_vars_256 + cnn_vars_256), 10.)
            optimizer = tf.train.AdamOptimizer(self.lr_v_256, beta1=self.beta1)
            self.rnn_optim_256 = optimizer.apply_gradients(zip(grads, rnn_vars_256 + cnn_vars_256))

In [None]:
# Hyper-parameters for training the 256x256 encoders.
lr = 2e-4                 # initial learning rate
lr_decay = 0.5      
decay_every = 80          # the rate is multiplied by lr_decay every decay_every epochs
beta1 = 0.5
checkpoint_dir = './checkpoint_encoders_256'
z_dim = 512
image_size = 256
c_dim = 3
batch_size = 64
n_epoch = 200

In [None]:
# Build the encoder graph from scratch and open a session with GPU memory growth.
tf.reset_default_graph()
model = ENCODERS_256()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
sess = tf.Session(config=config)
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

# Resume from the latest checkpoint in checkpoint_dir when one exists.
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
    loader = tf.train.Saver(var_list=tf.global_variables())
    # Step parsed from the checkpoint file name. It is informational only: the
    # epoch counter below still starts at 0 after a restore.
    load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    load(loader, sess, ckpt.model_checkpoint_path)
else:
    print('no checkpoints find.')

n_batch_epoch = int(n_images_train / batch_size)
# Log about ten times per epoch; max(1, ...) avoids a modulo-by-zero when an
# epoch has fewer than ten batches.
log_every = max(1, n_batch_epoch // 10)
for epoch in range(n_epoch):
    start_time = time.time()
    if epoch !=0 and (epoch % decay_every == 0):
        # Step decay: multiply the base rate by lr_decay once per decay_every epochs.
        new_lr_decay = lr_decay ** (epoch // decay_every)
        sess.run(tf.assign(model.lr_v_256, lr * new_lr_decay))
        log = " ** new learning rate: %f" % (lr * new_lr_decay)
        print(log)
    elif epoch == 0:
        log = " ** init lr: %f  decay_every_epoch: %d, lr_decay: %f" % (lr, decay_every, lr_decay)
        print(log)
    for step in range(n_batch_epoch):
        step_time = time.time()
        # Matching pairs: random captions plus the image each caption belongs to
        # (caption index // captions-per-image gives the image index).
        idexs = get_random_int(min=0, max=n_captions_train-1, number=batch_size)
        b_real_caption = train_captions[idexs]
        b_real_images = train_images[np.floor(np.asarray(idexs).astype('float') / n_captions_per_image).astype('int')]
        # Mismatching captions and images are drawn independently.
        idexs = get_random_int(min=0, max=n_captions_train-1, number=batch_size)
        b_wrong_caption = train_captions[idexs]
        idexs2 = get_random_int(min=0, max=n_images_train-1, number=batch_size)
        b_wrong_images = train_images[idexs2]
        # Augment both image batches; no noise vector is sampled because the
        # encoder loss does not consume one.
        b_real_images = threading_data(b_real_images, prepro_img_256, mode='train')
        b_wrong_images = threading_data(b_wrong_images, prepro_img_256, mode='train')
        errRNN, _ = sess.run([model.rnn_loss_256, model.rnn_optim_256], feed_dict={
                                        model.real_image_256 : b_real_images,
                                        model.wrong_image_256 : b_wrong_images,
                                        model.real_caption_256 : b_real_caption,
                                        model.wrong_caption_256 : b_wrong_caption})
        if (step + 1) % log_every == 0:
            print("Step: [%d/%d] time: %4.4fs, Loss: %.8f" % (step, n_batch_epoch, time.time() - step_time, errRNN))
    print(" ** Epoch %d took %fs" % (epoch, time.time()-start_time))
    # Checkpoint every 20 epochs and also after the final epoch, so the last
    # stretch of training is not lost.
    is_last_epoch = (epoch == n_epoch - 1)
    if (epoch != 0 and (epoch % 20) == 0) or is_last_epoch:
        save_encoders_256(saver, sess, checkpoint_dir, epoch)
        print("[*] Save checkpoints SUCCESS!")

## 3. Evaluation metric

In [None]:
def generate_r_precision_data():
    """Compute and save the image/text features used to measure R-precision.

    For every full batch of test captions this encodes the captions, generates
    images from them, encodes the generated images, and encodes
    ``cfg.WRONG_CAPTION`` mismatching captions per sample. The three feature
    arrays are written with ``np.savez`` to
    ``cfg.R_PRECISION_DIR / cfg.R_PRECISION_FILE`` under the keys ``true_cnn``,
    ``true_rnn`` and ``wrong_rnn``. Captions beyond the last full batch are skipped.

    The function relies on module-level names that it does not create itself:
    ``rnn_encoder``, ``generator``, ``cnn_encoder``, ``t_real_caption``, ``t_z``
    and ``t_real_image`` must be defined in the default graph before it is called.
    """
    caption_ids = np.reshape(np.asarray(test_dataset.captions_ids), (-1, cfg.TEXT.WORDS_NUM))
    captions_ids_wrong = np.reshape(test_dataset.random_wrong_captions(), (-1, cfg.WRONG_CAPTION, cfg.TEXT.WORDS_NUM))

    n_caption_test = len(caption_ids)
    num_batches = n_caption_test // cfg.BATCH_SIZE

    true_cnn_features = np.zeros((num_batches, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)
    true_rnn_features = np.zeros((num_batches, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)
    wrong_rnn_features = np.zeros((num_batches, cfg.WRONG_CAPTION, cfg.BATCH_SIZE, cfg.TEXT.EMBEDDING_DIM), dtype=float)

    # The context manager closes the session (and frees its resources) on exit.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        # load the trained checkpoint
        checkpoint_dir = cfg.CHECKPOINT_DIR
        if checkpoint_dir is not None:
            loader = tf.train.Saver(var_list=tf.global_variables())
            # The checkpoint name lives on cfg; a bare CHECKPOINT_NAME is undefined.
            ckpt_path = os.path.join(cfg.CHECKPOINT_DIR, cfg.CHECKPOINT_NAME)
            loader.restore(sess, ckpt_path)
            print("Restored model parameters from {}".format(ckpt_path))
        else:
            print('no checkpoints find.')

        for i in range(num_batches):
            batch = slice(i * cfg.BATCH_SIZE, (i + 1) * cfg.BATCH_SIZE)
            test_cap = caption_ids[batch]

            z = np.random.normal(loc=0.0, scale=1.0, size=(cfg.BATCH_SIZE, cfg.GAN.Z_DIM)).astype(np.float32)

            rnn_features = sess.run(rnn_encoder.outputs, feed_dict={t_real_caption: test_cap})
            gen = sess.run(generator.outputs, feed_dict={t_real_caption: test_cap, t_z: z})
            cnn_features = sess.run(cnn_encoder.outputs, feed_dict={t_real_image: gen})

            true_cnn_features[i] = cnn_features
            true_rnn_features[i] = rnn_features

            # The batch of mismatching captions does not depend on the inner
            # loop variable, so slice it once per batch.
            wrong_cap = captions_ids_wrong[batch]
            for per_wrong_caption in range(cfg.WRONG_CAPTION):
                wrong_features = sess.run(rnn_encoder.outputs,
                                          feed_dict={t_real_caption: wrong_cap[:, per_wrong_caption]})
                wrong_rnn_features[i, per_wrong_caption] = wrong_features

    # if exists, remove the existing file first (best effort: a missing file is fine)
    output_path = os.path.join(cfg.R_PRECISION_DIR, cfg.R_PRECISION_FILE)
    try:
        os.remove(output_path)
    except OSError:
        pass
    np.savez(output_path, true_cnn=true_cnn_features, true_rnn=true_rnn_features,
             wrong_rnn=wrong_rnn_features)

In [None]:
def generate_inception_score_data():
    """Generate one image per test caption and save them for inception-score measurement.

    Test caption ids are reshaped to (images, captions per image, words). For
    every full batch of images and every caption slot, a batch of images is
    generated and each one is written to
    ``cfg.TEST.GENERATED_TEST_IMAGES/<filename>_<caption index>.png``. Images
    beyond the last full batch are skipped.

    The function relies on module-level names that it does not create itself:
    ``generator``, ``t_real_caption`` and ``t_z`` must be defined in the default
    graph before it is called.
    """
    caption_ids = np.reshape(np.asarray(test_dataset.captions_ids),
                             (-1, cfg.TEXT.CAPTIONS_PER_IMAGE, cfg.TEXT.WORDS_NUM))

    n_caption_test = len(caption_ids)
    num_batches = n_caption_test // cfg.BATCH_SIZE

    # The context manager closes the session (and frees its resources) on exit.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        checkpoint_dir = cfg.CHECKPOINT_DIR
        if checkpoint_dir is not None:
            loader = tf.train.Saver(var_list=tf.global_variables())
            ckpt_path = os.path.join(cfg.CHECKPOINT_DIR, cfg.CHECKPOINT_NAME)
            loader.restore(sess, ckpt_path)
            print("Restored model parameters from {}".format(ckpt_path))
        else:
            print('no checkpoints find.')

        for i in range(num_batches):
            batch = slice(i * cfg.BATCH_SIZE, (i + 1) * cfg.BATCH_SIZE)
            # File names do not depend on the caption slot, so slice them once per batch.
            test_directory = test_dataset.filenames[batch]

            for per_caption in range(cfg.TEXT.CAPTIONS_PER_IMAGE):
                test_cap = caption_ids[batch, per_caption]

                z = np.random.normal(loc=0.0, scale=1.0, size=(cfg.BATCH_SIZE, cfg.GAN.Z_DIM)).astype(np.float32)
                gen = sess.run(generator.outputs, feed_dict={t_real_caption: test_cap, t_z: z})

                for j in range(cfg.BATCH_SIZE):
                    # The part of the file name before the first '/' is its sub-folder.
                    # makedirs also creates a missing parent output folder, which
                    # os.mkdir would not.
                    class_dir = os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, test_directory[j].split('/')[0])
                    os.makedirs(class_dir, exist_ok=True)

                    scipy.misc.imsave(os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, test_directory[j] + '_{}.png'.format(per_caption)), gen[j])

In [None]:
# Compute and save the feature file used for the R-precision measurement.
generate_r_precision_data()

In [None]:
# Generate and save the images used for the inception-score measurement.
generate_inception_score_data()

## 4. Measure Inception score and R-precision of given test dataset

After setting the config file to 'eval_birds.yml' and running 'generate_inception_score_data()' and 'generate_r_precision_data()', the images synthesized from the given captions and the set of image and caption features should be saved inside the 'evaluation' folder, specifically in 'evaluation/generated_images/..' and as 'evaluation/r_precision.npz', respectively.

**Then, go to the 'evaluation' folder and run 'inception_score.ipynb' and 'r_precision.ipynb' to measure the inception score and the R-precision score.**