# Environment setting

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
import os
#from transformers import BertTokenizer, TFBertModel

import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
from PIL import Image
import random
import time
from pathlib import Path

import re
from IPython import display

import tensorflow_hub as hub
import tensorflow_text


MAX_SEQ_LENGTH = 20

In [2]:
import os
# Select which GPU this kernel may use.
# NOTE(review): these variables are set after `import tensorflow` in the first cell; they
# presumably still take effect because no GPU op has run yet -- confirm, or move this cell first.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
# Root directory of the competition files; 'dictionary', 'dataset' and the image folder are
# addressed relative to it. Export COMP3_DIR to run on another machine; the default is the
# original location. os.path.join(..., "") guarantees a trailing separator because the rest of
# the notebook builds paths by plain string concatenation.
COMP3_DIR = os.path.join(os.environ.get("COMP3_DIR", "/home/yuan65536/DL-hw/kaggle03/"), "")

In [4]:
# Load the vocabulary and the two lookup tables that map words <-> ids.
dictionary_path = COMP3_DIR + 'dictionary'
vocab = np.load(dictionary_path + '/vocab.npy')
print('there are {} vocabularies in total'.format(len(vocab)))

# Each .npy file is converted with dict(), i.e. it is expected to hold (key, value) pairs.
# Ids are looked up as strings (e.g. id2word_dict['1']), not as integers.
word2Id_dict = dict(np.load(dictionary_path + '/word2Id.npy'))
id2word_dict = dict(np.load(dictionary_path + '/id2Word.npy'))
print('Word to id mapping, for example: %s -> %s' % ('flower', word2Id_dict['flower']))
print('Id to word mapping, for example: %s -> %s' % ('1', id2word_dict['1']))
# <PAD> fills captions up to the fixed length; <RARE> stands in for out-of-vocabulary words.
print('Tokens: <PAD>: %s; <RARE>: %s' % (word2Id_dict['<PAD>'], word2Id_dict['<RARE>']))

there are 5427 vocabularies in total
Word to id mapping, for example: flower -> 1
Id to word mapping, for example: 1 -> flower
Tokens: <PAD>: 5427; <RARE>: 5428


## Preprocess Text

In [5]:
def sent2IdList(line, MAX_SEQ_LENGTH=20):
    """Convert a raw caption into a list of exactly ``MAX_SEQ_LENGTH`` vocabulary ids.

    Args:
        line: caption text.
        MAX_SEQ_LENGTH: length of the returned list.

    Returns:
        A list of ids taken from ``word2Id_dict``. Words missing from the dictionary map
        to the ``<RARE>`` id; short captions are right-padded with the ``<PAD>`` id and
        captions with more than ``MAX_SEQ_LENGTH`` words are truncated.
    """
    # Every punctuation character (this already covers '-' and '.') becomes a space.
    cleaned = re.sub('[%s]' % re.escape(string.punctuation), ' ', line.rstrip())

    # split() without arguments collapses whitespace runs of any length and never yields
    # empty tokens, so no separate "remove double spaces" pass is needed.
    tokens = cleaned.split()

    # Enforce the fixed length in both directions: cut long captions, pad short ones.
    tokens = tokens[:MAX_SEQ_LENGTH]
    tokens += ['<PAD>'] * (MAX_SEQ_LENGTH - len(tokens))

    rare_id = word2Id_dict['<RARE>']
    return [word2Id_dict.get(token, rare_id) for token in tokens]

text = "the flower shown has yellow anther red pistil and bright red petals."
print(text)
print(sent2IdList(text))

the flower shown has yellow anther red pistil and bright red petals.
['9', '1', '82', '5', '11', '70', '20', '31', '3', '29', '20', '2', '5427', '5427', '5427', '5427', '5427', '5427', '5427', '5427']


## Dataset

In [6]:
# Load the training table; it has a 'Captions' and an 'ImagePath' column (see df.head() below).
# NOTE(review): read_pickle executes arbitrary code from the file -- only load trusted pickles.
data_path = COMP3_DIR + 'dataset'
df = pd.read_pickle(data_path + '/text2ImgData.pkl')
num_training_sample = len(df)
# Alias of num_training_sample, used only for the message below.
n_images_train = num_training_sample
print('There are %d image in training data' % (n_images_train))

There are 7370 image in training data


In [7]:
df.head(5)


Unnamed: 0_level_0,Captions,ImagePath
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
6734,"[[9, 2, 17, 9, 1, 6, 14, 13, 18, 3, 41, 8, 11,...",./102flowers/image_06734.jpg
6736,"[[4, 1, 5, 12, 2, 3, 11, 31, 28, 68, 106, 132,...",./102flowers/image_06736.jpg
6737,"[[9, 2, 27, 4, 1, 6, 14, 7, 12, 19, 5427, 5427...",./102flowers/image_06737.jpg
6738,"[[9, 1, 5, 8, 54, 16, 38, 7, 12, 116, 325, 3, ...",./102flowers/image_06738.jpg
6739,"[[4, 12, 1, 5, 29, 11, 19, 7, 26, 70, 5427, 54...",./102flowers/image_06739.jpg


In [8]:
def id_to_string(df):
    """Decode the id-encoded captions of ``df`` into plain text, in place.

    Every entry of ``df['Captions']`` is a list of captions, each caption being a
    sequence of ids that are keys of ``id2word_dict``. After the call every entry is a
    list of strings in which each word is followed by one space and ``<PAD>`` tokens
    are dropped.

    Args:
        df: DataFrame with a 'Captions' column; the column is replaced on this object.

    Returns:
        None.
    """
    decoded_column = []
    for image_captions in df['Captions']:
        decoded_captions = []
        for cap in image_captions:
            words = (id2word_dict[id_] for id_ in cap)
            decoded_captions.append(''.join(word + ' ' for word in words if word != '<PAD>'))
        decoded_column.append(decoded_captions)

    # Assign the whole column at once. The previous per-row
    # `df['Captions'].iloc[i] = ...` was chained assignment, which may write to a
    # temporary copy (SettingWithCopyWarning; a no-op under pandas copy-on-write).
    # An explicit object Series keeps each cell a Python list.
    df['Captions'] = pd.Series(decoded_column, index=df.index, dtype=object)

In [9]:
df.head(5)

Unnamed: 0_level_0,Captions,ImagePath
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
6734,"[[9, 2, 17, 9, 1, 6, 14, 13, 18, 3, 41, 8, 11,...",./102flowers/image_06734.jpg
6736,"[[4, 1, 5, 12, 2, 3, 11, 31, 28, 68, 106, 132,...",./102flowers/image_06736.jpg
6737,"[[9, 2, 27, 4, 1, 6, 14, 7, 12, 19, 5427, 5427...",./102flowers/image_06737.jpg
6738,"[[9, 1, 5, 8, 54, 16, 38, 7, 12, 116, 325, 3, ...",./102flowers/image_06738.jpg
6739,"[[4, 12, 1, 5, 29, 11, 19, 7, 26, 70, 5427, 54...",./102flowers/image_06739.jpg


## Create Dataset by Dataset API

In [10]:
# in this competition, you have to generate image in size 64x64x3
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64
IMAGE_CHANNEL = 3

def training_data_generator(caption, image_path):
    """Load one training image and pair it with its caption.

    The file at ``image_path`` is decoded as a 3-channel image, converted to float32,
    resized to IMAGE_HEIGHT x IMAGE_WIDTH and finally mapped through ``x * 2 - 1``.

    Returns:
        (image, caption) -- note the order is swapped relative to the arguments.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_image(raw_bytes, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    # decode_image does not provide a static shape; tf.image.resize needs a known rank.
    as_float.set_shape([None, None, 3])
    resized = tf.image.resize(as_float, size=[IMAGE_HEIGHT, IMAGE_WIDTH])
    resized.set_shape([IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL])
    rescaled = resized * 2. -1.

    return rescaled, caption

def dataset_generator(filenames, batch_size, data_generator):
    """Build the shuffled, batched training ``tf.data.Dataset``.

    Args:
        filenames: path of the pickled DataFrame with 'Captions' and 'ImagePath' columns.
        batch_size: number of samples per batch; incomplete final batches are dropped.
        data_generator: map function called as ``data_generator(caption, image_path)``.

    Returns:
        A prefetching dataset of whatever ``data_generator`` returns, batched.
    """
    df = pd.read_pickle(filenames)
    id_to_string(df)  # decodes df['Captions'] from ids to text, in place

    # each image has 1 to 10 corresponding captions; one of them is drawn at random here.
    # NOTE: the draw happens once, when the dataset is built, not once per epoch.
    caption = np.asarray([random.choice(candidates) for candidates in df['Captions'].values])

    # Stored paths look like './102flowers/...'; drop the leading './' and anchor them at
    # COMP3_DIR. Built as a new array rather than written back row by row with
    # `df['ImagePath'].loc[i] = ...`, which is chained assignment and may not update df.
    image_path = np.asarray([COMP3_DIR + relative[2:] for relative in df['ImagePath'].values])
    assert caption.shape[0] == image_path.shape[0]

    dataset = tf.data.Dataset.from_tensor_slices((caption, image_path))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(len(caption)).batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset

In [11]:
# NOTE: the train steps size their noise with hparas['BATCH_SIZE'] (declared further down);
# this value has to stay equal to it.
BATCH_SIZE = 64
dataset = dataset_generator(data_path + '/text2ImgData.pkl', BATCH_SIZE, training_data_generator)

## Conditional GAN Model
## Text Encoder

直接import tensorflow提供的universal sentence encoder幫助我們將句子變成embedding
然而這邊有點矛盾：這個 encoder 生成的圖片沒有原本 RNN 架構的圖片漂亮，但是 universal sentence encoder (USE) 的分數卻比較高，應該有更好的改進方式。我們決定以 kaggle 的分數為主來選擇使用的 encoder；也有想過使用其他的 encoder，但是 USE 的架構最容易使用，就決定以這個 encoder 為主了。

In [12]:
class TextEncoder(tf.keras.Model):
    """
    Encode text (a caption) into a hidden representation using the
    Universal Sentence Encoder v4 module loaded from TF Hub.
    input: text, a batch of caption strings
    output: embedding returned by the hub module
    """
    def __init__(self):
        super().__init__()
        # Loaded once at construction time.
        # NOTE(review): presumably needs network access or a populated TF Hub cache -- confirm.
        self.text_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    def call(self, text):
        embedding = self.text_encoder(text)
        return embedding

## Define function

In [13]:
class conv(tf.keras.Model):
    """Bias-free 'same'-padded Conv2D followed by batch normalization (no activation)."""

    def __init__(self, filters, size, stride):
        super().__init__()
        self.conv = layers.Conv2D(
            filters,
            size,
            stride,
            padding="same",
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            use_bias=False,
        )
        self.bn = layers.BatchNormalization()

    def call(self, inputs, training=False):
        # `training` only affects the batch-normalization layer.
        features = self.conv(inputs)
        return self.bn(features, training=training)
'''class conv_relu(tf.keras.Model):
    def __init__(self, filters, size, stride):
        super(conv_relu, self).__init__()
        self.conv = layers.Conv2D(filters, size, stride, padding="same", kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), use_bias=False)
        self.bn =layers.BatchNormalization()
        self.ReLU = layers.ReLU()

    def call(self, inputs, training=False):
        x = self.bn(self.conv(inputs), training=training)
        x = self.ReLU(x)
        return x'''    


class conv_leaky_relu(tf.keras.Model):
    """Bias-free 'same'-padded Conv2D -> batch normalization -> LeakyReLU(0.2)."""

    def __init__(self, filters, size, stride):
        super().__init__()
        self.conv = layers.Conv2D(
            filters,
            size,
            stride,
            padding="same",
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            use_bias=False,
        )
        self.bn = layers.BatchNormalization()
        self.LeakyReLU = layers.LeakyReLU(0.2)

    def call(self, inputs, training=False):
        # `training` only affects the batch-normalization layer.
        normalized = self.bn(self.conv(inputs), training=training)
        return self.LeakyReLU(normalized)
    
def UpSampleBlock(filters_num):
    """Return a Sequential block: UpSampling2D -> 3x3 bias-free Conv2D -> BatchNormalization.

    No activation is included; callers apply their own non-linearity.
    """
    kernel_init = tf.keras.initializers.TruncatedNormal(mean=0.0, stddev=0.02)
    stages = [
        tf.keras.layers.UpSampling2D(),
        tf.keras.layers.Conv2D(
            filters=filters_num,
            kernel_size=3,
            strides=1,
            padding='SAME',
            use_bias=False,
            kernel_initializer=kernel_init,
        ),
        tf.keras.layers.BatchNormalization(),
    ]
    return tf.keras.Sequential(stages)

## Generator

1226將G的relu 全部都換成leaky_relu試試

In [14]:
class Generator(tf.keras.Model):
    """
    Conditional generator: turns a sentence embedding plus a noise vector into an image.

    Pipeline: concat(noise, text) -> dense to 4x4x1024 -> upsample to 8x8x512 with one
    residual conv stack -> three more upsample blocks (16x16x256, 32x32x128, 64x64x64)
    -> 3x3 conv + batch norm to 3 channels -> tanh.

    call() returns ``(logits, outputs)``: the pre-tanh activations and the tanh image.
    """
    def __init__(self, hparas):
        super(Generator, self).__init__()
        self.hparas = hparas
        self.t_dim = 128
        self.gf_dim = 128
        self.image_size = 64
        self.c_dim = 3

        # NOTE: text_fc is not used by call(); the sentence embedding is concatenated
        # with the noise as-is. The attribute is kept so the object layout stays the same
        # as in previously saved checkpoints.
        self.text_fc = tf.keras.layers.Dense(128, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
                                              use_bias=False)

        self.dense_block = tf.keras.Sequential([
                tf.keras.layers.Dense(128*8*4*4, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),use_bias=False),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Reshape((4, 4, 128*8))
            ])

        self.upsample1 = UpSampleBlock(filters_num=128*4)

        # Residual branch applied at 8x8 resolution.
        self.conv2_1 = conv_leaky_relu(128,1,1)
        self.conv2_2 = conv_leaky_relu(128,3,1)
        self.conv2_3 = conv(512,3,1)

        self.upsample2 = UpSampleBlock(128*2)
        self.upsample3 = UpSampleBlock(128)
        self.upsample4 = UpSampleBlock(64)
        self.conv_o = conv(3, 3, 1)

    def call(self, text, noise_z, training=False):
        """Generate images.

        Args:
            text: sentence embeddings, one row per sample.
            noise_z: noise vectors, one row per sample.
            training: batch-norm mode. Defaults to False so the model can be called for
                inference without the flag. It is forwarded explicitly to every sub-block
                instead of relying on Keras to propagate it implicitly.

        Returns:
            (logits, outputs): pre-tanh activations and the tanh-squashed image.
        """
        x_in = tf.concat([noise_z, text], axis=1)

        x_0 = self.dense_block(x_in, training=training)            # (B, 4, 4, 1024)

        x_2 = tf.nn.relu(self.upsample1(x_0, training=training))   # (B, 8, 8, 512)
        x = self.conv2_1(x_2, training=training)
        x = self.conv2_2(x, training=training)
        x = self.conv2_3(x, training=training)

        # Residual connection around the conv2_* stack.
        x_3 = tf.add_n([x_2, x])
        x_3_output = tf.nn.relu(x_3)

        x_4 = tf.nn.relu(self.upsample2(x_3_output, training=training))  # (B, 16, 16, 256)
        x_5 = tf.nn.relu(self.upsample3(x_4, training=training))         # (B, 32, 32, 128)
        x_6 = tf.nn.relu(self.upsample4(x_5, training=training))         # (B, 64, 64, 64)
        x_o = self.conv_o(x_6, training=training)                        # (B, 64, 64, 3)

        outputs = tf.nn.tanh(x_o)
        logits = x_o

        return logits, outputs

## Discriminator

In [15]:
class Discriminator(tf.keras.Model):
    """
    Text-conditioned discriminator.

    The image goes through four stride-2 convolutions and one residual conv stack; the
    sentence embedding is projected to 128 dims, tiled over a 4x4 grid and concatenated
    to the image features on the channel axis. Two further convolutions produce the score.

    call() returns ``(logits, output)`` where ``output = sigmoid(logits)``.
    """
    def __init__(self, hparas):
        super().__init__()
        self.hparas = hparas

        def small_normal():
            # A fresh initializer object for every layer.
            return tf.keras.initializers.TruncatedNormal(stddev=0.02)

        self.conv0 = layers.Conv2D(64, 4, 2, padding="same", kernel_initializer=small_normal(),
                                   use_bias=False)
        self.conv1 = conv_leaky_relu(128, 4, 2)
        self.conv2 = conv_leaky_relu(256, 4, 2)
        self.conv3 = conv_leaky_relu(512, 4, 2)

        # Residual branch.
        self.conv4_1 = conv_leaky_relu(128, 1, 1)
        self.conv4_2 = conv_leaky_relu(128, 3, 1)
        self.conv4_3 = conv_leaky_relu(512, 3, 1)

        self.text_fc = tf.keras.layers.Dense(128, kernel_initializer=small_normal(), use_bias=False)
        # Not applied in call() at the moment.
        self.leakyRelu1 = layers.LeakyReLU(0.1)
        self.conv5 = conv_leaky_relu(512, 3, 1)
        self.conv6 = layers.Conv2D(1, 4, 4, padding="same", kernel_initializer=small_normal(),
                                    use_bias=False)

    def call(self, input_image, input_text, training):
        features = tf.nn.leaky_relu(self.conv0(input_image, training=training))
        for stage in (self.conv1, self.conv2, self.conv3):
            features = stage(features, training=training)

        residual = features
        for stage in (self.conv4_1, self.conv4_2, self.conv4_3):
            residual = stage(residual, training=training)
        merged = tf.nn.leaky_relu(tf.add_n([residual, features]))

        # Broadcast the projected sentence embedding over the 4x4 feature map.
        text_code = self.text_fc(input_text, training=training)
        text_map = tf.tile(tf.reshape(text_code, (-1, 1, 1, 128)), [1, 4, 4, 1])

        joint = self.conv5(tf.concat([merged, text_map], axis=3), training=training)
        logits = self.conv6(joint, training=training)
        return logits, tf.nn.sigmoid(logits)

# Hyperparameter and model declaration

In [16]:
# Central hyperparameter table.
# NOTE(review): in the code visible in this notebook only BATCH_SIZE, Z_DIM, LR, N_EPOCH,
# N_SAMPLE, CHECKPOINTS_DIR, PRINT_FREQ and SHOW_IMAGES are read; the remaining keys
# (MAX_SEQ_LENGTH, EMBED_DIM, VOCAB_SIZE, RNN_HIDDEN_SIZE, DENSE_DIM, IMAGE_SIZE, LR_DECAY,
# BETA_1) look like leftovers from an earlier RNN-based setup -- confirm before relying on them.
hparas = {
    'MAX_SEQ_LENGTH': 20,                     # maximum sequence length
#     'EMBED_DIM': 256,                         # word embedding dimension
    'EMBED_DIM': 512,
    'VOCAB_SIZE': len(word2Id_dict),          # size of dictionary of captions
    'RNN_HIDDEN_SIZE': 128,                   # number of RNN neurons
    'Z_DIM': 512,                             # random noise z dimension
    'DENSE_DIM': 128,                         # number of neurons in dense layer
    'IMAGE_SIZE': [64, 64, 3],                # render image size
    'BATCH_SIZE': 64,
    'LR': 1e-4,
    'LR_DECAY': 0.5,
    'BETA_1': 0.5,
    'N_EPOCH': 1000,
    'N_SAMPLE': num_training_sample,          # size of training data
    'CHECKPOINTS_DIR': './checkpoints/demo_test',  # checkpoint path
    'PRINT_FREQ': 1,                          # printing frequency of loss
    'SHOW_IMAGES': 20
}

In [17]:
text_encoder = TextEncoder()
generator = Generator(hparas)
discriminator = Discriminator(hparas)

## Loss Function and Optimization

原本的loss只考慮(正確圖片、正確敘述)(錯誤圖片、正確敘述) 但是沒有考慮到(正確圖片、錯誤敘述)的情況，因此我們多加上一個loss，結果發現加上之後效果就有大幅提升



In [18]:
# This method returns a helper function to compute cross entropy loss
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [19]:
def discriminator_loss(real_logits, fake_logits, wrong_logits):
    """Discriminator loss over three kinds of (image, caption) pairs.

    Args:
        real_logits: scores for (real image, matching caption); target 1.
        fake_logits: scores for (generated image, matching caption); target 0.
        wrong_logits: scores for (real image, mismatched caption); target 0.

    Returns:
        ``real + 0.5 * (fake + wrong)``, each term a binary cross-entropy on logits.
    """
    loss_real = cross_entropy(tf.ones_like(real_logits), real_logits)
    loss_fake = cross_entropy(tf.zeros_like(fake_logits), fake_logits)
    loss_wrong = cross_entropy(tf.zeros_like(wrong_logits), wrong_logits)

    # The two negative cases share the weight of the single positive case.
    return loss_real + 0.5*(loss_fake + loss_wrong)

def generator_loss(fake_output):
    """Generator loss: the discriminator's logits on generated images are pushed toward 1."""
    target_real = tf.ones_like(fake_output)
    return cross_entropy(target_real, fake_output)

In [20]:
# we use seperated optimizers for training generator and discriminator
# Separate Adam optimizers so generator and discriminator keep independent moment estimates.
# NOTE(review): hparas['BETA_1'] (0.5) and hparas['LR_DECAY'] are declared but not passed
# here, so Adam runs with its default beta_1 and a constant learning rate -- confirm intent.
generator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])
discriminator_optimizer = tf.keras.optimizers.Adam(hparas['LR'])

# Check points

1226 改變一下noise的分布 試試看
前幾個epoch看起來好像整陀爛掉，等全部train完再試試(X)#沒用

In [21]:
# one benefit of tf.train.Checkpoint() API is we can save everything seperately
# Track both optimizers and all three models in one checkpoint object; files are written as
# <CHECKPOINTS_DIR>/ckpt-<n> by checkpoint.save() in the training loop.
checkpoint_dir = hparas['CHECKPOINTS_DIR']
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 text_encoder=text_encoder,
                                 generator=generator,
                                 discriminator=discriminator)

In [22]:
print(checkpoint)

<tensorflow.python.training.tracking.util.Checkpoint object at 0x7f9bd3e847b8>


## train step

In [23]:
@tf.function
def train_step_D(real_image, caption):
    """Run one discriminator update.

    Args:
        real_image: batch of real images.
        caption: batch of caption strings matching ``real_image``.

    Returns:
        (g_loss, d_loss). Only the discriminator is updated; g_loss is reported only.
    """
    # Size everything from the incoming batch instead of hparas['BATCH_SIZE'], so the
    # step does not break if the dataset batch size and the hyperparameter disagree.
    batch_size = tf.shape(real_image)[0]
    noise = tf.random.normal(shape=[batch_size, hparas['Z_DIM']])

    # Mismatched captions: rotate the batch by one so each image gets its neighbour's text.
    wrong_caption = tf.concat([caption[1:], tf.expand_dims(caption[0], 0)], axis=0)
    with tf.GradientTape() as disc_tape:
        text_embed = text_encoder(caption)
        wrong_text_embed = text_encoder(wrong_caption)
        _, fake_image = generator(text_embed, noise, True)
        real_logits, real_output = discriminator(real_image, text_embed, True)
        fake_logits, fake_output = discriminator(fake_image, text_embed, True)
        wrong_logits, wrong_output = discriminator(real_image, wrong_text_embed, training=True)

        # Gradient penalty on random interpolations between real and generated images.
        epsilon = tf.random.uniform([batch_size, 1, 1, 1], 0., 1.)
        mix_image = epsilon*real_image + (1. - epsilon)*fake_image

        mix_logits, mix_output = discriminator(mix_image, text_embed, training=True)

        mix_grad = tf.gradients(mix_logits, mix_image)[0]
        # Per-sample L2 norm over all of height, width AND channels; the earlier
        # axis=[1, 2] produced a separate norm per colour channel. The small constant
        # keeps the derivative of sqrt finite when the gradient is exactly zero.
        grad_norm = tf.sqrt(tf.reduce_sum(tf.square(mix_grad), axis=[1, 2, 3]) + 1e-12)
        grad = tf.square(grad_norm - 1.)

        g_loss = generator_loss(fake_logits)
        d_loss = discriminator_loss(real_logits, fake_logits, wrong_logits) + 10.*tf.reduce_mean(grad)

    grad_d = disc_tape.gradient(d_loss, discriminator.trainable_variables)

    discriminator_optimizer.apply_gradients(zip(grad_d, discriminator.trainable_variables))

    return g_loss, d_loss

In [24]:
@tf.function
def train_step_G(real_image, caption):
    """Run one generator update.

    Args:
        real_image: batch of real images (used only for the reported discriminator loss).
        caption: batch of caption strings matching ``real_image``.

    Returns:
        (g_loss, d_loss). Only the generator is updated; d_loss is reported only.
    """
    # Size everything from the incoming batch instead of hparas['BATCH_SIZE'], so the
    # step does not break if the dataset batch size and the hyperparameter disagree.
    batch_size = tf.shape(real_image)[0]
    noise = tf.random.normal(shape=[batch_size, hparas['Z_DIM']])

    # Mismatched captions: rotate the batch by one so each image gets its neighbour's text.
    wrong_caption = tf.concat([caption[1:], tf.expand_dims(caption[0], 0)], axis=0)

    with tf.GradientTape() as gen_tape:
        text_embed = text_encoder(caption)
        wrong_text_embed = text_encoder(wrong_caption)
        _, fake_image = generator(text_embed, noise, True)
        real_logits, real_output = discriminator(real_image, text_embed, True)
        fake_logits, fake_output = discriminator(fake_image, text_embed, True)
        wrong_logits, wrong_output = discriminator(real_image, wrong_text_embed, training=True)

        # Gradient penalty term, computed here only so d_loss is reported the same way
        # as in the discriminator step.
        epsilon = tf.random.uniform([batch_size, 1, 1, 1], 0., 1.)
        mix_image = epsilon*real_image + (1. - epsilon)*fake_image

        mix_logits, mix_output = discriminator(mix_image, text_embed, training=True)

        mix_grad = tf.gradients(mix_logits, mix_image)[0]
        # Per-sample L2 norm over height, width and channels (not per channel); the
        # small constant keeps the derivative of sqrt finite at zero.
        grad_norm = tf.sqrt(tf.reduce_sum(tf.square(mix_grad), axis=[1, 2, 3]) + 1e-12)
        grad = tf.square(grad_norm - 1.)

        g_loss = generator_loss(fake_logits)
        d_loss = discriminator_loss(real_logits, fake_logits, wrong_logits) + 10.*tf.reduce_mean(grad)

    grad_g = gen_tape.gradient(g_loss, generator.trainable_variables)

    generator_optimizer.apply_gradients(zip(grad_g, generator.trainable_variables))

    return g_loss, d_loss

## test step

In [86]:
@tf.function
def test_step(caption, noise):
    """Generate images for a batch of captions in inference mode.

    Args:
        caption: batch of caption strings.
        noise: noise vectors, one row per caption.

    Returns:
        The generator's tanh output for the batch.
    """
    text_embed = text_encoder(caption)
    # Pass training=False explicitly: Generator.call declares `training` without a
    # default, and batch normalization must use its moving statistics here.
    _, fake_image = generator(text_embed, noise, False)
    return fake_image

## Visualization

In [26]:
def merge(images, size):
    """Tile a batch of equally sized RGB images into one grid image.

    Args:
        images: batch shaped (count, height, width, 3).
        size: (rows, cols) of the grid; images are placed row by row.

    Returns:
        A float array of shape (height * rows, width * cols, 3); unused cells stay zero.
    """
    tile_h, tile_w = images.shape[1], images.shape[2]
    rows, cols = size[0], size[1]
    canvas = np.zeros((tile_h * rows, tile_w * cols, 3))
    for position, tile in enumerate(images):
        row, col = divmod(position, cols)
        top, left = row * tile_h, col * tile_w
        canvas[top:top + tile_h, left:left + tile_w, :] = tile
    return canvas

def imsave(images, size, path):
    """Tile ``images`` into a ``size`` grid and write the result to ``path``."""
    grid = merge(images, size)
    # getting the pixel values between [0, 1] to save it
    rescaled = grid*0.5 + 0.5
    return plt.imsave(path, rescaled)

def save_images(images, size, image_path):
    """Save ``images`` as one grid picture at ``image_path``; thin alias of ``imsave``."""
    result = imsave(images, size, image_path)
    return result

In [27]:
def sample_generator(caption, batch_size):
    """Wrap id-encoded captions in a batched ``tf.data.Dataset``.

    Args:
        caption: array-like of captions whose entries can be cast to integers.
        batch_size: number of captions per batch.

    Returns:
        A dataset yielding integer caption batches.
    """
    caption = np.asarray(caption)
    # `np.int` was only an alias of the builtin int and no longer exists in current
    # NumPy releases; the builtin gives the same dtype.
    caption = caption.astype(int)
    dataset = tf.data.Dataset.from_tensor_slices(caption)
    dataset = dataset.batch(batch_size)
    return dataset

In [28]:
# Fixed inputs for the per-epoch preview grid: one constant noise batch and eight captions.
# ni is the side length of the (ni x ni) image grid.
ni = int(np.ceil(np.sqrt(hparas['BATCH_SIZE'])))
sample_size = hparas['BATCH_SIZE']
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
# Each of the 8 captions is repeated int(sample_size/ni) times, so the list only has
# sample_size entries when ni == 8 (true for BATCH_SIZE = 64).
sample_sentence = ["the flower shown has yellow anther red pistil and bright red petals."] * int(sample_size/ni) + \
                  ["this flower has petals that are yellow, white and purple and has dark lines"] * int(sample_size/ni) + \
                  ["the petals on this flower are white with a yellow center"] * int(sample_size/ni) + \
                  ["this flower has a lot of small round pink petals."] * int(sample_size/ni) + \
                  ["this flower is orange in color, and has petals that are ruffled and rounded."] * int(sample_size/ni) + \
                  ["the flower has yellow petals and the center of it is brown."] * int(sample_size/ni) + \
                  ["this flower has petals that are blue and white."] * int(sample_size/ni) +\
                  ["these white flowers have petals that start off white in color and end in a white towards the tips."] * int(sample_size/ni)

# From here on sample_sentence is a batched tf.data.Dataset, no longer a Python list.
sample_sentence = tf.data.Dataset.from_tensor_slices(sample_sentence).batch(hparas['BATCH_SIZE'])

## Training

和GAN的lab一樣，每訓練5次discriminator才更新1次generator（D:G = 5:1，對應下方程式碼中的 `ratio = 6`）
由於embedding改為使用universal sentence encoder、沒有用到RNN，所以傳進train的input就沒有hidden state；改為在training時直接將sentence轉為embedding，並且跟noise concat在一起，之後丟到Generator裡面做training

In [29]:
# ratio of training step D:G = 5:1
ratio = 6

def train(dataset, epochs):
    """Alternate discriminator and generator updates for ``epochs`` epochs.

    Out of every ``ratio`` consecutive batches the last one updates the generator and
    the others update the discriminator; the counter carries over between epochs.

    Side effects: prints the mean losses per epoch, saves a checkpoint every 5 epochs,
    writes a sample grid to ./samples/demo_test/ every PRINT_FREQ epochs and displays it
    every SHOW_IMAGES epochs.

    Args:
        dataset: dataset yielding (image, caption) batches.
        epochs: number of epochs to run.
    """
    steps_per_epoch = int(hparas['N_SAMPLE'] / hparas['BATCH_SIZE'])
    turn = 0

    # plt.imsave does not create missing directories.
    os.makedirs('./samples/demo_test', exist_ok=True)

    # Honour the `epochs` argument (it used to be ignored in favour of hparas['N_EPOCH']).
    for epoch in range(epochs):
        g_total_loss = 0
        d_total_loss = 0
        start = time.time()

        for image, caption in dataset:
            if (turn + 1) == ratio:
                g_loss, d_loss = train_step_G(image, caption)
            else:
                g_loss, d_loss = train_step_D(image, caption)
            turn = (turn + 1) % ratio
            g_total_loss += g_loss
            d_total_loss += d_loss

        print("Epoch {}, gen_loss: {:.4f}, disc_loss: {:.4f}".format(epoch+1,
                                                                     g_total_loss/steps_per_epoch,
                                                                     d_total_loss/steps_per_epoch))
        print('Time for epoch {} is {:.4f} sec'.format(epoch+1, time.time()-start))

        # save the model
        if (epoch + 1) % 5 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)

        # File names use the 0-based epoch index, while the printed messages are 1-based.
        sample_path = './samples/demo_test/train_{:02d}.jpg'.format(epoch)

        # visualization
        if (epoch + 1) % hparas['PRINT_FREQ'] == 0:
            # sample_sentence holds a single batch, so the last result is the only one.
            for caption in sample_sentence:
                fake_image = test_step(caption, sample_seed)
            save_images(fake_image, [ni, ni], sample_path)

        # show the generated images (the file exists only if it was saved this epoch,
        # i.e. SHOW_IMAGES should be a multiple of PRINT_FREQ)
        if (epoch + 1) % hparas['SHOW_IMAGES'] == 0:
            img = Image.open(sample_path)
            display.display(img)

In [None]:
train(dataset, hparas['N_EPOCH'])

## Testing dataset 

In [30]:
def id_to_string_for_testdata(ids):
    """Decode id-encoded test captions into text.

    Args:
        ids: indexable collection of captions, each a sequence of keys of ``id2word_dict``.

    Returns:
        A NumPy array with one string per caption; every word is followed by a single
        space and ``<PAD>`` tokens are skipped.
    """
    sentences = []
    for position in range(len(ids)):
        words = [id2word_dict[token] for token in ids[position]]
        sentences.append(''.join(word + ' ' for word in words if word != '<PAD>'))
    return np.asarray(sentences)

def testing_dataset_generator(batch_size):
    """Build the test dataset of (caption text, sample ID) pairs.

    Reads the notebook-level ``captions`` and ``data`` variables, which therefore have
    to be defined before this function is called. The dataset repeats indefinitely, so
    the consumer is responsible for stopping.
    """
    caption = id_to_string_for_testdata(captions)
    index = np.asarray(data['ID'].values)

    pairs = tf.data.Dataset.from_tensor_slices((caption, index))
    return pairs.repeat().batch(batch_size)

In [31]:
# Test set: `data` and `captions` are read as globals by testing_dataset_generator.
data = pd.read_pickle(data_path +'/testData.pkl')
captions = data['Captions'].values
NUM_TEST = len(captions)
# Number of complete batches; inference() runs one batch more than this so a partial
# remainder is covered as well.
EPOCH_TEST = int(NUM_TEST / hparas['BATCH_SIZE'])


In [32]:
testing_dataset = testing_dataset_generator(hparas['BATCH_SIZE'])

## Inference

In [94]:
def inference(dataset):
    """Generate and save one image per test caption.

    Runs ``EPOCH_TEST + 1`` batches of the (endlessly repeating) ``dataset`` so that a
    trailing partial batch is covered too; images are written to
    ./inference/demo_test/inference_<ID>.jpg and each caption is printed.

    Args:
        dataset: dataset yielding (captions, ids) batches of hparas['BATCH_SIZE'] items.
    """
    # plt.imsave does not create missing directories.
    os.makedirs('./inference/demo_test', exist_ok=True)

    sample_size = hparas['BATCH_SIZE']
    # The same noise batch is reused for every batch of captions.
    sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)

    step = 0
    start = time.time()
    for captions, idx in dataset:
        print(captions.shape)
        if step > EPOCH_TEST:
            break

        fake_image = test_step(captions, sample_seed)
        step += 1
        for i in range(hparas['BATCH_SIZE']):
            # Map the generator output from [-1, 1] back to [0, 1] before saving.
            plt.imsave('./inference/demo_test/inference_{:04d}.jpg'.format(idx[i]), fake_image[i].numpy()*0.5 + 0.5)
            print('captions: ',captions[i])

    print('Time for inference is {:.4f} sec'.format(time.time()-start))

In [99]:

# Generate five variations of one caption by pairing it with five different noise vectors.
s1 = tf.convert_to_tensor(
    ['this flower has a lot of tall orange petals and a lot of brown anthers '] * 5)
print(s1.shape)

# NOTE: this rebinds the notebook-level sample_size / sample_seed that the training loop
# also reads; re-run the sample setup cell before training again.
sample_size =5
sample_seed = np.random.normal(loc=0.0, scale=1.0, size=(sample_size, hparas['Z_DIM'])).astype(np.float32)
print(sample_seed.shape)

# plt.imsave does not create missing directories.
os.makedirs('./inference/demo_final/temp5', exist_ok=True)

# One forward pass yields all five images; it used to be repeated inside the loop with
# identical inputs.
fake_image = test_step(s1, sample_seed)
for i in range(5):
    plt.imsave('./inference/demo_final/temp5/inference_{:04d}.jpg'.format(i), fake_image[i].numpy()*0.5 + 0.5)


(5,)
(5, 512)


In [58]:
print(checkpoint_dir)

./checkpoints/demo_test


In [59]:
# Restore a specific saved step. The returned load-status object is displayed as the cell
# output. NOTE(review): the step number is hard-coded; this cell must run before inference.
checkpoint.restore(checkpoint_dir + '/ckpt-172')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9bd46fbf98>

In [95]:
inference(testing_dataset)

(64,)
captions:  tf.Tensor(b'this white and purple flower has fragile petals and soft stamens ', shape=(), dtype=string)
captions:  tf.Tensor(b'this flower has four large wide pink petals with white centers and vein like markings ', shape=(), dtype=string)
captions:  tf.Tensor(b'a flower with broad white and pink ribbed petals and yellow stamen ', shape=(), dtype=string)
captions:  tf.Tensor(b'one prominet pistil with alarger stigam and many stamens with anthers ', shape=(), dtype=string)
captions:  tf.Tensor(b'leaves are green in color petals are light pink in color ', shape=(), dtype=string)
captions:  tf.Tensor(b'this flower is bright pink with overlapping petals and a lime green pistil ', shape=(), dtype=string)
captions:  tf.Tensor(b'this flower is white and yellow in color with petals that are multi colored ', shape=(), dtype=string)
captions:  tf.Tensor(b'this flower has 4 leaves three are purple and yellow with lines and one is solid purple ', shape=(), dtype=string)
captions: 

# combine picture

In [None]:
import cv2

# Put five generated images side by side in a single picture.
# COMP3_DIR + 'inference/demo_final/temp1/' is the folder the absolute paths pointed to.
combine_dir = COMP3_DIR + 'inference/demo_final/temp1/'
image_paths = [combine_dir + 'inference_{:04d}.jpg'.format(i) for i in range(5)]
images = [cv2.imread(path) for path in image_paths]

# cv2.imread returns None instead of raising when a file cannot be read; fail early with
# the offending paths rather than inside hconcat.
unreadable = [path for path, image in zip(image_paths, images) if image is None]
if unreadable:
    raise FileNotFoundError('cannot read image(s): {}'.format(', '.join(unreadable)))

h_img = cv2.hconcat(images)

# NOTE(review): cv2.imshow opens a native window and blocks until a key is pressed; it
# needs a display, so it presumably fails on a headless server -- confirm. The window
# title says 'Vertical' although the images are concatenated horizontally.
cv2.imshow('Vertical', h_img)
cv2.waitKey(0)
cv2.destroyAllWindows()