### References
- https://www.tensorflow.org/tutorials/text/image_captioning
- https://www.tensorflow.org/tutorials/text/transformer

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
tf.enable_eager_execution()

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import nltk
from utils import *

In [None]:
# Load the pre-processed training annotations and pull out the columns used
# below ('caption', 'image_id', 'file_name').
# NOTE(review): load_pickle comes from the project-local `utils` module;
# presumably it returns a pandas DataFrame -- confirm.
annotations = load_pickle('data/train/train.annotations.pkl')

all_captions = annotations['caption']
image_id = annotations['image_id']
all_img_name_vector = annotations['file_name']

# Shuffle captions and image paths together (fixed seed) so every caption
# stays aligned with its image.
train_captions, img_name_vector = shuffle(all_captions, all_img_name_vector, random_state=1)

In [None]:
# Alternative data source: build the caption / image-path lists directly from
# the COCO 2014 caption annotation JSON file.
annotation_file = 'dataset/annotations/captions_train2014.json'

with open(annotation_file,'r') as f:
    annotations = json.load(f)

# Parallel lists: all_captions[i] describes the image at all_img_name_vector[i].
all_captions = []
all_img_name_vector = []

for annot in annotations['annotations']:
    # Wrap every caption in explicit start/end markers.
    caption = '<start> ' + annot['caption'] + ' <end>'
    image_id = annot['image_id']
    # File names embed the image id zero-padded to 12 digits.
    full_coco_image_path = 'dataset/train2014/' + 'COCO_train2014_' + '%012d.jpg' % (image_id)

    all_img_name_vector.append(full_coco_image_path)
    all_captions.append(caption)

# Shuffle both lists together (fixed seed) so captions stay aligned with images.
train_captions, img_name_vector = shuffle(all_captions,
                                         all_img_name_vector,
                                         random_state=1)

# Optionally, limit the size of the training set for faster training, 30,000, original = 414,113
# num_examples = 100000
# train_captions = train_captions[:num_examples]
# img_name_vector = img_name_vector[:num_examples]

In [None]:
# Display the sizes of the shuffled caption list, the shuffled image-path list
# and the full caption list.
len(train_captions), len(img_name_vector), len(all_captions)

In [None]:
def load_image(image_path):
    """Read a JPEG file and prepare it for InceptionV3.

    Args:
      image_path: path of the JPEG file to read.

    Returns:
      (img, image_path): the image decoded to 3 channels, resized to 299x299
      and passed through InceptionV3's preprocess_input, plus the path that
      was given (handed back so callers can pair features with files).
    """
    raw_bytes = tf.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize_images(decoded, (299,299))
    return tf.keras.applications.inception_v3.preprocess_input(resized), image_path

### Initialize InceptionV3 and load the pretrained Imagenet weights

In [None]:
# InceptionV3 without its classification head, initialised with ImageNet weights.
image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')

new_input = image_model.input
# The output of the last remaining layer is used as the image feature map.
hidden_layer = image_model.layers[-1].output # tensor

# Feature extractor: same input as InceptionV3, last-layer feature map as output.
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

### Caching the features extracted from InceptionV3

In [None]:
# De-duplicate the image paths so that each image is encoded only once.
encode_train = sorted(set(img_name_vector))

# feel free to change the batch_size according to your system configuration
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train).map(load_image).batch(16)

for img, path in tqdm(image_dataset):
    batch_features = image_features_extract_model(img)
    # Flatten the two spatial axes into one: (batch, locations, channels).
    batch_features = tf.reshape(batch_features,
                                  (batch_features.shape[0], -1, batch_features.shape[3]))

    for bf, p in zip(batch_features, path):
        path_of_feature = p.numpy().decode("utf-8")
        # np.save appends ".npy" to the file name, so the features of
        # "<image>.jpg" are written to "<image>.jpg.npy" (map_func reads them back).
        np.save(path_of_feature, bf.numpy())

### Preprocess and tokenize the captions

In [None]:
def calc_max_length(tensor):
    """Return the length of the longest sequence in `tensor`.

    Raises:
      ValueError: if `tensor` contains no sequences at all.
    """
    return max(map(len, tensor))

In [None]:
# Build the caption vocabulary.
# Only the top_k most frequent words are kept when texts are converted to
# sequences; every other word is mapped to the "<unk>" token.
# The filter string deliberately omits '<' and '>' so that markers such as
# "<start>" and "<end>" survive tokenization.
top_k = 15000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                 oov_token="<unk>",
                                                 filters='!"#$&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)
train_seqs = tokenizer.texts_to_sequences(train_captions)

# Reserve id 0 for padding in BOTH lookup tables. Without the index_word
# entry, an id -> word lookup of 0 (e.g. when decoding predictions) raises
# KeyError.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [None]:
# Replace the tokenizer with a previously saved one loaded from disk.
# NOTE(review): load_pickle is a project helper; if it unpickles the file,
# only load tokenizer files from a trusted source.
tokenizer = load_pickle('data/word/word_tokenizer.pkl')

In [None]:
# Display the number of entries in the tokenizer's id -> word table.
len(tokenizer.index_word)

In [None]:
# Convert every training caption into a list of integer word ids
# using the tokenizer currently in scope.
train_seqs = tokenizer.texts_to_sequences(train_captions)

In [None]:
# Pad every caption at the end ('post') so all captions have the same length.
# No maxlen is passed, so pad_sequences pads to the longest caption.
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

# Length of the longest tokenized caption; evaluate() uses it as the
# upper bound on the number of decoding steps.
max_length = calc_max_length(train_seqs) # 48

In [None]:
# Display the tokenizer's id -> word table.
tokenizer.index_word

### save tokenizer as pickle

In [None]:
# `pickle` is imported explicitly here: the notebook's import cell does not
# import it, so this cell would otherwise depend on `from utils import *`
# happening to re-export the module.
import pickle

# Persist the tokenizer so later runs can reuse the exact same vocabulary.
path = 'data/word/word_tokenizer_2.pkl'
with open(path, 'wb') as f:
    pickle.dump(tokenizer, f)
    print('Saved %s..' %path)

### Split the data into training and testing

In [None]:
# An 80-20 train/validation split is available but currently disabled:
# img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector,
#                                                                    cap_vector,
#                                                                    test_size=0.2,
#                                                                    random_state=0)

# np.shape(img_name_train), np.shape(cap_train), np.shape(img_name_val), np.shape(cap_val)

# The whole shuffled set is used for training (no hold-out made here).
img_name_train = img_name_vector
cap_train = cap_vector

In [None]:
# feel free to change these parameters according to your system's configuration

BATCH_SIZE = 64
BUFFER_SIZE = 1000 
embedding_dim = 256 # d_model
units = 512
# Counts every entry of word_index, not just the top_k words kept by the tokenizer.
vocab_size = len(tokenizer.word_index) # 9023
# shape of the vector extracted from InceptionV3 is (64, 2048)
# these two variables represent that
features_shape = 2048
attention_features_shape = 64

In [None]:
# Display the vocabulary size computed from the tokenizer.
vocab_size

In [None]:
def map_func(img_name, cap):
    """Load the cached feature array that belongs to one image.

    Args:
      img_name: UTF-8 encoded bytes holding the image path; the features are
        read from that path with '.npy' appended.
      cap: the caption paired with the image; returned untouched.

    Returns:
      (img_tensor, cap)
    """
    feature_file = '' + img_name.decode('utf-8') + '.npy'
    return np.load(feature_file), cap

In [None]:
# One dataset element per (image path, padded caption) pair.
dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train)) # 24000

# using map to load the numpy files in parallel
# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have
# https://www.tensorflow.org/api_docs/python/tf/py_func
# map_func runs as plain Python through tf.py_func and is declared to return
# float32 image features and an int32 caption.
dataset = dataset.map(lambda item1, item2: tf.py_func(
            map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)

# shuffling and batching
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
# Prepare one batch ahead while the current one is being consumed.
dataset = dataset.prefetch(1)

### Positional encoding

In [None]:
def get_angles(pos, i, d_model):
    """Return pos / 10000^(2*(i//2) / d_model), the positional-encoding angles.

    Args:
      pos: position index (scalar or array).
      i: embedding-dimension index (scalar or array); indices 2k and 2k+1
        share the same rate because of the integer division by 2.
      d_model: embedding width.
    """
    exponent = (2*(i//2))/np.float32(d_model)
    return pos*(1/np.power(10000, exponent))

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
      position: number of positions to encode (rows of the table).
      d_model: embedding width (columns of the table).

    Returns:
      float32 tensor of shape (1, position, d_model): sine on even columns,
      cosine on odd columns.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    table = get_angles(positions, dims, d_model)

    # even columns (2i) -> sin, odd columns (2i+1) -> cos, written in place
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # leading axis of size 1 lets the table broadcast over the batch
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

### Masking

In [None]:
def create_padding_mask(seq):
    """Mark the entries of `seq` that are equal to 0 (the padding id).

    Returns:
      float32 mask holding 1.0 where seq == 0 and 0.0 elsewhere, with two
      size-1 axes inserted after the first axis so it can be added to the
      attention logits; for a (batch_size, seq_len) input the result is
      (batch_size, 1, 1, seq_len).
    """
    is_pad = tf.math.equal(seq, 0)
    mask = tf.cast(is_pad, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [None]:
def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1.0 strictly above the diagonal.

    Row i is 0.0 for columns <= i and 1.0 for columns > i, i.e. it flags the
    future positions that position i must not attend to.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle

### Scaled dot product attention

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q k^T / sqrt(depth) + mask * -1e9) v.

    q, k and v must share their leading dimensions, and k and v must have
    the same sequence length (seq_len_k == seq_len_v).

    Args:
      q: query, shape (..., seq_len_q, depth)
      k: key, shape (..., seq_len_k, depth)
      v: value, shape (..., seq_len_v, depth_v)
      mask: float tensor broadcastable to (..., seq_len_q, seq_len_k) holding
        1.0 at positions to suppress, or None to apply no masking.

    Returns:
      (output, attention_weights) with shapes (..., seq_len_q, depth_v) and
      (..., seq_len_q, seq_len_k).
    """
    # Scale by sqrt(depth of the keys).
    depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(depth)

    # Masked positions get a very large negative logit, so softmax drives
    # their weight to (almost) zero.
    if mask is not None:
        logits = logits + (mask * -1e9)

    # Normalise over the key axis so each query's weights sum to 1.
    attention_weights = tf.nn.softmax(logits, axis=-1)

    return tf.matmul(attention_weights, v), attention_weights

### Multi-head attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on scaled_dot_product_attention.

    d_model must be divisible by num_heads; each head works on
    d_model // num_heads features.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        # features per head, e.g. 256 / 8 = 32
        self.depth = d_model // self.num_heads

        # linear projections for queries, keys and values
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # output projection applied to the re-joined heads
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, d_model) into
        (batch_size, num_heads, seq_len, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from q over (k, v).

        Note the argument order: value, key, query, mask.

        Returns:
          output: (batch_size, seq_len_q, d_model), after the output projection
          attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
          concat_attention: (batch_size, seq_len_q, d_model), the re-joined
            heads before the output projection
        """
        batch_size = tf.shape(q)[0]

        # project, then split into heads: (batch_size, num_heads, seq_len, depth)
        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        per_head_attention, attention_weights = scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask)

        # back to (batch_size, seq_len_q, num_heads, depth), then merge the heads
        merged = tf.transpose(per_head_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(merged, (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)

        return output, attention_weights, concat_attention

### Point wise feed forward network

In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer feed-forward block: Dense(dff, relu) -> Dense(d_model)."""
    hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([hidden, projection])

### Encoder and Decoder

In [None]:
class CNN_Encoder(tf.keras.Model):
    """Image-feature encoder: one dense layer followed by ReLU.

    The features themselves are assumed to be extracted beforehand; this
    model only projects their last axis to `embedding_dim`.
    """

    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Project `x` to embedding_dim and apply ReLU."""
        return tf.nn.relu(self.fc(x))

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder
    output, and a feed-forward network. Each sub-layer is followed by
    dropout, a residual connection and layer normalisation."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)  # self-attention
        self.mha2 = MultiHeadAttention(d_model, num_heads)  # encoder attention

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Run the block.

        Args:
          x: decoder input, (batch_size, target_seq_len, d_model)
          enc_output: encoder output, (batch_size, input_seq_len, d_model)
          training: enables dropout when True
          look_ahead_mask: mask for the self-attention sub-layer
          padding_mask: mask for the encoder-attention sub-layer (may be None)

        Returns:
          (out3, attn_weights_block1, attn_weights_block2) where out3 has
          shape (batch_size, target_seq_len, d_model).
        """
        # 1) masked self-attention over the target sequence
        self_attn, attn_weights_block1, _ = self.mha1(x, x, x, look_ahead_mask)
        out1 = self.layernorm1(self.dropout1(self_attn, training=training) + x)

        # 2) attention over the encoder output, queried by out1
        cross_attn, attn_weights_block2, _ = self.mha2(enc_output, enc_output, out1, padding_mask)
        out2 = self.layernorm2(self.dropout2(cross_attn, training=training) + out1)

        # 3) position-wise feed-forward network
        ffn_output = self.dropout3(self.ffn(out2), training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2
        

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Stack of `num_layers` DecoderLayer blocks on top of a token embedding
    with positional encoding."""

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # One extra embedding row beyond target_vocab_size, so that id
        # `target_vocab_size` itself is a valid input.
        self.embedding = tf.keras.layers.Embedding(target_vocab_size+1, d_model)
        # The positional table has target_vocab_size rows; only the first
        # seq_len rows are used in call().
        self.pos_encoding = positional_encoding(target_vocab_size, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Decode token ids `x` against `enc_output`.

        Returns:
          (x, attention_weights): x has shape
          (batch_size, target_seq_len, d_model); attention_weights maps
          'decoder_layer{N}_block1' / 'decoder_layer{N}_block2' (N from 1)
          to the attention weights of each layer.
        """
        seq_len = tf.shape(x)[1]

        # embed, scale by sqrt(d_model), add positions, apply dropout
        x = self.embedding(x)
        x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        attention_weights = {}
        for layer_no, dec_layer in enumerate(self.dec_layers, start=1):
            x, block1, block2 = dec_layer(x, enc_output, training, look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(layer_no)] = block1
            attention_weights['decoder_layer{}_block2'.format(layer_no)] = block2

        return x, attention_weights

### Create the Transformer

In [None]:
class Transformer(tf.keras.Model):
    """Image-captioning model: a dense image-feature encoder followed by a
    Transformer decoder and a projection onto the vocabulary.

    Args:
      num_layers: number of decoder layers.
      d_model: model width; also the width the image features are projected to.
      num_heads: attention heads per decoder layer.
      dff: hidden width of the decoder's feed-forward networks.
      target_vocab_size: size of the output vocabulary.
      rate: dropout rate.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Transformer, self).__init__()

        # Project image features to d_model. This used to read the notebook
        # global `embedding_dim`; using the constructor argument removes the
        # hidden dependency (both are 256 in this notebook).
        self.encoder = CNN_Encoder(d_model)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, img_tensor, target, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Predict next-token logits.

        Args:
          img_tensor: pre-extracted image features.
          target: decoder input token ids.
          training: enables dropout when True.
          enc_padding_mask: accepted for signature compatibility; unused.
          look_ahead_mask: mask for the decoder's self-attention.
          dec_padding_mask: accepted for signature compatibility; unused --
            the decoder attends to the image features without a padding mask.

        Returns:
          (final_output, attention_weights): logits over the vocabulary for
          every target position, and the decoder's attention-weight dict.
        """
        features = self.encoder(img_tensor)
        dec_output, attention_weights = self.decoder(target, features, training, look_ahead_mask, padding_mask=None)

        final_output = self.final_layer(dec_output)

        return final_output, attention_weights

### Set hyperparameters

In [None]:
# Model hyperparameters passed to Transformer(...) below.
num_layers = 4      # number of decoder layers
# d_model = 128
d_model = 256       # model width
dff = 512           # hidden width of the feed-forward networks
num_heads = 8       # attention heads per layer

target_vocab_size = vocab_size
dropout_rate = 0.1

### Optimizer

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule:

        lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

    The rate grows linearly for `warmup_steps` steps and then decays with
    the inverse square root of the step.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.warmup_steps = warmup_steps
        # stored as float32 so rsqrt can be applied to it
        self.d_model = tf.cast(d_model, tf.float32)

    def __call__(self, step):
        decay = tf.math.rsqrt(step)
        warmup = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay, warmup)

In [None]:
# Schedule instance for a model width of 256 (inspected in the next cell).
learning_rate = CustomSchedule(256)

In [None]:
# Evaluate the schedule for steps 0..39999 and display the values.
learning_rate(tf.range(40000, dtype=tf.float32))

In [None]:
# Schedule used for training, tied to the model width.
learning_rate = CustomSchedule(d_model)

# Adam driven by the warm-up schedule above.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [None]:
# Plot the learning rate over the first 40000 training steps.
temp_learning_rate_schedule = CustomSchedule(d_model)

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)).numpy())
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

### Loss and metrics

In [None]:
# Per-token cross-entropy on raw logits; reduction='none' keeps one loss value
# per position so that loss_function can mask out padding.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    """Mean cross-entropy over the non-padding target tokens.

    Args:
      real: target token ids; id 0 marks padding.
      pred: logits for every target position.

    Returns:
      Scalar loss: the summed per-token loss of the real tokens divided by
      the number of real tokens.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    # zero out the loss at padded positions
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Normalise by the number of real tokens rather than by every position:
    # a plain mean also counts the padded zeros, so the loss would shrink
    # with the amount of padding in the batch. The maximum() guards against
    # a batch that contains only padding.
    return tf.reduce_sum(loss_) / tf.maximum(tf.reduce_sum(mask), 1.0)

In [None]:
# Running metrics, reset at the start of every epoch by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
# NOTE(review): the accuracy metric is updated without a padding mask, so
# padded positions appear to be counted as well -- confirm this is intended.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

### Training and checkpointing

In [None]:
# Build the captioning model from the hyperparameters defined above.
transformer = Transformer(num_layers, d_model, num_heads, dff, target_vocab_size, dropout_rate)

In [None]:
def create_masks(img_tensor, target):
    """Build the masks passed to the transformer.

    Args:
      img_tensor: encoder input (image features).
      target: decoder input token ids.

    Returns:
      (enc_padding_mask, combined_mask, dec_padding_mask). combined_mask is
      the element-wise maximum of the target padding mask and the look-ahead
      mask, so a position is masked if it is padding OR lies in the future.

    NOTE(review): both padding masks are computed from the image features by
    comparing them with 0, exactly like token ids -- confirm this is wanted.
    """
    # masks derived from the encoder input
    enc_padding_mask = create_padding_mask(img_tensor)
    dec_padding_mask = create_padding_mask(img_tensor)

    # mask for the decoder's first (self-)attention block
    target_len = tf.shape(target)[1]
    future_mask = create_look_ahead_mask(target_len)
    target_pad_mask = create_padding_mask(target)
    combined_mask = tf.maximum(target_pad_mask, future_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
from datetime import datetime
# Today's date as YYYYMMDD; used to name the checkpoint directory.
now = datetime.now()
# today = str(now.year) + str(now.month) + str(now.day)
# today
date = now.strftime("%Y%m%d")

In [None]:
# Checkpoints are written to a directory named after today's date.
checkpoint_path = "checkpoints/save/"+str(date)+"/"

# Track both the model and the optimizer state.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

# Keep at most the 10 most recent checkpoints.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)

# Restoring is currently disabled:
# ckpt.restore('./checkpoints/train/ckpt-1')
# print('Latest checkpoint restored!!')
# if a checkpoint exists, restore the latest checkpoint.
# if ckpt_manager.latest_checkpoint:
#     ckpt.restore(ckpt_manager.latest_checkpoint)
#     print('Latest checkpoint {} restored!!'.format(ckpt_manager.latest_checkpoint))
# else:
#     ckpt.restore('checkpoints/save/ckpt-4')
#     print('Checkpoint {} restored!!'.format('checkpoints/save/ckpt-4'))

In [None]:
class EarlyStopping():
    """Signal when a monitored loss has stopped improving.

    validate() returns True once the loss has been higher than the stored
    reference loss for more than `patience` consecutive calls.
    """

    def __init__(self, patience=0, verbose=0):
        self.patience  = patience
        self.verbose = verbose
        self._loss = float('inf')   # reference loss to compare against
        self._step = 0              # consecutive calls with a higher loss

    def validate(self, loss):
        """Record `loss`; return True if training should stop."""
        got_worse = self._loss < loss
        if not got_worse:
            # equal or better: restart the counter and remember this loss
            self._step = 0
            self._loss = loss
            return False

        self._step += 1
        if self._step <= self.patience:
            return False

        if self.verbose:
            print(f'Training process is stopped early....')
        return True

In [None]:
# Stop after more than 10 consecutive non-improving checks (verbose output on).
# NOTE(review): the training loop in this notebook never calls
# early_stopping.validate(), so this object currently has no effect.
early_stopping = EarlyStopping(patience=10, verbose=1) 
EPOCHS = 20

In [None]:
# Input signature intended for tf.function (currently not applied).
# NOTE(review): both specs are 2-D int64, but train_step receives image
# features as its first argument -- revisit before enabling the decorator.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64)
    for _ in range(2)
]

# @tf.function(input_signature=train_step_signature)
def train_step(img_tensor, target):
    """Run one optimisation step on a batch and update the running metrics.

    Args:
      img_tensor: batch of image features.
      target: batch of padded caption token ids.
    """
    # Teacher forcing: feed tokens [0, n-1) and predict tokens [1, n).
    decoder_in, decoder_expected = target[:, :-1], target[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(img_tensor, decoder_in)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(img_tensor, decoder_in, True, enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(decoder_expected, predictions)

    variables = transformer.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))

    train_loss(loss)
    train_accuracy(decoder_expected, predictions)

In [None]:
# Mean training loss of every epoch, kept for plotting afterwards.
loss = []

for epoch in range(EPOCHS):
    start = time.time()

    # Metrics accumulate over one epoch only.
    train_loss.reset_states()
    train_accuracy.reset_states()

    # img_tensor -> cached image features, target -> padded caption token ids
    for (batch, (img_tensor, target)) in enumerate(dataset):
        train_step(img_tensor, target)

        # progress report every 500 batches
        if batch % 500 == 0:
            print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch+1, batch, 
                                                                         train_loss.result(), train_accuracy.result()))

    # save a checkpoint every 5 epochs
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch+1, train_loss.result(), train_accuracy.result()))
    loss.append(train_loss.result())

    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

In [None]:
# Plot the per-epoch training loss collected by the training loop.
plt.plot(loss)

In [None]:
def evaluate(image):
    """Generate a caption for one image with greedy decoding.

    Args:
      image: path of the JPEG image to caption.

    Returns:
      (result, attention_weights, pred), always three items:
        result: list of predicted words, including '<end>' if it was produced.
        attention_weights: the decoder's attention-weight dict from the last
          decoding step (None if max_length is 0).
        pred: the logits of the last decoding step for all positions
          (None if max_length is 0).
    """
    # Extract image features and flatten the spatial axes:
    # (1, locations, channels).
    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

    encoder_input = img_tensor_val
    # NOTE(review): decoding is seeded with id `vocab_size`, not with the id
    # of '<start>' -- confirm this matches how the training captions begin.
    decoder_input = [vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    result = []
    attention_weights = None
    pred = None

    for _ in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

        pred, attention_weights = transformer(encoder_input, output, False, enc_padding_mask,
                                              combined_mask, dec_padding_mask)

        # Greedy choice: most likely token at the last position.
        last_step = pred[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

        # Look the word up once and reuse it.
        predicted_word = tokenizer.index_word[tf.squeeze(predicted_id).numpy()]
        result.append(predicted_word)

        if predicted_word == '<end>':
            break

        # Feed the prediction back in as the next decoder input.
        output = tf.concat([output, predicted_id], axis=-1)

    # Single exit point, so the return value has the same 3-item shape
    # whether or not '<end>' was reached.
    return result, attention_weights, pred

In [None]:
# https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/
def beam_search_decoder(data, k):
    """Beam search over a sequence of per-step probability rows.

    Args:
      data: iterable of rows; row[j] is the probability of token j at that step.
      k: beam width (number of candidate sequences kept after every step).

    Returns:
      Up to k `[sequence, score]` pairs, best first. `sequence` is a list of
      token indices and `score` is the summed negative log-probability
      (lower is better). For empty `data` the result is `[[[], 0.0]]`.
    """
    # Imported locally so the function does not depend on `log` happening to
    # be in the notebook's global namespace.
    from math import inf, log

    # Scores are accumulated as a SUM of negative log-probabilities, starting
    # from 0. Multiplying negative logs is not a valid sequence score: a
    # single probability of 1.0 (log == 0) would zero it out for good.
    sequences = [[list(), 0.0]]
    # walk over each step in sequence
    for row in data:
        all_candidates = []
        # expand each current candidate with every possible next token
        for seq, score in sequences:
            for j, prob in enumerate(row):
                # zero-probability tokens get an infinite cost instead of
                # raising a math domain error
                cost = -log(prob) if prob > 0 else inf
                all_candidates.append([seq + [j], score + cost])
        # order all candidates by score (ascending: lowest cost first)
        ordered = sorted(all_candidates, key=lambda tup: tup[1])
        # select k best
        sequences = ordered[:k]
    return sequences

In [None]:
# Display the path of the most recent checkpoint known to the manager (None if there is none).
ckpt_manager.latest_checkpoint

In [None]:
# NOTE(review): the condition checks the manager's latest checkpoint, but the
# checkpoint actually restored is the hard-coded './checkpoints/train/ckpt-4'
# (a different directory from checkpoint_path), while the message says
# "Latest" -- confirm which checkpoint is meant to be loaded.
if ckpt_manager.latest_checkpoint:
    ckpt.restore('./checkpoints/train/ckpt-4')
    print('Latest checkpoint restored!!')

In [None]:
# Load the pre-processed validation annotations and pull out the caption
# and image-file columns.
val_annotations = load_pickle('data/val/val.annotations.pkl')

val_captions = val_annotations['caption']
val_img_name_vector = val_annotations['file_name']

In [None]:
# Convert the validation captions to word-id lists with the same tokenizer
# that is used for the training captions.
val_seqs = tokenizer.texts_to_sequences(val_captions)

In [None]:
# Pad every validation caption at the end ('post').
# No maxlen is passed, so pad_sequences pads to the longest validation caption.
val_cap_vector = tf.keras.preprocessing.sequence.pad_sequences(val_seqs, padding='post')

# Decoding-step limit used by evaluate().
# Note: this is computed from the TRAINING sequences (train_seqs), not from val_seqs.
max_length = calc_max_length(train_seqs) # 48

In [None]:
# Display the most likely token id at every position of `predictions`.
# NOTE(review): `predictions` is assigned by the evaluation cell further
# down, so this cell presumably has to be run after it -- confirm.
tf.argmax(predictions, axis=-1)

In [None]:
# Pick a random validation example, caption it and score the prediction.
rid = np.random.randint(0, len(val_img_name_vector)) 
# Translate the random position into the row label of the file-name column.
image_num = val_img_name_vector.index[rid]
image = val_img_name_vector[image_num] # ex) 'dataset/train2014/COCO_train2014_000000165492.jpg'
# Ground-truth caption rebuilt from the padded id vector (padding id 0 skipped).
real_caption = ' '.join([tokenizer.index_word[i] for i in val_cap_vector[rid] if i not in [0]])
result, attention_weigths, predictions = evaluate(image)

# predicted_sentence = tokenizer.decode([i for i in result if i < ])
predicted_caption = ' '.join(result)

print ('Real Caption:', real_caption)
print ('Prediction Caption:', predicted_caption)

# https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
# sentence_bleu expects token LISTS (a list of reference token lists and one
# hypothesis token list). Passing raw strings makes NLTK iterate over single
# characters, which yields character-level instead of word-level n-grams.
reference_tokens = real_caption.split()
hypothesis_tokens = predicted_caption.split()
N_gram_1 = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens, weights=(1, 0, 0, 0))
N_gram_2 = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens, weights=(0, 1, 0, 0))
N_gram_3 = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens, weights=(0, 0, 1, 0))
N_gram_4 = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens, weights=(0, 0, 0, 1))


# NOTE(review): METEOR is still given plain strings, unchanged from before.
# Newer NLTK releases appear to require pre-tokenized input here as well --
# check against the installed NLTK version.
meteor = nltk.translate.meteor_score.meteor_score([real_caption], predicted_caption)
print("1-gram: ", N_gram_1)
print("2-gram: ", N_gram_2)
print("3-gram: ", N_gram_3)
print("4-gram: ", N_gram_4)
print("METEOR: ", meteor)

# plot_attention_weights(image, result, attention_weigths, 5)
# opening the image
Image.open(val_img_name_vector[image_num])

In [None]:
# Download a demo image (cached by Keras) and caption it.
image_url = 'https://tensorflow.org/images/surf.jpg'
image_extension = image_url[-4:]
image_path = tf.keras.utils.get_file('image'+image_extension, 
                                     origin=image_url)

# evaluate() can return a third element (the raw logits) after the words and
# the attention weights. Star-unpacking accepts both the 2- and the 3-element
# form, whereas a plain two-name unpack raises ValueError on three items.
result, attention_plot, *_ = evaluate(image_path)
print ('Prediction Caption:', ' '.join(result))
plot_attention_weights(image_path, result, attention_plot, 0)
# opening the image
Image.open(image_path)