In [23]:
import cv2
import glob
import tensorflow as tf
import numpy as np
import pickle
import os
import numpy as np

In [4]:
# image related data
images_name = []
images_list = []
images_features = {}
img_features_path = os.path.join('../dataset/pkl/train2048.pkl')

# caption related data
captions_list = []
captions_mapped = {}
vocab = {}

# encode model for preprocessing
encode_model = None

In [5]:
path = '../dataset/images/Flicker8k_Dataset/*.jpg'
for img in glob.glob(path):
    if not os.path.exists(img_features_path):
        image = cv2.imread(img)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (299,299))
        images_list.append(image)
    img = img.split('/')[-1]
    images_name.append(img)

In [6]:
x = tf.keras.applications.InceptionV3(include_top=True, weights='imagenet')
encode_model = tf.keras.models.Model(x.input, x.layers[-2].output)

In [7]:
if not os.path.exists(img_features_path):
    for j in range(0,len(images_list)):
        i = images_list[j].reshape(1,299,299,3)
        i = encode_model.predict(i).reshape(2048,)
        images_features[images_name[j]] = i

    with open(img_features_path, 'wb') as f:
        pickle.dump(self.images_features, f)

else:
    print('File already exist, opening...')
    with open(img_features_path, 'rb') as f:
        images_features = pickle.load(f)

File already exist, opening...


In [8]:
f = open('../dataset/text/Flickr8k.token.txt', 'rb')
captions_list = f.read().decode('utf-8').split('\n')

for i in captions_list:
    name = i.split('\t')[0][:-2]
    caption = i.split('\t')[1]
    caption = caption.lower()
    caption = 'sequencestart ' + caption + ' sequenceend'

    if name in images_name:
        if name not in captions_mapped:
            captions_mapped[name] = [caption]
        else:
            captions_mapped[name].append(caption)

In [9]:
mapped_number = 1
for w in captions_mapped.values():
    for words in w:
        for word in words.split():
            if word not in vocab:
                vocab[word] = mapped_number
                mapped_number += 1

for i, w in captions_mapped.items():
    for words in w:
        mapped = []
        for word in words.split():
            mapped.append(vocab[word])

        captions_mapped[i][w.index(words)] = mapped

In [10]:
len(captions_mapped)

8091

In [11]:
captions_mapped['1000268201_693b08cb0e.jpg']

[[1, 2, 3, 4, 2, 5, 6, 7, 8, 9, 2, 10, 11, 12, 4, 13, 14, 15, 16, 17],
 [1, 2, 18, 19, 20, 2, 21, 22, 16, 17],
 [1, 2, 23, 18, 8, 20, 2, 21, 24, 16, 17],
 [1, 2, 23, 18, 8, 25, 12, 26, 27, 24, 16, 17],
 [1, 2, 23, 18, 4, 2, 5, 6, 19, 20, 2, 21, 28, 16, 17]]

In [12]:
MAX_LEN = 0
for img_name, captions in captions_mapped.items():
    for caption in captions:
        if len(caption) > MAX_LEN:
            MAX_LEN = len(caption)

In [13]:
MAX_LEN

40

In [24]:
def create_seq(captions, photo):
    X1, X2, y = [], [], []
    for caption in captions:
        for i in range(1, len(caption)):
            in_seq, out_seq = caption[:i], caption[i]

            in_seq = tf.keras.preprocessing.sequence.pad_sequences(
                [in_seq], 
                maxlen=MAX_LEN)[0]

            out_seq = tf.keras.utils.to_categorical([out_seq], num_classes=len(vocab) + 1)[0]
            
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
        return np.array(X1), np.array(X2), np.array(y)

In [25]:
def generator():
    while True:
        for img_name, caption in captions_mapped.items():
            img = images_features[img_name]
            x1, x2, y = create_seq(captions, img)
            yield [x1, x2], y

In [26]:
gen = generator()
inputs, outputs = next(gen)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(11, 2048)
(11, 40)
(11, 8921)
