# Data Preparation

In [1]:
# Load the Dataset
import os

# Directory that is listed later to find the image files to run through the CNN.
# NOTE(review): the 'Flicker8k_Dataset' spelling presumably mirrors the on-disk folder name - verify before "correcting" it.
image_dataset_path = '../input/flickr8k-imageswithcaptions/Flickr8k_Dataset/Flicker8k_Dataset'
# Token file with one "<image file>#<index>  <caption text>" entry per line.
caption_dataset_path = '../input/flickr8k-imageswithcaptions/Flickr8k_text/Flickr8k.token.txt'

First, we load the caption file and store the captions in a dictionary for later use.

In [2]:
# load the caption file & read it
def load_caption_file(path):
    """Read a caption token file and map each image id to its first caption.

    Every non-blank line is expected to look like
    ``1000268201_693b08cb0e.jpg#0  A child in a pink dress is climbing ...``:
    the first whitespace-separated token is the image file name followed by a
    caption index, and the remaining tokens form the caption text.

    Args:
        path: Path to the caption token file.

    Returns:
        dict mapping the image id (the part of the first token before the
        first '.') to the caption text, with runs of whitespace collapsed to
        single spaces. Only the first caption encountered for each image id
        is kept; later captions for the same id are ignored.
    """
    # dictionary to store captions
    captions_dict = {}

    # the context manager closes the file handle even if parsing fails midway
    with open(path) as caption_file:
        for line in caption_file:
            tokens = line.split()

            # blank lines (e.g. a trailing newline at end of file) carry no caption
            if not tokens:
                continue

            caption_id = tokens[0].split('.')[0]

            # keep only the first caption seen for each image
            if caption_id not in captions_dict:
                captions_dict[caption_id] = ' '.join(tokens[1:])

    return captions_dict

# build the image-id -> caption dictionary from the caption token file
captions_dict = load_caption_file(caption_dataset_path)


Similarly, load the image files. Extract the features of each image using the **VGG16 (Visual Geometry Group)** CNN model as a feature extractor and map them to the image id in an image dictionary.

In [3]:
# Load the required libraries
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.layers import Input

In [4]:
# Input() instantiates a Keras tensor; its shape parameter carries the image dimensions.
# VGG16 is fed images of shape (224, 224, 3), where 3 is the number of RGB colour channels.
input_layer = Input(shape=(224, 224, 3))

# include_top=False drops the dense classification layers so only the
# convolutional feature-extractor part of VGG16 is kept.
model = VGG16(include_top=False, input_tensor=input_layer)

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)    

The following cell took 3 hours to run on a CPU.

In [None]:
# spatial size the VGG16 model above was built for: Input(shape=(224, 224, 3))
target_size = (224, 224)

# maps image id -> feature array returned by the model for that image
images_dict = {}
# loop through the images
for image_path in os.listdir(image_dataset_path):

    # resize while loading so every image matches the model's fixed input
    # shape; without target_size the image keeps its native resolution
    image = load_img(os.path.join(image_dataset_path, image_path), target_size=target_size)
    # convert the image pixels into an array
    image = img_to_array(image)
    # add a leading batch dimension of size 1 for the vgg16 model
    image = image.reshape((1,) + image.shape)
    # apply the VGG16-specific input preprocessing
    image = preprocess_input(image)
    # extract features from the image
    feature = model.predict(image, verbose=0)
    # image id is the file name up to the first '.', matching the caption ids
    image_id = image_path.split('.')[0]
    # store the features under the image id
    images_dict[image_id] = feature
    

In [None]:
# save the image features into a pickle file
from pickle import dump

# the context manager flushes and closes the file once the dump completes,
# so the pickle on disk is never left truncated by an unclosed handle
with open('image_features_dictionary.pkl', 'wb') as features_file:
    dump(images_dict, features_file)

### I am saving this pickle file for later use; you can download it from my datasets.

Preprocess the captions dataset 

1) Convert the captions into lowercase

2) Tokenize the captions into different tokens

3) Remove all punctuation from the tokens

In [5]:
# clean the captions
import string

# translation table for stripping punctuation: the third argument of
# str.maketrans lists the characters to delete
table = str.maketrans('', '', string.punctuation)

# cleaned captions, keyed by the same ids as captions_dict
new_captions_dict = {}

for image_id, raw_caption in captions_dict.items():
    # split on whitespace, then lower-case each word and strip its punctuation
    normalised_words = (word.lower().translate(table) for word in raw_caption.split())
    # drop anything one character or shorter (single letters such as 'a' or 's',
    # and words that were nothing but punctuation), then re-join with spaces
    new_captions_dict[image_id] = ' '.join(
        word for word in normalised_words if len(word) > 1
    )
    

# Prepare the word embeddings to feed into the embedding layer

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# cleaned caption strings, in dictionary order
caption_texts = list(new_captions_dict.values())

# fit the tokenizer's word index on the captions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(caption_texts)
# one more than the number of distinct words in the word index
vocab_len = 1 + len(tokenizer.word_index)
# replace every word with its integer index
sequences = tokenizer.texts_to_sequences(caption_texts)

# the longest caption determines the fixed sequence length
max_len = len(max(sequences, key=len))
# pad the shorter sequences at the end so all have length max_len
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# one hot encode all the captions and pin the result to
# (number of captions, sequence length, vocabulary size)
one_hot_encoded = to_categorical(padded_sequences, num_classes=vocab_len).reshape(
    len(new_captions_dict), max_len, vocab_len
)

# report the resulting sizes
for label, value in (
    ("captions size ", len(new_captions_dict)),
    ("vocabulary size ", vocab_len),
    ("sequences size ", max_len),
    ("shape of the word embeddings ", one_hot_encoded.shape),
):
    print(label, value)

captions size  8092
vocabulary size  4485
sequences size  28
shape of the word embeddings  (8092, 28, 4485)


# More to come... Stay Tuned!!