In [1]:
import glob
import pickle

import numpy as np
import pandas as pd
from PIL import Image

from keras.applications.imagenet_utils import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.preprocessing import image

  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
# Caption file: one "<image file>#<n>\t<caption>" entry per line.
CAPTION_PATH = 'Flickr8k_text/Flickr8k.token.txt'
# Split files: newline-separated image file names for each subset.
TRAIN_IMAGES_PATH = 'Flickr8k_text/Flickr_8k.trainImages.txt' 
VAL_IMAGES_PATH = 'Flickr8k_text/Flickr_8k.devImages.txt'
TEST_IMAGES_PATH = 'Flickr8k_text/Flickr_8k.testImages.txt'
# Prefix prepended to image file names to build image paths (must end with '/').
# NOTE(review): the recorded output of the glob cell shows paths directly under
# 'Flicker8k_Dataset/' — confirm this nested directory matches the on-disk layout.
DATASET = 'Flicker8k_Dataset/Flicker8k_Dataset/'

## Data Processing

First we create a dictionary with the image file name as the key and the list of its captions as the value.

In [3]:
# Read the caption file into a list with one "<image file>#<n>\t<caption>" entry
# per line. The context manager closes the handle as soon as the contents are
# read; a bare open(...).read() leaves it open until garbage collection.
with open(CAPTION_PATH, 'r') as caption_file:
    captions = caption_file.read().strip().split('\n')
captions[:5]

['1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .',
 '1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .',
 '1000268201_693b08cb0e.jpg#2\tA little girl climbing into a wooden playhouse .',
 '1000268201_693b08cb0e.jpg#3\tA little girl climbing the stairs to her playhouse .',
 '1000268201_693b08cb0e.jpg#4\tA little girl in a pink dress going into a wooden cabin .']

In [4]:
# Dictionary with the image file name as key and the list of its captions as value.
d = {}
for row in captions:
    # Each row is "<image file>#<caption index>\t<caption text>". Split on the
    # first tab only so a caption containing a tab is not truncated.
    image_tag, caption_text = row.split('\t', 1)
    # Drop the "#<index>" suffix whatever its width; a fixed two-character
    # slice would only be correct for single-digit indices.
    file_name = image_tag.rsplit('#', 1)[0]
    d.setdefault(file_name, []).append(caption_text)

In [5]:
# Sanity check: display the captions grouped under a single image file name.
d['1000268201_693b08cb0e.jpg']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [6]:
# img -> paths of every .jpg file matched directly under DATASET
jpg_pattern = DATASET + '*.jpg'
img = glob.glob(jpg_pattern)
img[:5]

['Flicker8k_Dataset/2249865945_f432c8e5da.jpg',
 'Flicker8k_Dataset/3223302125_f8154417f4.jpg',
 'Flicker8k_Dataset/3552796830_2dd2aa9c2c.jpg',
 'Flicker8k_Dataset/3230101918_7d81cb0fc8.jpg',
 'Flicker8k_Dataset/535399240_0714a6e950.jpg']

In [7]:
# train_images -> file names listed in the training split file
# train_img    -> the same names prefixed with DATASET
# The with-block closes the split file even if reading fails, and blank lines
# are skipped so an empty file yields [] instead of [''] (a bogus path).
with open(TRAIN_IMAGES_PATH, 'r') as f_train_images:
    train_images = [line.strip() for line in f_train_images if line.strip()]

train_img = [DATASET + name for name in train_images]
len(train_img)

6000

In [8]:
# val_images -> file names listed in the validation split file
# val_img    -> the same names prefixed with DATASET
# The with-block closes the split file even if reading fails, and blank lines
# are skipped so an empty file yields [] instead of [''] (a bogus path).
with open(VAL_IMAGES_PATH, 'r') as f_val_images:
    val_images = [line.strip() for line in f_val_images if line.strip()]

val_img = [DATASET + name for name in val_images]
len(val_img)

1000

In [9]:
# test_images -> file names listed in the test split file
# test_img    -> the same names prefixed with DATASET
# The with-block closes the split file even if reading fails, and blank lines
# are skipped so an empty file yields [] instead of [''] (a bogus path).
with open(TEST_IMAGES_PATH, 'r') as f_test_images:
    test_images = [line.strip() for line in f_test_images if line.strip()]

test_img = [DATASET + name for name in test_images]
len(test_img)

1000

We now save the dictionary items into separate files, one each for the training, validation and test data.

In [10]:
# Open one tab-separated output file per split and write its header row.
# The handles are intentionally left open here: the following cells write the
# caption rows through them and close them.
# NOTE(review): re-running this cell truncates all three files ('w' mode), and
# re-running a writer cell without re-running this one hits a closed handle.
f_train_dataset = open('Flickr8k_text/flickr_8k_train_dataset.txt','w')
f_train_dataset.write("image_id\tcaptions\n")

f_val_dataset = open('Flickr8k_text/flickr_8k_val_dataset.txt','w')
f_val_dataset.write("image_id\tcaptions\n")

f_test_dataset = open('Flickr8k_text/flickr_8k_test_dataset.txt','w')
f_test_dataset.write("image_id\tcaptions\n")

18

In [11]:
# Write one "<image_id>\t<start> caption <end>" row per training caption.
# c_train counts the rows written.
# Using the already-open handle as a context manager guarantees it is closed
# (and therefore flushed) even if an image id is missing from d, so no per-row
# flush() is needed.
c_train = 0
with f_train_dataset:
    for image_id in train_images:
        for capt in d[image_id]:
            caption = "<start> " + capt + " <end>"
            f_train_dataset.write(image_id + "\t" + caption + "\n")
            c_train += 1

In [12]:
# Write one "<image_id>\t<start> caption <end>" row per validation caption.
# c_val counts the rows written.
# Using the already-open handle as a context manager guarantees it is closed
# (and therefore flushed) even if an image id is missing from d, so no per-row
# flush() is needed.
c_val = 0
with f_val_dataset:
    for image_id in val_images:
        for capt in d[image_id]:
            caption = "<start> " + capt + " <end>"
            f_val_dataset.write(image_id + "\t" + caption + "\n")
            c_val += 1

In [13]:
# Write one "<image_id>\t<start> caption <end>" row per test caption.
# c_test counts the rows written.
# Using the already-open handle as a context manager guarantees it is closed
# (and therefore flushed) even if an image id is missing from d, so no per-row
# flush() is needed.
c_test = 0
with f_test_dataset:
    for image_id in test_images:
        for capt in d[image_id]:
            caption = "<start> " + capt + " <end>"
            f_test_dataset.write(image_id + "\t" + caption + "\n")
            c_test += 1

## MODEL:
We use a VGG16 model pretrained on the ImageNet dataset.
We remove the last softmax layer of the VGG model so that we can obtain the features of the images.

In [14]:
# Load VGG16 with ImageNet weights, keeping the top (classifier) layers and
# expecting 224x224 RGB inputs.
model = VGG16(weights = "imagenet", include_top=True, input_shape=(224, 224, 3))
new_input = model.input
# Output of the second-to-last layer, i.e. the layer just before the final one.
# NOTE(review): presumably the 4096-d fully connected layer feeding the softmax — confirm.
hidden_layer = model.layers[-2].output

# Feature extractor: same input as VGG16, output taken at hidden_layer.
encoding_model = Model(new_input, hidden_layer)

We now pass all the images through the VGG model and extract their features, which are then stored in a pickle file.

In [15]:
def load_image(path):
    """Load an image file and turn it into a preprocessed single-image batch.

    Args:
        path: location of the image file to load.

    Returns:
        A NumPy array: the image resized to 224x224, converted to an array,
        given a leading axis of length 1 and passed through preprocess_input.
    """
    pil_image = image.load_img(path, target_size=(224,224))
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    return np.asarray(preprocess_input(batch))

In [16]:
# Number of calls made to img_encoding so far.
counter = 0

def img_encoding(model, i):
    """Encode a single image file with `model`.

    Args:
        model: object exposing predict(); it receives the array from load_image.
        i: path of the image file to encode.

    Returns:
        The prediction reshaped to a 1-D array of length pred.shape[1]
        (this requires the prediction to hold exactly one row).

    Side effects:
        Increments the module-level `counter` on every call.
    """
    global counter
    counter = counter + 1
    batch = load_image(i)
    features = model.predict(batch)
    return np.reshape(features, features.shape[1])

In [17]:
# Map each image file name (its path with the DATASET prefix cut off) to the
# feature vector returned by img_encoding.
encoded_images = {}
prefix_length = len(DATASET)
for image_path in img:
    file_name = image_path[prefix_length:]
    encoded_images[file_name] = img_encoding(encoding_model, image_path)

In [18]:
# Persist the file-name -> feature-vector mapping. The with-block closes the
# file even if pickling fails part-way, which a manual open()/close() pair
# does not.
with open("encoded_images.pickle", "wb") as f:
    pickle.dump(encoded_images, f)