## Import all the necessary packages

In [2]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [3]:
from tqdm import tqdm
tqdm().pandas()

0it [00:00, ?it/s]


## Preprocessing the data

In [4]:
# Loading a text file into memory
def load_doc(fn):
    file = open(fn, 'r')
    text = file.read()
    file.close()
    return text

# get all imgs with their captions
def captions(fn):
    file = load_doc(fn)
    cap = file.split('\n')
    d ={}
    for i in cap[:-1]:
        img, i = i.split('\t')
        if img[:-2] not in d:
            d[img[:-2]] = [ i ]
        else:
            d[img[:-2]].append(i)
    return d

#Data cleaning- lower casing, removing puntuations and words containing numbers
def cleaning(text):
    table = str.maketrans('','',string.punctuation)
    for img,caps in text.items():
        for i,img_caption in enumerate(caps):
            img_caption.replace("-"," ")
            desc = img_caption.split()
           
            #converts to lowercase
            desc = [word.lower() for word in desc]
            
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            
            #remove hanging 's and a 
            desc = [word for word in desc if(len(word)>1)]
            
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            
            #convert back to string
            img_caption = ' '.join(desc)
            text[img][i]= img_caption
    return text

# build vocabulary of all unique words
def text_vocabulary(descriptions):
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

#All descriptions in one file 
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

In [5]:
# Set the path
dataset_text = "Flickr8k_text"
dataset_images = "Flickr8k_Dataset"

# Prepare the text data
fn = dataset_text + "/" + "Flickr8k.token.txt"

In [6]:
# Loading file containing data and mapping them into dictionary
desc = captions(fn)
print("Length of descriptions = " ,len(desc))

Length of descriptions =  8092


In [7]:
# cleaning the text
clean = cleaning(desc)

In [8]:
len(clean)

8092

In [9]:
# building vocab
vocab = text_vocabulary(clean)
print("Length of vocab = " , len(vocab))

Length of vocab =  8763


In [9]:
save_descriptions(clean, "descriptions.txt")

## Extracting the feature vector from images

In [10]:
# extract features for all images and we will map image names with their respective feature array.
def extract_features(d):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in tqdm(os.listdir(d)):
        fn = d + "/" + img
        image = Image.open(fn)
        image = image.resize((299,299))
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image-1.0
        feature = model.predict(image)
        features[img] = feature
    return features

In [17]:
# extract features and then dump dictionary into a pickle file
features = extract_features(dataset_images)
dump(features, open("features.p","wb"))

100%|██████████████████████████████████████████████████████████████████████████████| 8091/8091 [43:23<00:00,  3.11it/s]


In [26]:
features = load(open("features.p","rb"))

## Loading dataset for Training the Model

In [11]:
# load the data
def load_imgs(fn):
    file = load_doc(fn)
    imgs = file.split("\n")[:-1]
    return imgs

In [12]:
# Create dictionary that contains captions for each image
def descriptions(fn, imgs):
    file = load_doc(fn)
    desc = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words) < 1:
            continue
        img, ic = words[0], words[1:]
        
        if img in imgs:
            if img not in desc:
                desc[img] = []
            d = '<start> ' + " ".join(ic) + ' <end>'
            desc[img].append(d)
    
    return desc

In [13]:
# load dictionary for image names and their feature vector which we have previously 
# extracted from the Xception model
def load_features(imgs):
    features = load(open("features.p", "rb"))
    f = {k:features[k] for k in imgs}
    return f

In [14]:
fn = dataset_text + "/" + "Flickr_8k.trainImages.txt"

In [15]:
train_imgs = load_imgs(fn)
train_descriptions = descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

## Tokenizing the Vocabulary

In [16]:
# converting dictionary to clean list of captions
def to_list(cap):
    all_cap = []
    for key in cap.keys():
        [all_cap.append(i) for i in cap[key]]
    return all_cap

In [17]:
from keras.preprocessing.text import Tokenizer

# using tokenizer class, we will represent each token with an integer
def tokenize(cap):
    l = to_list(cap)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(l)
    return tokenizer

In [18]:
# give each word an index, and store that into tokenizer.p pickle file
tokenizer = tokenize(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
size = len(tokenizer.word_index) + 1
size

7577

In [19]:
# calculate maximum length of captions
def maximum_length(captions):
    l = to_list(captions)
    return max(len(i.split()) for i in l)

In [20]:
max_length = maximum_length(desc)
max_length

32

## Data Generator

In [32]:
def create_sequences(tokenizer, length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [33]:
# data generator, used by model.fit_generator()
def generator(captions, features, tokenizer, length):
    while 1:
        for key, l in captions.items():
            feature = features[key][0]
            img, seq, word = create_sequences(tokenizer, length, l, feature)
            yield [[img, seq], word]

In [34]:
[a,b],c = next(generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape

((47, 2048), (47, 32), (47, 7577))