## Import all the necessary packages

In [1]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [12]:
from tqdm import tqdm
tqdm().pandas()

0it [00:00, ?it/s]


## Preprocessing the data

In [3]:
# Loading a text file into memory
def load_doc(fn):
    file = open(fn, 'r')
    text = file.read()
    file.close()
    return text

# get all imgs with their captions
def captions(fn):
    file = load_doc(fn)
    cap = file.split('\n')
    d ={}
    for i in cap[:-1]:
        img, i = i.split('\t')
        if img[:-2] not in d:
            d[img[:-2]] = [ i ]
        else:
            d[img[:-2]].append(i)
    return d

#Data cleaning- lower casing, removing puntuations and words containing numbers
def cleaning(text):
    table = str.maketrans('','',string.punctuation)
    for img,caps in text.items():
        for i,img_caption in enumerate(caps):
            img_caption.replace("-"," ")
            desc = img_caption.split()
           
            #converts to lowercase
            desc = [word.lower() for word in desc]
            
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            
            #remove hanging 's and a 
            desc = [word for word in desc if(len(word)>1)]
            
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            
            #convert back to string
            img_caption = ' '.join(desc)
            text[img][i]= img_caption
    return text

# build vocabulary of all unique words
def text_vocabulary(descriptions):
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

#All descriptions in one file 
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

In [4]:
# Set the path
dataset_text = "Flickr8k_text"
dataset_images = "Flickr8k_Dataset"

# Prepare the text data
fn = dataset_text + "/" + "Flickr8k.token.txt"

In [5]:
# Loading file containing data and mapping them into dictionary
desc = captions(fn)
print("Length of descriptions = " ,len(desc))

Length of descriptions =  8092


In [6]:
# cleaning the text
clean = cleaning(desc)

In [7]:
len(clean)

8092

In [8]:
# building vocab
vocab = text_vocabulary(clean)
print("Length of vocab = " , len(vocab))

Length of vocab =  8763


In [9]:
save_descriptions(clean, "descriptions.txt")

## Extracting the feature vector from images

In [10]:
# extract features for all images and we will map image names with their respective feature array.
def extract_features(d):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in tqdm(os.listdir(d)):
        fn = d + "/" + img
        image = Image.open(fn)
        image = image.resize((299,299))
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image-1.0
        feature = model.predict(image)
        features[img] = feature
    return features

In [17]:
# extract features and then dump dictionary into a pickle file
features = extract_features(dataset_images)
dump(features, open("features.p","wb"))

100%|██████████████████████████████████████████████████████████████████████████████| 8091/8091 [43:23<00:00,  3.11it/s]
