In [1]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump , load

from tensorflow.keras.applications.xception import Xception , preprocess_input
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import add
from tensorflow.keras.models import Model , load_model
from tensorflow.keras.layers import Input , Dense , LSTM , Embedding , Dropout

from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

  from pandas import Panel


In [2]:
def load_doc(filename):
    file = open(filename , 'r')
    text = file.read()
    file.close()
    return text



def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions ={}
    for caption in captions[:-1]:
        img , caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions


def cleaning_text(captions):
    table = str.maketrans('' , '' , string.punctuation)
    
    for img , caps in captions.items():
        for i , img_caption in enumerate(caps):
            img_caption.replace("-" , " ")
            desc = img_caption.split()
            
            desc = [word.lower() for word in desc]

            desc = [word.translate(table) for word in desc]

            desc = [word for word in desc if(len(word)>1)]

            desc = [word for word in desc if(word.isalpha())]

            img_caption = ' '.join(desc)
            captions[img][i] = img_caption
    return captions

def text_vocabulary(descriptions):
    vocab = set()

    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]

    return vocab

def save_descriptions(descriptions , filename):
    lines = list()
    for key,desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)

    data = "\n".join(lines)
    file = open(filename , "w")
    file.write(data)
    file.close


dataset_text = 'Flickr_8k_text'
dataset_images = 'Flicker8k_Dataset'

filename = dataset_text + '/' + "Flickr8k.token.txt"

descriptions = all_img_captions(filename)
print("Length of descriptions = " , len(descriptions))

clean_descriptions = cleaning_text(descriptions)

vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocab = " , len(vocabulary))

save_descriptions(clean_descriptions , "descriptions.txt")


Length of descriptions =  8092
Length of vocab =  8763


In [3]:
def extract_features(directory):
    model = Xception(include_top = False , pooling='avg')
    features = {}

    for img in tqdm(os.listdir(directory)):
        filename = directory + '/' + img
        image = Image.open(filename)
        image = image.resize((299,299))
        image = np.expand_dims(image , axis=0)
        image = image/127.5
        image = image - 1.0

        feature = model.predict(image)

        features[img] = feature

    return features


#2048 feature vector

features = extract_features(dataset_images)
dump(features , open("features.p" , "wb"))


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for img in tqdm(os.listdir(directory)):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8091.0), HTML(value='')))




In [4]:
features = load(open("features.p" , "rb"))

In [5]:
def load_photos(filename):
  file = load_doc(filename)
  photos = file.split("\n")[:-1]
  return photos

def load_clean_description(filename , photos):
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words)<1:
            continue

        image , image_caption = words[0] , words[1:]

        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> '+" ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)
  
    return descriptions


def load_features(photos):
    all_features = load(open("features.p", "rb"))

    features = {k:all_features[k] for k in photos}
    return features

filename = dataset_text + '/' + "Flickr_8k.trainImages.txt"

train_imgs = load_photos(filename)
train_descriptions = load_clean_description("descriptions.txt" , train_imgs)
train_features = load_features(train_imgs)



In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer , open("tokenizer.p" , "wb"))
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7577

In [8]:
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length

32

In [14]:
def data_generator(descriptions , features , tokenizer , max_length):
    while 1:
        for key , description_list in descriptions.items():
            feature = features[key][0]
            input_image , input_sequence , output_word = create_sequences(tokenizer , max_length , description_list , feature)
            yield([input_image , input_sequence] , output_word)


def create_sequences(tokenizer , max_length , desc_list , feature):
    X1 , X2 , y = list() , list() , list()

    for desc in desc_list:
    
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1,len(seq)):
            in_seq , out_seq = seq[:i] , seq[i]

            in_seq = pad_sequences([in_seq] , maxlen = max_length)[0]
            out_seq = to_categorical([out_seq] , num_classes=vocab_size)[0]

            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    
    return np.array(X1) , np.array(X2) , np.array(y)


[a,b],c = next(data_generator(train_descriptions , features , tokenizer , max_length))
a.shape , b.shape , c.shape

((47, 2048), (47, 32), (47, 7577))

In [15]:
from tensorflow.keras.utils import plot_model

In [16]:
def define_model(vocab_size , max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256 , activation='relu')(fe1)

    #LSTM
    inputs2 = Input(shape = (max_length , ))
    se1 = Embedding(vocab_size , 256 , mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    #Merge
    decoder1 = add([fe2 , se3])
    decoder2 = Dense(256 , activation='relu')(decoder1)
    outputs = Dense(vocab_size , activation='softmax')(decoder2)

    #Tie it together [image seq] [word]
    model = Model(inputs=[inputs1 , inputs2] , outputs=outputs)
    model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam')

    print(model.summary())
    plot_model(model , to_file='model.png' , show_shapes=True)

    return model


In [22]:
print('Dataset : ' , len(train_imgs))
print('Descriptions: train = ' , len(train_descriptions))
print('Photos: train = ' , len(train_features))
print('vocab Size = ' , vocab_size)
print('Description Lenght = ' , max_length)

model = define_model(vocab_size , max_length)

epochs = 10

steps = len(train_descriptions)

os.mkdir('models')

for i in range(epochs):
    generator = data_generator(train_descriptions , train_features , tokenizer , max_length)
    model.fit_generator(generator , steps_per_epoch=steps , epochs = 1 , verbose=1)
    model.save("models/model_" + str(i+1) + ".h5")

Dataset :  6000
Descriptions: train =  6000
Photos: train =  6000
vocab Size =  7577
Description Lenght =  32
Model: "functional_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 32, 256)      1939712     input_17[0][0]                   
__________________________________________________________________________________________________
dropout_14 (Dropout)            (None, 2048)         0           input_16[0