In [2]:
import numpy as np
import pandas as pd
import pickle

from keras.applications import VGG16
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector, Merge, Activation, Flatten
from keras.preprocessing import image, sequence
from keras.callbacks import ModelCheckpoint

  return f(*args, **kwds)
Using TensorFlow backend.


In [3]:
# Load the pre-computed image encodings (a mapping keyed by image file name).
# NOTE: unpickling can execute arbitrary code -- only load files you created or trust.
# The `with` block guarantees the file handle is closed once loading finishes.
with open('encoded_images.pickle', 'rb') as encodings_file:
    encoding_test = pickle.load(encodings_file)

In [11]:
# Sanity check: look up the stored encoding of one image and display its shape.
encoding_test['3556792157_d09d42bef7.jpg'].shape

(4096,)

In [12]:
# Read the tab-separated training file; each row is one sample.
df = pd.read_csv('Flickr8k_text/flickr_8k_train_dataset.txt', sep='\t')
# Number of rows in the training table.
nb_samples = len(df)
nb_samples

30000

Calculating the unique words in the vocabulary.

In [15]:
# Pull the two columns out positionally: column 0 holds the image identifier,
# column 1 holds the caption text. Using .iloc avoids both the manual
# row-by-row `next()` loop and a global named `iter`, which would shadow the
# builtin `iter()` for every later cell in the kernel.
imgs = df.iloc[:, 0].tolist()
caps = df.iloc[:, 1].tolist()

len(caps)

30000

In [16]:
# Each caption with k whitespace-separated tokens contributes k - 1 samples
# (one per token that has at least one token before it).
total_samples = sum(len(caption.split()) - 1 for caption in caps)

In [17]:
# Tokenise every caption on whitespace: one list of tokens per caption.
words = list(map(str.split, caps))
len(words)

30000

In [26]:
# Flatten the per-caption token lists into one long list (duplicates kept).
unique = [token for caption_tokens in words for token in caption_tokens]

In [27]:
# De-duplicate the tokens. Sorting makes the vocabulary order deterministic:
# plain `list(set(...))` can order strings differently on every kernel start
# (string hash randomisation), which would silently change the word <-> index
# mapping between runs and make previously saved weights meaningless.
unique = sorted(set(unique))
vocab_size = len(unique)
vocab_size

8256

Mapping the unique words to indices and vice-versa

In [31]:
# Two lookup tables over the vocabulary list, numbered from 0 in list order:
#   word_index: token -> integer id
#   index_word: integer id -> token
word_index = {token: position for position, token in enumerate(unique)}
index_word = dict(enumerate(unique))

len(word_index)

8256

Calculating the maximum length among all the captions

In [34]:
# Longest caption measured in whitespace-separated tokens (0 if there are no captions).
max_len = max((len(caption.split()) for caption in caps), default=0)

print("Vocabulary size: " + str(vocab_size))
print("Maximum caption length: " + str(max_len))

Vocabulary size: 8256
Maximum caption length: 40


## Generator 

To generate a caption, we feed the model the encoding of an image together with a start word and predict the next word.
We then feed the same image encoding together with the words predicted so far
to predict the following word, and so on.
So, the image is used at every iteration for the entire caption.
Training therefore needs one (image, partial caption) -> next word sample for every position in every caption.
Hence, we create a custom generator that builds these samples and yields them in batches.

In [47]:
def data_generator(batch_size = 32):
    """Endlessly yield training batches of (image, partial caption) -> next word.

    For every caption in the training file and every position ``i`` in it, one
    sample is produced: the image encoding, the ids of tokens ``0..i`` as the
    partial caption, and a one-hot vector for token ``i + 1`` as the target.

    Parameters
    ----------
    batch_size : int
        Number of samples collected before a batch is yielded.

    Yields
    ------
    list
        ``[[images, partial_caps], next_words]`` where ``images`` is the array
        of stacked image encodings, ``partial_caps`` holds the partial captions
        post-padded to ``max_len``, and ``next_words`` holds one one-hot vector
        of length ``vocab_size`` per sample.

    Notes
    -----
    * Relies on the notebook globals ``encoding_test``, ``word_index``,
      ``vocab_size`` and ``max_len``.
    * The generator loops over the captions forever; samples left over at the
      end of one pass are carried into the first batch of the next pass.
    * NOTE(review): ``pad_sequences`` pads with 0 by default, and ``word_index``
      also assigns id 0 to a real word, so padding and that word are
      indistinguishable to the model -- confirm whether this is intended.
    """
    # Read the (image name, caption) pairs once, up front.
    df = pd.read_csv('Flickr8k_text/flickr_8k_train_dataset.txt', delimiter='\t')
    imgs = df.iloc[:, 0].tolist()
    caps = df.iloc[:, 1].tolist()

    partial_caps = []
    next_words = []
    images = []
    total_count = 0  # samples accumulated in the batch currently being built

    while True:
        for img_name, text in zip(imgs, caps):
            current_image = encoding_test[img_name]
            # Tokenise and map to ids once per caption instead of once per position.
            token_ids = [word_index[token] for token in text.split()]

            for i in range(len(token_ids) - 1):
                total_count += 1
                partial_caps.append(token_ids[:i + 1])

                # One-hot target for the token that follows the partial caption.
                target = np.zeros(vocab_size)
                target[token_ids[i + 1]] = 1
                next_words.append(target)
                images.append(current_image)

                if total_count >= batch_size:
                    next_words = np.asarray(next_words)
                    images = np.asarray(images)
                    partial_caps = sequence.pad_sequences(partial_caps, maxlen=max_len, padding='post')
                    total_count = 0
                    yield [[images, partial_caps], next_words]
                    # Start collecting the next batch from scratch.
                    partial_caps = []
                    next_words = []
                    images = []

## Let's create the model

In [53]:
EMBEDDING_DIM = 128

def create_model():
    """Build and compile the two-branch captioning network.

    The image branch projects a 4096-dimensional input to ``EMBEDDING_DIM``
    and repeats it ``max_len`` times. The language branch embeds the partial
    caption, runs it through an LSTM that returns the full sequence, and
    projects every time step to ``EMBEDDING_DIM``. The two branches are
    concatenated, fed to a second LSTM, and mapped to a softmax over
    ``vocab_size`` words.

    Relies on the notebook globals ``max_len`` and ``vocab_size``.

    Returns
    -------
    The compiled model (categorical cross-entropy loss, RMSprop optimizer,
    accuracy metric).
    """
    # Image branch.
    image_model = Sequential([
        Dense(EMBEDDING_DIM, input_dim=4096, activation='relu'),
        RepeatVector(max_len),
    ])

    # Language branch over the partial caption.
    lang_model = Sequential([
        Embedding(vocab_size, 256, input_length=max_len),
        LSTM(256, return_sequences=True),
        TimeDistributed(Dense(EMBEDDING_DIM)),
    ])

    # Joint decoder on top of the concatenated branches.
    model = Sequential([
        Merge([image_model, lang_model], mode='concat'),
        LSTM(1000, return_sequences=False),
        Dense(vocab_size),
        Activation('softmax'),
    ])

    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    return model

In [55]:
# Build the compiled captioning model and print its layer-by-layer summary.
model = create_model()
model.summary()

  from ipykernel import kernelapp as app


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_3 (Merge)              (None, 40, 256)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 1000)              5028000   
_________________________________________________________________
dense_9 (Dense)              (None, 8256)              8264256   
_________________________________________________________________
activation_3 (Activation)    (None, 8256)              0         
Total params: 16,488,416
Trainable params: 16,488,416
Non-trainable params: 0
_________________________________________________________________


In [69]:
# Train for one epoch on batches streamed from data_generator.
# Uses `model` (the network built above and saved below) and `total_samples`
# (the number of (partial caption, next word) pairs counted earlier), so the
# cell runs on a fresh kernel instead of depending on names that are never
# defined in this notebook.
model.fit_generator(data_generator(batch_size=128), samples_per_epoch=total_samples, nb_epoch=1,
                    verbose=2)

Epoch 1/1
993s - loss: 3.2185 - acc: 0.4458




<keras.callbacks.History at 0x7f76326d5518>

In [78]:
# Persist the trained network twice: the full model, and its weights alone.
# Existing files at these paths are overwritten without prompting.
model.save(filepath='Models/Model.h5', overwrite=True)
model.save_weights(filepath='Models/Weights.h5', overwrite=True)