In [149]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image
import glob
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [94]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [95]:
wd = 'My Drive/CV_Project/'

with open(os.path.join(wd, 'train_img_id_comments.pkl'),'rb') as f:
  train = pickle.load(f)

with open(os.path.join(wd, 'train_img_encoding.pkl'),'rb') as f:
  train_img_encoding = pickle.load(f)

In [96]:
#adding startsequence and end sequence tokens before and after each caption
for id in tqdm(train.keys()):
  comments = train[id]
  t = []
  for comment in comments:
    c = '<s> ' + comment + ' </s>'
    t.append(c)

  train[id] = t

100%|██████████| 20340/20340 [00:00<00:00, 333468.10it/s]


In [97]:
train_images = list(train_img_encoding.keys())
train[train_images[100]]

['<s> two indian students looking at book with adults beside them </s>',
 '<s> indian students working and in discussion in classroom </s>',
 '<s> two darkhaired dark complected talking </s>',
 '<s> two indian women treading to each other </s>',
 '<s> four indian people in classroom </s>']

#### Building vocabulary

In [98]:
vocab = {}
for id in train.keys():
    comments = train[id]
    for comment in comments:
        words = comment.split(' ')
        for word in words:
            vocab[word] = vocab.get(word, 0) + 1
                
print ("Number of unique words in train corpus = %d" %(len(vocab.keys())))

Number of unique words in train corpus = 16508


Since the number of unique words is too large, we consider only those words which have over 30 occurences. 

In [99]:
words = [word for word in vocab.keys() if vocab[word]>30]
len(words)

2199

In [100]:
vocab_words = set(words)
#vocab_words.add('<unk>') #token for out vocabulary word
vocab_words.add('<pad>') #token for padding word
print (len(vocab_words)) 

2200


In [101]:
word2idx = {}
idx2word = {}

word2idx['<pad>'] = 0
idx2word[0] = '<pad>'

idx = 1
for word in vocab_words :
  if word!='<pad>':
    word2idx[word] = idx
    idx2word[idx] = word
    idx+=1


In [102]:
max_len = 0
comment_lengths = []

for img_id in tqdm(train.keys()):
    for comment in train[img_id]:
        l = len(comment.split(' '))
        if l>max_len:
            max_len = l
        comment_lengths.append(l)
        
max_len

100%|██████████| 20340/20340 [00:00<00:00, 162796.70it/s]


70

### Building data generator

The model will be predicting one word of the caption at a time given the previous words upto that time for the caption and the image encoding as the input. As each image has 5 captions, atleast 5-6 words per caption and there are around 20K train images, the data won't fit in memory while training. So, the data has to be loaded in batches and for that a generator function is needed.

In [150]:

vocab_size = len(word2idx)

# data generator, intended to be used in a call to model.fit_generator()
def data_generator(data, attribute_vec, word2idx, max_len, num_photos_per_batch):
    X1, X2, y = list(), list(), list()
    n=0
    # loop for ever over images
    while 1:
        for key, desc_list in data.items():
            n+=1
            # retrieve the image feature vector
            photo = attribute_vec[key]
            for desc in desc_list:
                # encode the sequence
                seq = [word2idx[word] for word in desc.split(' ') if word in word2idx]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_len)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data
            if n==num_photos_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n=0

In [129]:
datagen = data_generator(train, train_img_encoding, word2idx, max_len, num_photos_per_batch=1)

In [130]:
X, target_word = next(datagen)
img_encoding = X[0]
partial_caption = X[1]
print (img_encoding.shape, partial_caption.shape, target_word.shape)

(38, 2048) (38, 70) (38, 2200)


### Word Embedding

In [110]:

embeddings_index = {} # empty dictionary
f = open(os.path.join(wd, 'glove.6B.100d.txt'), encoding="utf-8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()

In [111]:
embedding_dim = 100
vocab_size = len(word2idx)
# Get 100-dim dense vector for each of the words in our vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

### Model Architecture

In [177]:
# image feature extractor model
inputs1 = Input(shape=(2048,))
# partial caption sequence model
inputs2 = Input(shape=(max_len,))

fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)


se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# decoder (feed forward) model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# merge the two input models
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [178]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 70)]         0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 70, 100)      220000      input_10[0][0]                   
__________________________________________________________________________________________________
dropout_8 (Dropout)             (None, 2048)         0           input_9[0][0]                    
____________________________________________________________________________________________

Freezing the weights of Embedding layer as we don't want to retrain our word embeddings

In [179]:
model.layers[2]

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7f5c701a44d0>

In [180]:
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

In [181]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

### Training

Trying out on a 50 samples of train data

In [175]:
sample_train = {}
for img in train_images[:50]:
  sample_train[img] = train[img]

In [176]:
epochs = 1
number_pics_per_batch = 5
steps = len(sample_train)//number_pics_per_batch

In [182]:
generator = data_generator(sample_train, train_img_encoding, word2idx, max_len, 30)
for e in range(epochs):
    print('Epoch', e)
    batches = 0
    for batch_id in range(steps):
      print ("Bacth", (batch_id+1))
      X,y = next(generator)
      print (y.shape)
      model.fit(X,y)
      batches += number_pics_per_batch
      if batches >= len(sample_train):
        break

Epoch 0
Bacth 1
(1674, 2200)
Bacth 2
(1706, 2200)
Bacth 3
(1673, 2200)
Bacth 4
(1743, 2200)
Bacth 5
(1670, 2200)
Bacth 6
(1674, 2200)
Bacth 7
(1706, 2200)
Bacth 8
(1673, 2200)
Bacth 9
(1743, 2200)
Bacth 10
(1670, 2200)


In [214]:
def greedySearch(photo):
    #photo = train_img_encoding[photo_id]
    photo  = photo[np.newaxis,...]
    in_text = '<s>'
    for i in range(max_len):
        sequence = [word2idx[w] for w in in_text.split() if w in word2idx]
        sequence = pad_sequences([sequence], maxlen=max_len)
        yhat = model.predict([photo,sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx2word[yhat]
        in_text += ' ' + word
        if word == '</s>':
            break
    
    final = in_text.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final

In [215]:
sample_images = list(sample_train.keys())
photo_id = sample_images[5]
print (photo_id)

228822815.jpg


In [217]:
photo = train_img_encoding[photo_id]
greedySearch(photo)

'two people running in the beach'

In [218]:
len(sample_images)

50

In [220]:
photo_id = sample_images[49]
print (photo_id)

2937611480.jpg


In [221]:
photo = train_img_encoding[photo_id]
greedySearch(photo)

'people people on bench bench bench bench'

In [222]:
photo_id = sample_images[0]
print (photo_id)

17516940.jpg


In [223]:
photo = train_img_encoding[photo_id]
greedySearch(photo)

'lady holding holding in kitchen'