In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import pickle
import os
from os import listdir
from pickle import dump
import keras
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.models import Model

In [None]:
def extract_features(directory):
    """Encode every image found in ``directory`` with VGG16.

    Args:
        directory: path of a folder; every entry returned by ``listdir`` is
            loaded as an image.

    Returns:
        dict mapping image id (the file name up to its first '.') to the array
        returned by ``model.predict`` for that image.
    """
    #loading vgg model
    base_model = VGG16()
    #we need encodings, not class predictions, so take the output of the
    #second-to-last layer (same as extract_feature further down in this notebook).
    #Relying on model.layers.pop() is fragile: in recent Keras versions `layers`
    #returns a new list, so the pop leaves the model untouched and layers[-1]
    #is still the prediction layer.
    model = Model(inputs = base_model.inputs , outputs = base_model.layers[-2].output)
    #summary() prints by itself and returns None, so it is not wrapped in print()
    model.summary()
    #dictionary to save encodings of each image
    features = dict()
    #directory is path address of image folder
    for name in listdir(directory):
        #address of each image
        filename = os.path.join(directory , name)
        #loading the image resized to 224 x 224
        image = load_img(filename , target_size=(224 , 224))
        #transforming image to numpy array
        image = img_to_array(image)
        #adding a leading batch axis of size 1 before feeding the network
        image = image.reshape((1 , image.shape[0] , image.shape[1] , image.shape[2]))
        image = preprocess_input(image)
        #predicting encodings of image
        feature = model.predict(image , verbose = 0)
        image_id = name.split('.')[0]
        features[image_id] = feature
        print(name)
    return features

In [None]:
# Mount Google Drive at /content/drive so the project files are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# The relative 'Image_Caption_Project/...' paths used below are resolved from this folder.
%cd /content/drive/My\ Drive/

/content/drive/My Drive


In [None]:
# Load the stored image encodings (presumably produced earlier by extract_features).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
filename = 'Image_Caption_Project/features.pkl'
# `with` closes the file even if unpickling fails
with open(filename , 'rb') as infile:
    features = pickle.load(infile)

In [None]:
# Sanity check: shape of the stored encoding for one image id.
features['862177617_c2c0581075'].shape

(1, 4096)

In [None]:
import string

In [None]:
#load doc into memory
def load_doc(filename):
    """Read a text file and return its whole content as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The complete text of the file.
    """
    #open file as read only; `with` closes it even if reading raises
    with open(filename , 'r') as file:
        #read all text
        return file.read()

In [None]:
#loading descriptions of each image from doc file we created from load_doc function
def load_descriptions(doc):
    """Parse caption text into a mapping of image id -> list of descriptions.

    Each usable line holds an image file token followed by the words of one
    description. The image id is the first token up to its first '.'.

    Args:
        doc: full text of the caption file, lines separated by '\\n'.

    Returns:
        dict mapping image id to the list of its description strings, in the
        order they appear in ``doc``.
    """
    mapping = dict()
    #process lines
    for line in doc.split('\n'):
        #split by white spaces
        tokens = line.split()
        #skip lines that are too short; also skip lines made only of whitespace,
        #which have no tokens and would raise IndexError on tokens[0] below
        if len(line) < 2 or not tokens:
            continue
        #first token will be image id and remaining will be description
        image_id , image_desc = tokens[0] , tokens[1:]
        #remove file name extension (and anything after it) from image id
        image_id = image_id.split('.')[0]
        #convert description tokens back to a string and store it under its image
        mapping.setdefault(image_id , list()).append(' '.join(image_desc))
    return mapping

In [None]:
#function to clean the descriptions in place
def clean_descriptions(descriptions):
    """Normalise every description of every image, modifying the lists in place.

    A word is kept (lower-cased) only when it is not found in
    ``string.punctuation``, is longer than one character and is purely
    alphabetic. Nothing is returned.
    """
    punct = string.punctuation
    for captions in descriptions.values():
        for idx , caption in enumerate(captions):
            kept = []
            for token in caption.split():
                lowered = token.lower()
                #drop punctuation tokens, one-letter words and anything with non-letters
                if lowered in punct or len(lowered) <= 1 or not lowered.isalpha():
                    continue
                kept.append(lowered)
            #store the cleaned description back in the same slot
            captions[idx] = ' '.join(kept)

In [None]:
#making vocabulary of words
def create_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated words over all descriptions."""
    vocab = set()
    for captions in descriptions.values():
        for caption in captions:
            vocab.update(caption.split())
    return vocab

In [None]:
# Caption file: read it once and group the captions by image id.
filename = 'Image_Caption_Project/Flickr8k_text/Flickr8k.token.txt'
#load descriptions
doc = load_doc(filename)  # whole file as one string
descriptions = load_descriptions(doc)  # image id -> list of caption strings

In [None]:
# Number of distinct image ids that have at least one caption line.
len(descriptions)

8092

In [None]:
# Cleans the captions in place; the function returns nothing.
clean_descriptions(descriptions)

In [None]:
# Cleaning rewrites caption text only, so the number of image ids should be unchanged.
len(descriptions)

8092

In [None]:
# Inspect the cleaned captions of one image.
descriptions['667626_18933d713e']

['girl is stretched out in shallow water',
 'girl wearing red and bikini is laying on her back in shallow water',
 'little girl in red swimsuit is laying on her back in shallow water',
 'young girl is lying in the sand while ocean water is surrounding her',
 'girl wearing bikini lying on her back in shallow pool of clear blue water']

In [None]:
# Set of distinct words across all cleaned captions.
vocabulary = create_vocabulary(descriptions)

In [None]:
# Number of distinct words in the cleaned captions.
len(vocabulary)

8357

In [None]:
# Persist the vocabulary and the cleaned captions; `with` flushes and closes each
# file even if pickling fails (a bare open() passed to dump is never closed explicitly).
with open('Image_Caption_Project/vocabulary.pkl' , 'wb') as outfile:
    dump(vocabulary , outfile)
with open('Image_Caption_Project/descriptions.pkl' , 'wb') as outfile:
    dump(descriptions , outfile)

In [None]:
# Reload the pickled vocabulary and cleaned captions; `with` closes each file even
# if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('Image_Caption_Project/vocabulary.pkl' , 'rb') as infile:
    vocabulary = pickle.load(infile)
with open('Image_Caption_Project/descriptions.pkl' , 'rb') as infile:
    descriptions = pickle.load(infile)

In [None]:
#loading pictures for train/dev set
def load_set(filename):
  """Return the set of image identifiers listed in a split file.

  Every non-empty line of ``filename`` contributes the text before its first
  '.'; duplicates collapse because the result is a set.
  """
  rows = load_doc(filename).split('\n')
  #get image identifier of every non-empty line
  return {row.split('.')[0] for row in rows if len(row) >= 1}

In [None]:
#making training/testing dataset
def load_clean_descriptions(filename , dataset):
  """Load pickled descriptions and keep only the images listed in ``dataset``.

  Args:
    filename: path of a pickle holding a dict of image id -> list of descriptions.
    dataset: iterable of image ids to keep; ids missing from the pickle are skipped.

  Returns:
    dict of image id -> list of descriptions, each wrapped as
    'startseq <description> endseq'.
  """
  handle = open(filename , 'rb')
  all_descriptions = pickle.load(handle)
  handle.close()
  #wrap descriptions in tokens
  return {
    image_id: ['startseq ' + text + ' endseq' for text in all_descriptions[image_id]]
    for image_id in dataset
    if image_id in all_descriptions
  }

In [None]:
def load_photo_features(filename , dataset):
  """Load pickled photo features and keep only the images listed in ``dataset``.

  Args:
    filename: path of a pickle holding a dict of image id -> feature.
    dataset: iterable of image ids to keep; ids missing from the pickle are skipped.

  Returns:
    dict of image id -> feature for the requested ids that exist in the pickle.
  """
  #`with` closes the file deterministically; a bare open() passed to
  #pickle.load is never closed explicitly
  with open(filename , 'rb') as infile:
    all_features = pickle.load(infile)
  return {image_id: all_features[image_id] for image_id in dataset if image_id in all_features}

In [None]:
#loading traindata
# `train` is the set of image ids listed in the training split file.
filename = 'Image_Caption_Project/Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print(len(train))

6000


In [None]:
#load descriptions of train images (each wrapped in 'startseq ... endseq')
train_descriptions = load_clean_descriptions('Image_Caption_Project/descriptions.pkl' , train)
#load photo features of the train images only
train_features = load_photo_features('Image_Caption_Project/features.pkl' , train)

In [None]:
# Inspect the wrapped captions of one training image.
print(train_descriptions['2513260012_03d33305cf'])

['startseq black dog is running after white dog in the snow endseq', 'startseq black dog chasing brown dog through snow endseq', 'startseq two dogs chase each other across the snowy ground endseq', 'startseq two dogs play together in the snow endseq', 'startseq two dogs running through low lying body of water endseq']


In [None]:
import tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.layers import Input
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

In [None]:
#convert dictionary of descriptions to list of descriptions
def to_lines(descriptions):
  """Flatten a dict of image id -> descriptions into one list, in dict order."""
  return [desc for key in descriptions for desc in descriptions[key]]

In [None]:
#fit a tokenizer which maps all the words in descriptions to indices
def create_tokenizer(descriptions):
  """Return a Tokenizer fitted on every description in ``descriptions``."""
  fitted = Tokenizer()
  fitted.fit_on_texts(to_lines(descriptions))
  return fitted

In [None]:
# Fit the tokenizer on the training captions only.
tokenizer = create_tokenizer(train_descriptions)
# +1 presumably leaves room for index 0, which the padded sequences use as filler - TODO confirm
vocab_size = len(tokenizer.word_index)+1
print(vocab_size)

7266


In [None]:
#returns length of description with maximum words
def Max_length(descriptions):
  """Return the largest word count of any description, or 0 when there are none."""
  word_counts = (
    len(caption.split())
    for captions in descriptions.values()
    for caption in captions
  )
  return max(word_counts , default=0)

In [None]:
def create_sequences(tokenizer , max_length , descriptions , photos , vocab_size):
  """Build the full training arrays for all images at once.

  Every description is encoded with ``tokenizer`` and expanded into one sample
  per word after the first: the words seen so far (padded to ``max_length``)
  are the input and the next word (as a ``vocab_size``-wide categorical vector)
  is the target.

  Returns:
    Three arrays: photo features, padded input sequences, target vectors.
  """
  photo_rows , prefix_rows , target_rows = [] , [] , []
  for image_id , captions in descriptions.items():
    for caption in captions:
      encoded = tokenizer.texts_to_sequences([caption])[0]
      for split_at in range(1 , len(encoded)):
        #words before split_at form the input, the word at split_at is the target
        prefix = pad_sequences([encoded[:split_at]], maxlen=max_length)[0]
        target = to_categorical([encoded[split_at]], num_classes=vocab_size)[0]
        photo_rows.append(photos[image_id][0])
        prefix_rows.append(prefix)
        target_rows.append(target)
  return np.array(photo_rows) , np.array(prefix_rows) , np.array(target_rows)

In [None]:
def define_model(vocab_size , max_len):
  """Build and compile the caption model that merges an image branch and a text branch.

  Args:
    vocab_size: vocabulary size of the Embedding layer and width of the softmax output.
    max_len: length of the word-index sequence fed to the text branch.

  Returns:
    The compiled Model taking [image_features, word_sequence] as inputs.

  Side effects: prints the model summary and writes the architecture diagram
  to 'model.png'.
  """
  #processing input image (4096-wide feature vector)
  inputs1 = Input(shape = (4096,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256 , activation='relu')(fe1)

  #processing descriptions; mask_zero=True makes index 0 (padding) be ignored
  inputs2 = Input(shape = (max_len,))
  se1 = Embedding(vocab_size , 256 , mask_zero = True)(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(256)(se2)

  #decoder model: both branches are 256 wide so they can be summed element-wise
  decoder1 = add([fe2 , se3])
  decoder2 = Dense(256 , activation='relu')(decoder1)
  outputs = Dense(vocab_size , activation='softmax')(decoder2)

  #initialize the model
  model = Model(inputs=[inputs1 , inputs2] , outputs = outputs)
  #compiling model
  model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam')
  #printing summary of model; summary() prints by itself and returns None, so
  #wrapping it in print() would add a stray "None" line
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [None]:
#training model

#load train dataset
filename = 'Image_Caption_Project/Flickr8k_text/Flickr_8k.trainImages.txt'
train_dataset = load_set(filename)
print("len train dataset:" , end = " ")
print(len(train_dataset))
train_features = load_photo_features('Image_Caption_Project/features.pkl' , train_dataset)
train_descriptions = load_clean_descriptions('Image_Caption_Project/descriptions.pkl' , train_dataset)
#tokenizer, vocabulary size and maximum length come from the training captions only
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print("vocabsize:" , end = " ")
print(vocab_size)
max_len = Max_length(train_descriptions)
print("max_length:" , end = " ")
print(max_len)
X1train , X2train , Ytrain = create_sequences(tokenizer , max_len , train_descriptions , train_features , vocab_size)

#load dev dataset
filename = 'Image_Caption_Project/Flickr8k_text/Flickr_8k.devImages.txt'
dev_dataset = load_set(filename)
print("len dev dataset:" , end = " ")
print(len(dev_dataset))
#select by dev_dataset here: passing train_dataset would silently rebuild the
#training data under the dev names
dev_features = load_photo_features('Image_Caption_Project/features.pkl' , dev_dataset)
dev_descriptions = load_clean_descriptions('Image_Caption_Project/descriptions.pkl' , dev_dataset)
X1dev , X2dev , Ydev = create_sequences(tokenizer , max_len , dev_descriptions , dev_features , vocab_size)
#####this method is giving me memory limit exceeded error#########

len train dataset: 6000
vocabsize: 7266
max_length: 33


In [None]:
def create_sequences2(tokenizer , desc_list , photo , max_len , vocab_size):
  """Build the training arrays for the descriptions of a single photo.

  Each description yields one sample per word after the first: the words seen
  so far (padded to ``max_len``) with ``photo`` are the inputs, the next word
  (as a ``vocab_size``-wide categorical vector) is the target.

  Returns:
    Three arrays: repeated photo feature, padded input sequences, target vectors.
  """
  photo_rows , prefix_rows , target_rows = [] , [] , []
  for caption in desc_list:
    encoded = tokenizer.texts_to_sequences([caption])[0]
    for split_at in range(1 , len(encoded)):
      #words before split_at form the input, the word at split_at is the target
      prefix = pad_sequences([encoded[:split_at]], maxlen=max_len)[0]
      target = to_categorical([encoded[split_at]], num_classes=vocab_size)[0]
      photo_rows.append(photo)
      prefix_rows.append(prefix)
      target_rows.append(target)
  return np.array(photo_rows) , np.array(prefix_rows) , np.array(target_rows)

In [None]:
#at each step data generator gives out dataset of a single image
def data_generator(descriptions, photos, tokenizer, max_len, vocab_size):
  """Endlessly yield ((photo_rows, sequence_rows), target_rows), one image per step.

  The images are visited in the order of ``descriptions``; after the last one
  the pass starts again, so the generator never finishes on its own.
  """
  while True:
    #one full pass over all images
    for image_id, captions in descriptions.items():
      feature = photos[image_id][0]
      photo_rows, sequence_rows, target_rows = create_sequences2(tokenizer, captions, feature, max_len , vocab_size)
      yield ((photo_rows, sequence_rows), target_rows)

In [None]:
# Training inputs for the generator-based training: only the training split is
# loaded here; no sample arrays are built in this cell.
filename = 'Image_Caption_Project/Flickr8k_text/Flickr_8k.trainImages.txt'
train_dataset = load_set(filename)
print("len train dataset:" , end = " ")
print(len(train_dataset))
train_features = load_photo_features('Image_Caption_Project/features.pkl' , train_dataset)
train_descriptions = load_clean_descriptions('Image_Caption_Project/descriptions.pkl' , train_dataset)
# tokenizer, vocabulary size and maximum length are derived from the training captions
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print("vocabsize:" , end = " ")
print(vocab_size)
max_len = Max_length(train_descriptions)
print("max_length:" , end = " ")
print(max_len)

len train dataset: 6000
vocabsize: 7266
max_length: 33


In [None]:
# Build and compile the caption model; this also prints its summary and writes model.png.
model = define_model(vocab_size , max_len)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 256)      1860096     input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 4096)         0           input_1[0][0]                    
_______________________________________________________________________________________

In [None]:
# Display the GPU device name reported by TensorFlow.
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
epochs = 20
steps = len(train_descriptions)
#as at each step data generator generates data of a single image thats why we need steps_per_epoch = 6000 as no. of images are 6000 in training dataset
for i in range(epochs):
  #a fresh generator per epoch so every epoch starts from the first image
  generator = data_generator(train_descriptions, train_features, tokenizer, max_len, vocab_size)
  #Model.fit accepts generators directly; fit_generator is deprecated and only forwards to fit
  model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
  #one checkpoint per epoch: model_0.h5 ... model_19.h5
  filename = 'Image_Caption_Project/'
  model.save(filename + 'model_' + str(i) + '.h5')

Instructions for updating:
Please use Model.fit, which supports generators.


In [None]:
def idx_to_word(tokenizer , idx):
  """Return the word stored for ``idx`` in ``tokenizer.index_word``, or None if absent."""
  return tokenizer.index_word.get(idx)

In [None]:
def generate_descriptions(model , tokenizer , photo , maxlen):
  """Greedily decode a caption for one photo encoding.

  Starting from 'startseq', the most probable next word is appended until
  'endseq' is produced, an index without a word is predicted, or ``maxlen``
  words have been generated.

  Args:
    model: trained caption model taking [photo, word_sequence] as inputs.
    tokenizer: fitted tokenizer used to encode the text and decode indices.
    photo: image encoding passed to the model as its first input.
    maxlen: maximum number of decoding steps; also used as the padding length
      when the model does not declare a fixed sequence length.

  Returns:
    The generated text, including 'startseq' and, if reached, 'endseq'.
  """
  #pad to the sequence length declared by the model's second (text) input and
  #fall back to maxlen when none is declared. The previous version read the
  #notebook-level global `max_len` here, which only worked when an earlier cell
  #had happened to define it.
  pad_len = maxlen
  model_inputs = getattr(model , 'inputs' , None)
  if model_inputs is not None and len(model_inputs) > 1:
    declared_len = model_inputs[1].shape[1]
    if declared_len is not None:
      pad_len = int(declared_len)
  in_text = 'startseq'
  for _ in range(maxlen):
    #encode everything generated so far and pad it to the model's input length
    sequence = tokenizer.texts_to_sequences([in_text])[0]
    sequence = pad_sequences([sequence] , maxlen=pad_len)
    y_hat = model.predict([photo , sequence] , verbose = 0)
    #greedy choice: index with the highest predicted score
    y_hat = np.argmax(y_hat)
    word = idx_to_word(tokenizer , y_hat)
    if word is None:
      break
    in_text += ' ' + word
    if word == 'endseq':
      break
  return in_text

In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
def evaluate_model(model , descriptions , photos , tokenizer , max_len):
  """Generate a caption for every image and print corpus BLEU-1 to BLEU-4.

  For each image in ``descriptions`` the generated caption is compared against
  all of that image's descriptions, both split on whitespace.
  """
  references_per_image = []
  hypotheses = []
  for image_id in descriptions:
    caption = generate_descriptions(model , tokenizer , photos[image_id] , max_len)
    references_per_image.append([ref.split() for ref in descriptions[image_id]])
    hypotheses.append(caption.split())
  #calculating bleu score: one line per n-gram weighting
  report = (
    ('BLEU-1: %f' , (1.0, 0, 0, 0)),
    ('BLEU-2: %f' , (0.5, 0.5, 0, 0)),
    ('BLEU-3: %f' , (0.3, 0.3, 0.3, 0)),
    ('BLEU-4: %f' , (0.25, 0.25, 0.25, 0.25)),
  )
  for template , weights in report:
    print(template % corpus_bleu(references_per_image, hypotheses, weights=weights))

In [None]:
from keras.models import load_model

In [None]:
# Rebuild the tokenizer and maximum length from the training captions, so decoding
# uses the same word indices and sequence length as training.
filename = 'Image_Caption_Project/Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('Image_Caption_Project/descriptions.pkl', train)
print('Descriptions: train=%d' % len(train_descriptions))
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
max_len = Max_length(train_descriptions)
print('Description Length: %d' % max_len)

# Test split: captions are the references, features are the model inputs.
filename = 'Image_Caption_Project/Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
test_descriptions = load_clean_descriptions('Image_Caption_Project/descriptions.pkl', test)
print('Descriptions: test=%d' % len(test_descriptions))
test_features = load_photo_features('Image_Caption_Project/features.pkl', test)
print('Photos: test=%d' % len(test_features))
 
# load the model
# Evaluates the checkpoints model_0.h5 ... model_19.h5 one after another on the test split.
for i in range(20):
  filename = 'Image_Caption_Project/model_' + str(i) + '.h5'
  model = load_model(filename)
  print('model'+str(i)+' :')
  evaluate_model(model, test_descriptions, test_features, tokenizer, max_len)

Dataset: 6000
Descriptions: train=6000
Vocabulary Size: 7266
Description Length: 33
Dataset: 1000
Descriptions: test=1000
Photos: test=1000
model0 :
BLEU-1: 0.489197
BLEU-2: 0.250274
BLEU-3: 0.161983
BLEU-4: 0.063646
model1 :
BLEU-1: 0.522877
BLEU-2: 0.265724
BLEU-3: 0.173097
BLEU-4: 0.074125
model2 :
BLEU-1: 0.525655
BLEU-2: 0.267379
BLEU-3: 0.172641
BLEU-4: 0.070918
model3 :
BLEU-1: 0.508897
BLEU-2: 0.254281
BLEU-3: 0.163904
BLEU-4: 0.067186
model4 :
BLEU-1: 0.528120
BLEU-2: 0.262177
BLEU-3: 0.166713
BLEU-4: 0.068417
model5 :
BLEU-1: 0.515237
BLEU-2: 0.255068
BLEU-3: 0.162821
BLEU-4: 0.066602
model6 :


KeyboardInterrupt: ignored

In [None]:

# Persist the fitted tokenizer; `with` flushes and closes the file even if
# pickling fails (a bare open() passed to dump is never closed explicitly).
with open('Image_Caption_Project/tokenizer.pkl', 'wb') as outfile:
  dump(tokenizer, outfile)

In [None]:
#image caption model for new input image not present in dataset
def extract_feature(directory):
  """Encode a single image with VGG16 and return the encoding.

  Args:
    directory: despite its name, the path of one image file.

  Returns:
    The array produced by ``predict`` on the output of VGG16's second-to-last layer.
  """
  vgg = VGG16()
  #take the second-to-last layer's output instead of the class predictions
  encoder = Model(inputs = vgg.input , outputs = vgg.layers[-2].output)
  pixels = img_to_array(load_img(directory, target_size=(224, 224)))
  #add a leading batch axis of size 1
  batch = pixels.reshape((1,) + pixels.shape)
  batch = preprocess_input(batch)
  return encoder.predict(batch, verbose=0)

In [None]:
# Caption a new image with a saved checkpoint.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
filename = 'Image_Caption_Project/tokenizer.pkl'
# `with` closes the tokenizer file even if unpickling fails
with open(filename , 'rb') as infile:
  tokenizer = pickle.load(infile)
model = load_model('Image_Caption_Project/model_2.h5')
photo = extract_feature('Image_Caption_Project/example2.jpg')
# matches the "Description Length: 33" reported for the training captions (was 34)
maxlen = 33
desc = generate_descriptions(model , tokenizer , photo , maxlen)
words = desc.split()
# drop the sequence markers; join leaves no trailing space
description = ' '.join(w for w in words if w != 'startseq' and w != 'endseq')

In [None]:
# Show the generated caption without the startseq/endseq markers.
print(description)

two children are playing in the field 
