# Image Captioning Basic Model CNN + LSTM


## Load Flickr8k DataSet for Image Captioning

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import zipfile

archive = zipfile.ZipFile('drive/MyDrive/archive.zip')

for file in archive.namelist():
    archive.extract(file, 'CaptionData')

In [5]:
import pandas as pd
import numpy as np

In [6]:
caption_df = pd.read_csv('CaptionData/captions.txt')
caption_df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [71]:
image_ids = list(caption_df['image'].values)
len(image_ids)

40455

In [39]:
unique_image_ids = caption_df['image'].unique()

## Preprocess the Data

In [43]:
# Load Images
import os
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

IMAGE_SHAPE = (299, 299, 3)

def preprocess_image(img_path):
  img = image.load_img(img_path)
  img = image.img_to_array(img)
  img = img/255.0
  img.resize(IMAGE_SHAPE)
  img = preprocess_input(img)
  img = np.expand_dims(img, axis=0)
  return img


In [44]:
out = preprocess_image('/content/CaptionData/Images/1007129816_e794419615.jpg')
out.shape

(1, 299, 299, 3)

In [72]:
# For ease of computation here dealing with less number of images
image_ids = image_ids[:5000]

In [46]:


def encode_images(image_dir, image_ids):
  model = InceptionV3(weights='imagenet')
  model = Model(model.input, model.layers[-2].output, name='feature_extractor')
  image_features = {}

  for image_id in image_ids:
    if image_id not in image_features:
      try:
        img = preprocess_image(os.path.join(image_dir, image_id))
        feature = model.predict(img, verbose=0)
        image_features[image_id] = feature
      except Exception as e:
        print(f"Error processing image {image_id}: {e}")
  return image_features

image_dir = '/content/CaptionData/Images'
image_features = encode_images(image_dir, image_ids)

len(image_features)

1000

In [48]:
print(f'Total number of unique images extracted: {len(image_features)}')

Total number of unique images extracted: 1000


In [49]:
import pickle

with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

In [56]:
## Load the Captions

captions_map = dict()
all_captions = [] # populating to find out the vocabulary size


def load_captions(captions_file_path):
  with open(captions_file_path, 'r') as f:
    lines = f.readlines()

    for line in lines:
      tokens = line.split(',')
      image_id, caption = tokens[0], ' '.join(tokens[1:]).lower()
      if image_id in image_ids:
        caption = '<startseq> ' + caption + ' <endseq>'
        if image_id not in captions_map:
          captions_map[image_id] = []

        captions_map[image_id].append(caption)
        all_captions.append(caption)

load_captions('/content/CaptionData/captions.txt')

In [57]:
len(captions_map)

1000

In [66]:
with open('captions_map.pkl', 'wb') as f:
    pickle.dump(captions_map, f)

In [135]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [59]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)

Vocabulary size: 3224


In [61]:
# Convert captions to sequences
sequences = tokenizer.texts_to_sequences(all_captions)

# Find Max Length of a sequence
max_length = max(len(seq) for seq in sequences)
print("Max length of sequences:", max_length)

Max length of sequences: 35


In [64]:
from itertools import groupby
from collections import Counter

sequence_lengths = [len(seq) for seq in sequences]
sequence_length_counts = dict(Counter(sequence_lengths))
sequence_length_counts

{19: 132,
 9: 393,
 10: 439,
 11: 544,
 14: 467,
 17: 212,
 20: 120,
 21: 58,
 22: 39,
 15: 402,
 16: 307,
 13: 544,
 18: 178,
 12: 567,
 7: 126,
 8: 315,
 25: 13,
 4: 1,
 24: 18,
 26: 13,
 6: 43,
 28: 6,
 23: 25,
 29: 5,
 31: 1,
 27: 4,
 5: 21,
 35: 3,
 30: 2,
 32: 1,
 33: 1}

In [65]:
# Let's say we take max caption token length is 20
max_length = 20


## Create Training Data

Training Data Input:



*   Image
*   Partial Caption output text so far


Training Data Output:


*   Next Caption token





In [126]:
def generate_training_data(captions_map, image_ids, image_features, tokenizer, max_length):
  X1, X2, y = [], [], []
  for image_id in image_ids:
    captions = captions_map[image_id]
    for caption in captions:
      seq = tokenizer.texts_to_sequences([caption])[0]
      for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # X1.append(np.squeeze(image_features[image_id], axis=0))
        X1.append(image_features[image_id])
        X2.append(in_seq)
        y.append(out_seq)
  return np.array(X1), np.array(X2), np.array(y)





In [85]:
image_ids = list(image_features.keys())

In [86]:

train_image_ids = image_ids[:800]
val_image_ids = image_ids[800:900]
test_image_ids = image_ids[900:]

In [127]:
train_data_X1, train_data_X2, train_data_y = generate_training_data(captions_map, train_image_ids, image_features, tokenizer, max_length)

In [128]:
train_data_X1.shape, train_data_X2.shape, train_data_y.shape

((48082, 1, 2048), (48082, 20), (48082, 3224))

In [129]:
val_data_X1, val_data_X2, val_data_y = generate_training_data(captions_map, val_image_ids, image_features, tokenizer, max_length)

In [81]:
# Save the prepared Data
np.save('train_data_X1.npy', train_data_X1)
np.save('train_data_X2.npy', train_data_X2)
np.save('train_data_y.npy', train_data_y)
np.save('val_data_X1.npy', val_data_X1)
np.save('val_data_X2.npy', val_data_X2)
np.save('val_data_y.npy', val_data_y)
np.save('test_data_X1.npy', test_data_X1)
np.save('test_data_X2.npy', test_data_X2)
np.save('test_data_y.npy', test_data_y)

In [132]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, add, Flatten

# Feature extractor (encoder) model
inputs1 = Input(shape=(1, 2048,))
fe1 = Dense(256, activation='relu')(inputs1)
fe1 = Flatten()(fe1)

# Sequence processor (decoder) model
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = LSTM(256)(se1)

# Decoder (feed-forward) model
decoder1 = add([fe1, se2])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Combined model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_29 (InputLayer)       [(None, 1, 2048)]            0         []                            
                                                                                                  
 input_30 (InputLayer)       [(None, 20)]                 0         []                            
                                                                                                  
 dense_13 (Dense)            (None, 1, 256)               524544    ['input_29[0][0]']            
                                                                                                  
 embedding_4 (Embedding)     (None, 20, 256)              825344    ['input_30[0][0]']            
                                                                                            

In [136]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [151]:
history = model.fit([train_data_X1, train_data_X2], train_data_y,
          validation_data=([val_data_X1, val_data_X2], val_data_y),
          epochs=20,
          verbose=2,
          callbacks=[callback]
          )


Epoch 1/20
1503/1503 - 61s - loss: 1.4959 - accuracy: 0.6073 - val_loss: 7.4749 - val_accuracy: 0.2812 - 61s/epoch - 40ms/step
Epoch 2/20
1503/1503 - 61s - loss: 1.3646 - accuracy: 0.6398 - val_loss: 8.0107 - val_accuracy: 0.2804 - 61s/epoch - 41ms/step
Epoch 3/20
1503/1503 - 60s - loss: 1.2514 - accuracy: 0.6710 - val_loss: 8.3940 - val_accuracy: 0.2844 - 60s/epoch - 40ms/step
Epoch 4/20
1503/1503 - 60s - loss: 1.1670 - accuracy: 0.6925 - val_loss: 8.7380 - val_accuracy: 0.2752 - 60s/epoch - 40ms/step
Epoch 5/20
1503/1503 - 60s - loss: 1.0997 - accuracy: 0.7105 - val_loss: 9.1460 - val_accuracy: 0.2774 - 60s/epoch - 40ms/step
Epoch 6/20
1503/1503 - 60s - loss: 1.0448 - accuracy: 0.7264 - val_loss: 9.6006 - val_accuracy: 0.2770 - 60s/epoch - 40ms/step
Epoch 7/20
1503/1503 - 61s - loss: 1.0000 - accuracy: 0.7388 - val_loss: 9.7736 - val_accuracy: 0.2715 - 61s/epoch - 40ms/step
Epoch 8/20
1503/1503 - 60s - loss: 0.9776 - accuracy: 0.7439 - val_loss: 10.0158 - val_accuracy: 0.2836 - 60s/e

In [139]:
model.save('image_captioning_model_1.h5')

# Inference

In [153]:
def encode_image(image_path):
  img = preprocess_image(image_path)
  inception_v3 = InceptionV3(weights='imagenet')
  inception_v3 = Model(inception_v3.input, inception_v3.layers[-2].output, name='feature_extractor')
  feature = inception_v3.predict(img, verbose=0)
  return feature

In [154]:
def generate_caption(model, tokenizer, image, max_length):
    in_text = '<startseq>'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # print(sequence.shape, image.shape)
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

# Example usage
img_path = 'fish-8896355_1280.jpg'
image_feature = encode_image(img_path)
image_feature = np.expand_dims(image_feature, axis=0)
caption = generate_caption(model, tokenizer, image_feature, max_length)
print("Generated Caption:", caption)


Generated Caption: <startseq> a man in a black shirt and jeans is kissing another man while he is holding a hello colored drink


In [155]:
# Test Images

test_image_features = [image_features[image_id] for image_id in test_image_ids]
test_image_features = np.array(test_image_features)
test_image_features.shape

(100, 1, 2048)

In [156]:
test_captions = [captions_map[image_id] for image_id in test_image_ids]
len(test_captions), test_captions[0]

(100,
 ['<startseq> a group of horses and people in front of a snowy mountain .\n <endseq>',
  '<startseq> the riders and horses are taking a break and resting on the mountain trail .\n <endseq>',
  '<startseq> three men are standing around pack horses in front of a red tent up in the mountains .\n <endseq>',
  '<startseq> three riders stand around their horses in the mountains .\n <endseq>',
  '<startseq> two men and some horses on a snowy mountain .\n <endseq>'])

In [157]:
generate_caption(model, tokenizer, test_image_features[5], max_length)

'<startseq> a man in a black shirt and jeans is kissing another man while he is holding a hello colored drink'

In [158]:
for i in range(10):
  print(generate_caption(model, tokenizer, test_image_features[i], max_length))
  print(test_captions[i])
  print()

<startseq> a man in a black shirt and jeans is kissing another man while he is holding a hello colored drink
['<startseq> a group of horses and people in front of a snowy mountain .\n <endseq>', '<startseq> the riders and horses are taking a break and resting on the mountain trail .\n <endseq>', '<startseq> three men are standing around pack horses in front of a red tent up in the mountains .\n <endseq>', '<startseq> three riders stand around their horses in the mountains .\n <endseq>', '<startseq> two men and some horses on a snowy mountain .\n <endseq>']

<startseq> a man in a black shirt and jeans is kissing another man while he is holding a hello colored drink
['<startseq> a black and brown dog is biting on a stick in the forest .\n <endseq>', '<startseq> a brown dog chewing on a large piece of wood .\n <endseq>', '<startseq> a brown dog is chewing on a stick .\n <endseq>', '<startseq> a dark brown dog is chewing on a stick .\n <endseq>', '<startseq> the brown dog is playing with a