# Image Captioning Basic Model CNN + LSTM


## Load Flickr8k DataSet for Image Captioning

In [7]:
from google.colab import drive

In [8]:
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import zipfile

archive = zipfile.ZipFile('drive/MyDrive/archive.zip')

for file in archive.namelist():
    archive.extract(file, 'CaptionData')

In [10]:
import pandas as pd
import numpy as np

In [11]:
caption_df = pd.read_csv('CaptionData/captions.txt')
caption_df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [129]:
# Clean the captions

import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

def clean_caption(caption):
    """
    Clean individual caption text.
    """
    # Convert to lowercase
    caption = caption.lower()

    # Remove punctuation
    caption = caption.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    caption = re.sub(r'\d+', '', caption)

    # Remove extra whitespace
    caption = caption.strip()
    caption = re.sub(r'\s+', ' ', caption)

    # Tokenize the caption
    tokens = word_tokenize(caption)

    # # Remove short and long words (optional)
    # tokens = [word for word in tokens if len(word) > 1 and len(word) < 15]

    # # Remove stopwords (optional)
    # stop_words = set(stopwords.words('english'))
    # tokens = [word for word in tokens if word not in stop_words]

    # Rejoin tokens into a single string
    cleaned_caption = ' '.join(tokens)

    return cleaned_caption

def clean_captions(captions):
    """
    Clean a list of captions.
    """
    cleaned_captions = [clean_caption(caption) for caption in captions]
    return cleaned_captions

# Example usage
captions = [
    "A dog is playing with a ball.",
    "Children are playing in the playground!",
    "An airplane is flying in the sky, #Amazing!",
    "The quick brown fox jumps over the lazy dog.",
    "123 cats are sleeping on the couch."
]

cleaned_captions = clean_captions(captions)
print(cleaned_captions)


['a dog is playing with a ball', 'children are playing in the playground', 'an airplane is flying in the sky amazing', 'the quick brown fox jumps over the lazy dog', 'cats are sleeping on the couch']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [131]:
caption_df['caption'] = caption_df['caption'].apply(lambda x: clean_caption(x))

In [132]:
caption_df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,a girl going into a wooden building
2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse
3,1000268201_693b08cb0e.jpg,a little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,a little girl in a pink dress going into a woo...


In [12]:
image_ids = list(caption_df['image'].values)
len(image_ids)

40455

In [13]:
unique_image_ids = caption_df['image'].unique()

## Preprocess the Data

In [14]:
# Load Images
import os
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

IMAGE_SHAPE = (299, 299, 3)

def preprocess_image(img_path):
  img = image.load_img(img_path)
  img = image.img_to_array(img)
  img = img/255.0
  img.resize(IMAGE_SHAPE)
  img = preprocess_input(img)
  img = np.expand_dims(img, axis=0)
  return img


In [15]:
out = preprocess_image('/content/CaptionData/Images/1007129816_e794419615.jpg')
out.shape

(1, 299, 299, 3)

In [16]:
# For ease of computation here dealing with less number of images
image_ids = image_ids[:5000]

In [18]:


def encode_images(image_dir, image_ids):
  model = InceptionV3(weights='imagenet')
  model = Model(model.input, model.layers[-2].output, name='feature_extractor')
  image_features = {}

  for image_id in image_ids:
    if image_id not in image_features:
      try:
        img = preprocess_image(os.path.join(image_dir, image_id))
        feature = model.predict(img, verbose=0)
        image_features[image_id] = feature
      except Exception as e:
        print(f"Error processing image {image_id}: {e}")
  return image_features

image_dir = '/content/CaptionData/Images'
image_features = encode_images(image_dir, image_ids)

len(image_features)

1000

In [19]:
print(f'Total number of unique images extracted: {len(image_features)}')

Total number of unique images extracted: 1000


In [20]:
import pickle

with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

In [134]:
## Load the Captions

captions_map = dict()
all_captions = [] # populating to find out the vocabulary size


def load_captions(captions_file_path):
  with open(captions_file_path, 'r') as f:
    lines = f.readlines()

    for line in lines:
      tokens = line.split(',')
      image_id, caption = tokens[0], ' '.join(tokens[1:]).lower()
      if image_id in image_ids:
        caption = clean_caption(caption)
        caption = '<startseq> ' + caption + ' <endseq>'
        if image_id not in captions_map:
          captions_map[image_id] = []

        captions_map[image_id].append(caption)
        all_captions.append(caption)

load_captions('/content/CaptionData/captions.txt')

In [135]:
len(captions_map)

1000

In [136]:
with open('captions_map.pkl', 'wb') as f:
    pickle.dump(captions_map, f)

In [24]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)

Vocabulary size: 3224


In [26]:
# Convert captions to sequences
sequences = tokenizer.texts_to_sequences(all_captions)

# Find Max Length of a sequence
max_length = max(len(seq) for seq in sequences)
print("Max length of sequences:", max_length)

Max length of sequences: 35


In [137]:
from itertools import groupby
from collections import Counter

sequence_lengths = [len(seq) for seq in sequences]
sequence_length_counts = dict(Counter(sequence_lengths))
sequence_length_counts

{19: 132,
 9: 393,
 10: 439,
 11: 544,
 14: 467,
 17: 212,
 20: 120,
 21: 58,
 22: 39,
 15: 402,
 16: 307,
 13: 544,
 18: 178,
 12: 567,
 7: 126,
 8: 315,
 25: 13,
 4: 1,
 24: 18,
 26: 13,
 6: 43,
 28: 6,
 23: 25,
 29: 5,
 31: 1,
 27: 4,
 5: 21,
 35: 3,
 30: 2,
 32: 1,
 33: 1}

In [28]:
# Let's say we take max caption token length is 20
max_length = 20


In [103]:
# Load Image features and Captions Map if stored in pickle file
import pickle

with open('image_features.pkl', 'rb') as f:
  image_features = pickle.load(f)

with open('captions_map.pkl', 'rb') as f:
  captions_map = pickle.load(f)

In [109]:
len(image_features), len(captions_map)

(1000, 1000)

## Create Training Data

Training Data Input:



*   Image
*   Partial Caption output text so far


Training Data Output:


*   Next Caption token





In [138]:
def generate_training_data(captions_map, image_ids, image_features, tokenizer, max_length):
  X1, X2, y = [], [], []
  for image_id in image_ids:
    captions = captions_map[image_id]
    for caption in captions:
      seq = tokenizer.texts_to_sequences([caption])[0]
      for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # X1.append(np.squeeze(image_features[image_id], axis=0))
        X1.append(image_features[image_id])
        X2.append(in_seq)
        y.append(out_seq)
  return np.array(X1), np.array(X2), np.array(y)





In [30]:
image_ids = list(image_features.keys())

In [31]:

train_image_ids = image_ids[:800]
val_image_ids = image_ids[800:900]
test_image_ids = image_ids[900:]

In [139]:
train_data_X1, train_data_X2, train_data_y = generate_training_data(captions_map, train_image_ids, image_features, tokenizer, max_length)

In [140]:
train_data_X1.shape, train_data_X2.shape, train_data_y.shape

((47845, 1, 2048), (47845, 20), (47845, 3224))

In [141]:
val_data_X1, val_data_X2, val_data_y = generate_training_data(captions_map, val_image_ids, image_features, tokenizer, max_length)

In [143]:
# Save the prepared Data
np.save('train_data_X1.npy', train_data_X1)
np.save('train_data_X2.npy', train_data_X2)
np.save('train_data_y.npy', train_data_y)
np.save('val_data_X1.npy', val_data_X1)
np.save('val_data_X2.npy', val_data_X2)
np.save('val_data_y.npy', val_data_y)

In [38]:
train_data_X1.shape

(48082, 1, 2048)

In [42]:
# Load train and test data if saved earlier

train_data_X1 = np.load('train_data_X1.npy')
train_data_X2 = np.load('train_data_X2.npy')
train_data_y = np.load('train_data_y.npy')

val_data_X1 = np.load('val_data_X1.npy')
val_data_X2 = np.load('val_data_X2.npy')
val_data_y = np.load('val_data_y.npy')

In [91]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, Concatenate, Lambda, GRU, Flatten

# Feature extractor (encoder) model
image_input = Input(shape=(1, 2048,), name='image_input')
# image_input = Input(shape=(8 * 8, 2048))
image_features = Dense(256, activation='relu', name='dense1')(image_input)
# image_features = Flatten()(fe1)

# Sequence processor (decoder) model
text_input = Input(shape=(max_length,), name='text_input')
text_embedding = Embedding(vocab_size, 256, mask_zero=True, name='embedding')(text_input)
# text_embedding = Flatten()(text_embedding)

attention_output, attention_weights = MultiHeadAttention(num_heads=8,
                                                         key_dim=256,
                                                         name='multi_head_attention')(query=text_embedding,
                                                                      value=image_features,
                                                                      return_attention_scores=True)
# context_vector = attention_weights * image_features
# context_vector = Lambda(lambda x: tf.reduce_sum(x, axis=1), name='context_vector')(context_vector)



decoder_input = Concatenate(axis=-1, name='decoder_input')([attention_output, text_embedding])


# Decoder

decoder_lstm = GRU(256, return_sequences=True)(decoder_input)
decoder_lstm = GRU(256)(decoder_lstm)
outputs = Dense(vocab_size, activation='softmax')(decoder_lstm)

# Combined model
model = Model(inputs=[image_input, text_input], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())


Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text_input (InputLayer)     [(None, 20)]                 0         []                            
                                                                                                  
 image_input (InputLayer)    [(None, 1, 2048)]            0         []                            
                                                                                                  
 embedding (Embedding)       (None, 20, 256)              825344    ['text_input[0][0]']          
                                                                                                  
 dense1 (Dense)              (None, 1, 256)               524544    ['image_input[0][0]']         
                                                                                            

In [89]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [87]:
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'content/checkpoint',
    monitor="val_loss",
    verbose=0)

In [92]:
history = model.fit([train_data_X1, train_data_X2], train_data_y,
          validation_data=([val_data_X1, val_data_X2], val_data_y),
          epochs=20,
          verbose=2,
          callbacks=[early_stopping, model_checkpoint]
          )


Epoch 1/20
1503/1503 - 142s - loss: 5.2052 - accuracy: 0.1741 - val_loss: 5.3199 - val_accuracy: 0.1895 - 142s/epoch - 95ms/step
Epoch 2/20
1503/1503 - 124s - loss: 4.9473 - accuracy: 0.1869 - val_loss: 5.4138 - val_accuracy: 0.1804 - 124s/epoch - 83ms/step
Epoch 3/20
1503/1503 - 126s - loss: 4.8994 - accuracy: 0.1880 - val_loss: 5.4043 - val_accuracy: 0.1895 - 126s/epoch - 84ms/step
Epoch 4/20
1503/1503 - 125s - loss: 4.8813 - accuracy: 0.1894 - val_loss: 5.4488 - val_accuracy: 0.1804 - 125s/epoch - 83ms/step
Epoch 5/20
1503/1503 - 127s - loss: 4.8630 - accuracy: 0.1899 - val_loss: 5.4887 - val_accuracy: 0.1777 - 127s/epoch - 84ms/step
Epoch 6/20
1503/1503 - 125s - loss: 4.8413 - accuracy: 0.1889 - val_loss: 5.5240 - val_accuracy: 0.1838 - 125s/epoch - 83ms/step
Epoch 7/20
1503/1503 - 124s - loss: 4.8320 - accuracy: 0.1897 - val_loss: 5.5615 - val_accuracy: 0.1905 - 124s/epoch - 83ms/step
Epoch 8/20
1503/1503 - 126s - loss: 4.8159 - accuracy: 0.1896 - val_loss: 5.5899 - val_accuracy: 

In [146]:
history2 = model.fit([train_data_X1, train_data_X2], train_data_y,
          # validation_data=([val_data_X1, val_data_X2], val_data_y),
          epochs=1,
          verbose=2
          # callbacks=[early_stopping, model_checkpoint]
          )

1496/1496 - 110s - loss: 4.7097 - accuracy: 0.1909 - 110s/epoch - 73ms/step


In [93]:
model.save('image_captioning_model_2.h5')

  saving_api.save_model(


# Inference

In [94]:
def encode_image(image_path):
  img = preprocess_image(image_path)
  inception_v3 = InceptionV3(weights='imagenet')
  inception_v3 = Model(inception_v3.input, inception_v3.layers[-2].output, name='feature_extractor')
  feature = inception_v3.predict(img, verbose=0)
  return feature

In [126]:
def generate_caption(model, tokenizer, image, max_length):
    in_text = '<startseq>'
    image = np.expand_dims(image, axis=0)

    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # print(sequence.shape, image.shape)
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

# # Example usage
# img_path = 'fish-8896355_1280.jpg'
# image_feature = encode_image(img_path)
# image_feature = np.expand_dims(image_feature, axis=0)
# caption = generate_caption(model, tokenizer, image_feature, max_length)
# print("Generated Caption:", caption)


In [112]:
# Test Images

test_image_features = [image_features[image_id] for image_id in test_image_ids]
test_image_features = np.array(test_image_features)
test_image_features.shape

(100, 1, 2048)

In [113]:
test_captions = [captions_map[image_id] for image_id in test_image_ids]
len(test_captions), test_captions[0]

(100,
 ['<startseq> a group of horses and people in front of a snowy mountain .\n <endseq>',
  '<startseq> the riders and horses are taking a break and resting on the mountain trail .\n <endseq>',
  '<startseq> three men are standing around pack horses in front of a red tent up in the mountains .\n <endseq>',
  '<startseq> three riders stand around their horses in the mountains .\n <endseq>',
  '<startseq> two men and some horses on a snowy mountain .\n <endseq>'])

In [120]:
image_features[test_image_ids[0]].shape

(1, 2048)

In [127]:
for i, image_id in enumerate(test_image_ids[:10]):
  print(generate_caption(model, tokenizer, image_features[image_id], max_length))
  print(captions_map[image_id])
  print()

<startseq> a man in a a a a a endseq
['<startseq> a group of horses and people in front of a snowy mountain .\n <endseq>', '<startseq> the riders and horses are taking a break and resting on the mountain trail .\n <endseq>', '<startseq> three men are standing around pack horses in front of a red tent up in the mountains .\n <endseq>', '<startseq> three riders stand around their horses in the mountains .\n <endseq>', '<startseq> two men and some horses on a snowy mountain .\n <endseq>']

<startseq> a man in a a a a a endseq
['<startseq> a black and brown dog is biting on a stick in the forest .\n <endseq>', '<startseq> a brown dog chewing on a large piece of wood .\n <endseq>', '<startseq> a brown dog is chewing on a stick .\n <endseq>', '<startseq> a dark brown dog is chewing on a stick .\n <endseq>', '<startseq> the brown dog is playing with a stick .\n <endseq>']

<startseq> a man in a a a a a endseq
['<startseq> a black dog carrying a colorful ball swims .\n <endseq>', '<startseq> a