In [32]:
import numpy as np
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.utils import to_categorical

In [34]:
image_dir = 'Images/'
annotation_file = 'results.csv'

In [36]:
captions = []
image_paths = []


annotation_data = pd.read_csv(annotation_file, header=None, names=['image', 'caption'], on_bad_lines='skip')
annotation_data = annotation_data.dropna(subset=['image', 'caption'])

for idx, row in annotation_data.iterrows():
    img = row['image']
    caption = row['caption']
    captions.append(caption)
    image_paths.append(os.path.join(image_dir, img))

image_paths = [path.split('|')[0] for path in image_paths]

In [38]:
# Tokenize captions
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(captions)
max_caption_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_caption_length, padding='post')

In [40]:
# Define CNN for image feature extraction
def build_image_model(input_shape=(224, 224, 3)):
    input_layer = layers.Input(shape=input_shape)
    
    x = layers.Conv2D(64, (3, 3), activation='relu')(input_layer)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(128, (3, 3), activation='relu')(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(256, (3, 3), activation='relu')(x)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(512, activation='relu')(x)
    
    model = models.Model(inputs=input_layer, outputs=x)
    return model

image_model = build_image_model()

In [42]:
def build_image_captioning_model(image_model, vocab_size, max_caption_length):
    image_model.trainable = False
    image_input = layers.Input(shape=(512,))

    caption_input = layers.Input(shape=(max_caption_length,))
    caption_embedding = layers.Embedding(vocab_size, 256)(caption_input)
    caption_lstm = layers.LSTM(256)(caption_embedding)

    combined = layers.Concatenate()([image_input, caption_lstm])
    dense = layers.Dense(256, activation='relu')(combined)
    output = layers.Dense(vocab_size, activation='softmax')(dense)

    model = models.Model(inputs=[image_input, caption_input], outputs=output)
    return model

captioning_model = build_image_captioning_model(image_model, vocab_size, max_caption_length)
captioning_model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [44]:
# Preprocess images
def load_and_preprocess_image(image_path):
    img = image.load_img(image_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array)

In [None]:
# Extract image features
def extract_image_features(image_model, image_paths):
    features = []
    for path in image_paths:
        img_data = load_and_preprocess_image(path)
        feature = image_model.predict(img_data)
        features.append(feature[0])
    return np.array(features)

image_features = extract_image_features(image_model, image_paths)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
def create_target_data(sequences, vocab_size):
    X_caption, y = [], []
    for seq in sequences:
        for i in range(1, len(seq)):
            X_caption.append(seq[:i])
            y.append(seq[i])
    X_caption = pad_sequences(X_caption, maxlen=max_caption_length, padding='post')
    y = to_categorical(y, num_classes=vocab_size)
    return X_caption, y

X_caption, y = create_target_data(sequences, vocab_size)

In [None]:
captioning_model.fit([image_features, X_caption], y, epochs=20, batch_size=32)

In [None]:
# Generate caption function
def generate_caption(model, image_path, tokenizer, max_caption_length):
    feature = extract_image_features(image_model, [image_path])[0]
    caption = ['<start>']
    for _ in range(max_caption_length):
        seq = tokenizer.texts_to_sequences([caption])[0]
        seq = pad_sequences([seq], maxlen=max_caption_length, padding='post')
        prediction = model.predict([np.expand_dims(feature, axis=0), seq])
        predicted_word_index = np.argmax(prediction[0])
        predicted_word = tokenizer.index_word.get(predicted_word_index, '<UNK>')
        if predicted_word == '<end>' or predicted_word == '<UNK>':
            break
        caption.append(predicted_word)
    return ' '.join(caption[1:])


In [None]:
# Test the model on an example
image_test_path = 'images.jpeg'
print("Generated Caption:", generate_caption(captioning_model, image_test_path, tokenizer, max_caption_length))