In [1]:
import os
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical

  from pandas.core import (


In [2]:
# ------------------------ Load Captions ------------------------
def load_captions(captions_path):
    with open(captions_path, 'r') as f:
        lines = f.readlines()
    img_to_captions = {}
    for line in lines:
        img, caption = line.strip().split('\t')
        img = img.split('#')[0]
        img_to_captions.setdefault(img, []).append(caption)
    return img_to_captions

In [12]:
# ------------------------ Tokenize Captions ------------------------
def tokenize_captions(img_to_captions):
    all_captions = []
    for captions in img_to_captions.values():
        for cap in captions:
            all_captions.append("startseq " + cap + " endseq")
    tokenizer = Tokenizer(num_words=5000, oov_token='<unk>')
    tokenizer.fit_on_texts(all_captions)
    max_length = max(len(cap.split()) for cap in all_captions)
    return tokenizer, max_length

In [13]:
# ------------------------ CNN Feature Extraction ------------------------
def load_cnn_model():
    base_model = InceptionV3(weights='imagenet')
    return Model(base_model.input, base_model.layers[-2].output)

def preprocess_img(img_path):
    img = image.load_img(img_path, target_size=(299, 299))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array)

def extract_features(image_folder, img_to_captions, model):
    image_features = {}
    for img_name in tqdm(img_to_captions.keys()):
        img_path = os.path.join(image_folder, img_name)
        img_input = preprocess_img(img_path)
        feature = model.predict(img_input)
        image_features[img_name] = feature
    return image_features


In [14]:
# ------------------------ Build Model ------------------------
def build_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    decoder = add([fe2, se3])
    decoder = Dense(256, activation='relu')(decoder)
    outputs = Dense(vocab_size, activation='softmax')(decoder)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model


In [15]:

# ------------------------ Data Generator ------------------------
def data_generator(tokenizer, image_features, img_to_captions, max_length, vocab_size, batch_size=32):
    while True:
        X1, X2, y = [], [], []
        for img, caps in img_to_captions.items():
            for cap in caps:
                cap_seq = tokenizer.texts_to_sequences(["startseq " + cap + " endseq"])[0]
                for i in range(1, len(cap_seq)):
                    in_seq, out_seq = cap_seq[:i], cap_seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(image_features[img][0])
                    X2.append(in_seq)
                    y.append(out_seq)
                    if len(X1) == batch_size:
                        yield [np.array(X1), np.array(X2)], np.array(y)
                        X1, X2, y = [], [], []


In [16]:
# ------------------------ Caption Generation ------------------------
def generate_caption(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = pad_sequences([seq], maxlen=max_length)
        yhat = model.predict([photo, seq], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word
    return in_text.replace('startseq', '').strip()

In [17]:
# ------------------------ Main ------------------------
if __name__ == "__main__":
        captions_path =r"C:/Users/HP/OneDrive/Pictures/trip matheran/IMG-20241221-WA0019.jpg"

        img_to_captions = load_captions(captions_path)
        tokenizer, max_length = tokenize_captions(img_to_captions)
        vocab_size = len(tokenizer.word_index) + 1

        cnn_model = load_cnn_model()
        image_features = extract_features(image_folder, img_to_captions, cnn_model)

        model = build_model(vocab_size, max_length)
        model.fit(data_generator(tokenizer, image_features, img_to_captions, max_length, vocab_size),steps_per_epoch=1000, epochs=10, verbose=1)

# Test on a sample image    
test_img = list(img_to_captions.keys())[0]
print("Caption:", generate_caption(model, tokenizer, image_features[test_img], max_length))


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

ImportError: cannot import name 'BlipProcessor' from 'transformers' (c:\Users\HP\anaconda3\Lib\site-packages\transformers\__init__.py)