## Step 1: Load the Dataset
Assume we have a dataset of images and their captions stored locally. The images are in a directory, and captions are in a text file where each line has the format image_file_name|caption.

### 1.1 Load Images and Captions
First, we need to create a custom Dataset class to handle our data

In [5]:
import os
import pandas as pd
from PIL import Image

# Assuming you have the Flickr8k dataset downloaded and extracted
image_folder = 'D:/PhD file/image caption/image caption model with app/Flickr8k_Dataset/training_Dataset'
caption_file = 'D:/PhD file/image caption/image caption model with app/Flickr8k_text (1)/training.txt'

# Read the captions file
captions = pd.read_csv(caption_file, delimiter='\t', header=None, names=['image', 'caption'])

# Show the first few entries
print(captions.head())

# Example of reading an image and its caption
image_name = captions.iloc[0, 0].split('#')[0]  # Remove the #0 suffix
caption = captions.iloc[0, 1]

image_path = os.path.join(image_folder, image_name)
image = Image.open(image_path)

# Display the image and its caption
image.show()
print(caption)


                         image  \
0  1000268201_693b08cb0e.jpg#0   
1  1000268201_693b08cb0e.jpg#1   
2  1000268201_693b08cb0e.jpg#2   
3  1000268201_693b08cb0e.jpg#3   
4  1000268201_693b08cb0e.jpg#4   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  
A child in a pink dress is climbing up a set of stairs in an entry way .


### Step 2: Preprocess Both Image and Caption

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
import numpy as np

# Preprocess the images
def preprocess_image(image_path):
    image = Image.open(image_path)
    image = image.resize((224, 224))
    image = np.array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    return image

# Preprocess the captions
tokenizer = Tokenizer(num_words=5000, oov_token='<UNK>')
tokenizer.fit_on_texts(captions['caption'])
sequences = tokenizer.texts_to_sequences(captions['caption'])
padded_sequences = pad_sequences(sequences, padding='post')

print(padded_sequences.shape)

# Example of preprocessing an image and a caption
image_path = os.path.join(image_folder, image_name)
preprocessed_image = preprocess_image(image_path)
print(preprocessed_image.shape)

caption_sequence = padded_sequences[0]
print(caption_sequence)


(229, 23)
(1, 224, 224, 3)
[  2  32   4   2 136 137   9  39  55   2 138   8 173   4  23 271 272   0
   0   0   0   0   0]


### Step 3: Train and Test Model

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.optimizers import Adam

# Example preprocessed image data and padded sequences for debugging
preprocessed_image = np.random.rand(229, 224, 224, 3)  # Dummy data: 229 images of shape 224x224x3
padded_sequences = np.random.randint(5000, size=(229, 20))  # Dummy data: 229 sequences of length 20

# Define the image model
image_input = Input(shape=(224, 224, 3))
vgg_model = VGG16(include_top=False, weights='imagenet')
vgg_model.trainable = False
image_features = vgg_model(image_input)
image_features = tf.keras.layers.Flatten()(image_features)
image_features = tf.keras.layers.Dense(256, activation='relu')(image_features)
image_features = tf.keras.layers.RepeatVector(19)(image_features)  # Repeat image features for each time step (19)

# Define the caption model
caption_input = Input(shape=(None,))
embedding = Embedding(input_dim=5000, output_dim=256)(caption_input)
lstm = LSTM(256, return_sequences=True)(embedding)

# Concatenate image and caption features
decoder = Concatenate()([image_features, lstm])
output = Dense(5000, activation='softmax')(decoder)

# Define the model
model = Model(inputs=[image_input, caption_input], outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Debugging prints
print("Length of preprocessed_image:", len(preprocessed_image))
print("Shape of preprocessed_image:", preprocessed_image.shape)
print("Length of padded_sequences:", len(padded_sequences))
print("Shape of padded_sequences:", padded_sequences.shape)

# Ensure both inputs have the same number of samples
assert len(preprocessed_image) == padded_sequences.shape[0], "Number of samples in preprocessed_image and padded_sequences must match."

# Slice padded_sequences to match the model's input requirements
inputs = padded_sequences[:, :-1]  # Slice to use as input
targets = padded_sequences[:, 1:]  # Slice to use as target labels

# Reshape targets to be (batch_size, sequence_length, 1)
targets = np.expand_dims(targets, axis=-1)

# Train the model
model.fit([preprocessed_image, inputs], targets, epochs=10, batch_size=32)


Length of preprocessed_image: 229
Shape of preprocessed_image: (229, 224, 224, 3)
Length of padded_sequences: 229
Shape of padded_sequences: (229, 20)
Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 11s/step - accuracy: 0.0000e+00 - loss: 8.5606
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 11s/step - accuracy: 4.8739e-04 - loss: 8.4559
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 11s/step - accuracy: 9.1283e-04 - loss: 8.2559
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 8s/step - accuracy: 0.0011 - loss: 7.9827
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 7s/step - accuracy: 9.1179e-04 - loss: 7.8460
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 7s/step - accuracy: 7.9910e-04 - loss: 7.7727
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 7s/step - accuracy: 9.2197e-04 - loss: 7.7017
Epoch 8/1

<keras.src.callbacks.history.History at 0x1a521ff14d0>

In [14]:
# Assuming you have test data ready for evaluation
test_images = ...  # Define or load your test images
test_captions = ...  # Define or load your test captions
test_targets = ...  # Define or load your test targets (labels or indices)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([test_images, test_captions], test_targets)

# Print the evaluation results
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')


ValueError: Unrecognized data type: x=[Ellipsis, Ellipsis] (of type <class 'list'>)

## Step 3: Build the Model
We'll use a pre-trained CNN for feature extraction and an RNN with attention for generating captions.

### 3.1 Encoder-Decoder Model with Attention

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet50(pretrained=True)
        for param in resnet.parameters():
            param.requires_grad = False
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)

    def forward(self, features, captions):
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs

class EncoderDecoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(EncoderDecoder, self).__init__()
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs

# Example usage:
embed_size = 256
hidden_size = 512
num_layers = 1
vocab_size = len(vocab)

model = EncoderDecoder(embed_size, hidden_size, vocab_size, num_layers).to(device)


## Step 4: Train the Model
Define the loss function and the optimizer, and then train the model.

### 4.1 Training Loop

In [None]:
import torch.optim as optim

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(num_epochs):
    for i, (images, captions) in enumerate(dataloader):
        images = images.to(device)
        captions = captions.to(device)

        outputs = model(images, captions)
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch [{epoch}/{num_epochs}], Step [{i}/{len(dataloader)}], Loss: {loss.item():.4f}')


## Step 5: Generate Captions and Visualize Attention
After training, we generate captions for new images and visualize the attention maps.

### 5.1 Generate Captions and Visualize Attention

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import skimage.transform

def visualize_attention(image, caption, attention_map):
    fig = plt.figure(figsize=(15, 15))
    len_s = len(caption)
    for i in range(len_s):
        ax = fig.add_subplot(len_s // 5 + 1, 5, i + 1)
        ax.imshow(image)
        ax.set_title(caption[i])
        current_alpha = attention_map[i, :].cpu().data.numpy().reshape(7, 7)
        alpha_img = skimage.transform.pyramid_expand(current_alpha, upscale=32, sigma=20)
        ax.imshow(alpha_img, alpha=0.7)
    plt.show()

# Example usage:
image, _ = dataset[0]
image = image.unsqueeze(0).to(device)
features = model.encoder(image)
output = model.decoder.generate_caption(features, vocab)

# Assuming attention_map is obtained during generation
visualize_attention(image, output, attention_map)


## Step 6: Save the Trained Model
After training your model, you need to save it to disk.

In [None]:
# Save the model
torch.save(model.state_dict(), 'image_captioning_model.pth')


## Step 7: Create a Streamlit App
We'll create a Streamlit app that allows users to upload an image, generates a caption, and visualizes the attention map.