## Step 1: Load the Dataset
Assume we have a dataset of images and their captions stored locally. The images are in a directory, and captions are in a text file where each line has the format image_file_name|caption.

### 1.1 Load Images and Captions
First, we need to create a custom Dataset class to handle our data

In [49]:
import os
import pandas as pd
from PIL import Image

# Assuming you have the Flickr8k dataset downloaded and extracted
image_folder = 'D:/PhD file/image caption/image caption model with app/Flickr8k_Dataset/training_Dataset'
caption_file = 'D:/PhD file/image caption/image caption model with app/Flickr8k_text (1)/training.txt'

# Read the captions file
captions = pd.read_csv(caption_file, delimiter='\t', header=None, names=['image', 'caption'])

# Show the first few entries
print(captions.head())

# Example of reading an image and its caption
image_name = captions.iloc[0, 0]
caption = captions.iloc[0, 4]

image_path = os.path.join(image_folder, image_name)
image = Image.open(image_path)
# Display the image and its caption
image.show()
print(caption)

                         image  \
0  1000268201_693b08cb0e.jpg#0   
1  1000268201_693b08cb0e.jpg#1   
2  1000268201_693b08cb0e.jpg#2   
3  1000268201_693b08cb0e.jpg#3   
4  1000268201_693b08cb0e.jpg#4   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  


FileNotFoundError: [Errno 2] No such file or directory: 'D:/PhD file/image caption/image caption model with app/Flickr8k_Dataset/training_Dataset\\1000268201_693b08cb0e.jpg#0'

## Step 3: Build the Model
We'll use a pre-trained CNN for feature extraction and an RNN with attention for generating captions.

### 3.1 Encoder-Decoder Model with Attention

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet50(pretrained=True)
        for param in resnet.parameters():
            param.requires_grad = False
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)

    def forward(self, features, captions):
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs

class EncoderDecoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(EncoderDecoder, self).__init__()
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs

# Example usage:
embed_size = 256
hidden_size = 512
num_layers = 1
vocab_size = len(vocab)

model = EncoderDecoder(embed_size, hidden_size, vocab_size, num_layers).to(device)


## Step 4: Train the Model
Define the loss function and the optimizer, and then train the model.

### 4.1 Training Loop

In [None]:
import torch.optim as optim

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(num_epochs):
    for i, (images, captions) in enumerate(dataloader):
        images = images.to(device)
        captions = captions.to(device)

        outputs = model(images, captions)
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch [{epoch}/{num_epochs}], Step [{i}/{len(dataloader)}], Loss: {loss.item():.4f}')


## Step 5: Generate Captions and Visualize Attention
After training, we generate captions for new images and visualize the attention maps.

### 5.1 Generate Captions and Visualize Attention

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import skimage.transform

def visualize_attention(image, caption, attention_map):
    fig = plt.figure(figsize=(15, 15))
    len_s = len(caption)
    for i in range(len_s):
        ax = fig.add_subplot(len_s // 5 + 1, 5, i + 1)
        ax.imshow(image)
        ax.set_title(caption[i])
        current_alpha = attention_map[i, :].cpu().data.numpy().reshape(7, 7)
        alpha_img = skimage.transform.pyramid_expand(current_alpha, upscale=32, sigma=20)
        ax.imshow(alpha_img, alpha=0.7)
    plt.show()

# Example usage:
image, _ = dataset[0]
image = image.unsqueeze(0).to(device)
features = model.encoder(image)
output = model.decoder.generate_caption(features, vocab)

# Assuming attention_map is obtained during generation
visualize_attention(image, output, attention_map)


## Step 6: Save the Trained Model
After training your model, you need to save it to disk.

In [None]:
# Save the model
torch.save(model.state_dict(), 'image_captioning_model.pth')


## Step 7: Create a Streamlit App
We'll create a Streamlit app that allows users to upload an image, generates a caption, and visualizes the attention map.