# Create a VQA Model using CNN and LSTM

In [None]:
import numpy as np
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torchvision.models as models
from torchvision.models import resnet50, ResNet50_Weights
from torch.optim import AdamW


In [None]:
# Fetch the VQA dataset through kagglehub and show where it was stored.
import kagglehub

# `path` is whatever location dataset_download returns for the dataset files.
path = kagglehub.dataset_download("bhavikardeshna/visual-question-answering-computer-vision-nlp")

print("Path to dataset files:", path)


In [None]:
# Select the compute device once: CUDA when available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if use_cuda
    else "GPU is not available, using CPU."
)


In [None]:
# Locations of the original dataset files inside the Kaggle input directory.
train_path = r"/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/data_train.csv"
eval_path = r"/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/data_eval.csv"
# Directory holding the images; file names are built as image_id + '.png'.
image_path = r"/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/images"
test_image_list = r"/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/test_images_list.txt"
train_image_list = r"/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/train_images_list.txt"

## Data visualization

In [None]:
# Load the raw training CSV for inspection. pd.read_csv already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
pre_dataset = pd.read_csv(train_path)
pre_dataset

In [None]:
# Preview a handful of random (image, question, answer) triples from the raw
# training CSV.
sample_df = pd.read_csv(train_path)  # read_csv already returns a DataFrame
sample_df['image_id'] = sample_df['image_id'] + '.png'

# Draw 10-15 rows, but never more than the frame holds (DataFrame.sample
# raises when n exceeds the row count and replace=False).
n_preview = min(random.randint(10, 15), len(sample_df))
samples = sample_df.sample(n=n_preview).reset_index(drop = True)

for i, row in samples.iterrows():
    image = os.path.join(image_path, row['image_id'])
    question = row['question']
    answer = row['answer']

    img = Image.open(image)

    plt.figure(figsize = (5, 5))
    plt.imshow(img)
    plt.title(f"Q: {question}\nA: {answer}")
    plt.show()

## Answer generation using LLMs

In [None]:
# os.environ['GOOGLE_API_KEY'] = "<YOUR_GOOGLE_API_KEY>"  # never commit a real key; load it from a secret store

In [None]:
# df = pd.DataFrame(pd.read_csv(train_path))
# eval_df = pd.DataFrame(pd.read_csv(eval_path))

In [None]:
# # Generate data for train dataset
# new_answer = []
# for i, row in df.iterrows():
#     question = row['question']
#     answer = row['answer']
#     new_answer.append(generate_answer("gemini-1.5-pro", question, answer))

In [None]:
# # Generate data for eval dataset
# new_eval_answer = []
# for i, row in eval_df.iterrows():
#     question = row['question']
#     answer = row['answer']
#     new_eval_answer.append(generate_answer("gemini-1.5-pro", question, answer))

In [None]:
# CSV files from the "vqa-fullanswer" Kaggle input; the code below reads the
# columns image_id, question and response from them.
full_ans_train = r"/kaggle/input/vqa-fullanswer/train_data.csv"
full_ans_val = r"/kaggle/input/vqa-fullanswer/eval_data.csv"

In [None]:
# Training split with full-sentence responses. pd.read_csv already returns a
# DataFrame, so the redundant pd.DataFrame(...) wrapper is dropped.
dataframe = pd.read_csv(full_ans_train)
dataframe

In [None]:
# Evaluation split with full-sentence responses. pd.read_csv already returns
# a DataFrame, so the redundant pd.DataFrame(...) wrapper is dropped.
eval_dataframe = pd.read_csv(full_ans_val)
eval_dataframe

In [None]:
def create_qa_pairs(df, output_file='all_pair_qa.csv'):
    """Write the question/response columns of *df* to a CSV file.

    Args:
        df: DataFrame that contains 'question' and 'response' columns.
        output_file: Destination path of the CSV (written without the index).
    """
    qa_columns = ['question', 'response']
    qa_frame = df.loc[:, qa_columns]
    qa_frame.to_csv(output_file, index=False)
    print(f"File saved: {output_file}")

# Export the training question/response pairs to the Kaggle working directory.
create_qa_pairs(dataframe, output_file= r'/kaggle/working/all_pair_qa.csv')

In [None]:
# Turn image ids into file names by appending the '.png' extension.
# NOTE(review): not idempotent - re-running this cell appends '.png' again.
dataframe['image_id'] = dataframe['image_id'] + '.png'
dataframe

In [None]:
# Show 10-15 random training examples with their full-sentence response.
n_preview = random.randint(10, 15)
samples = dataframe.sample(n=n_preview).reset_index(drop=True)

for i, row in samples.iterrows():
    question, answer = row['question'], row['response']
    image = os.path.join(image_path, row['image_id'])
    img = Image.open(image)

    caption = f"Q: {question}\nA: {answer}"
    plt.figure(figsize=(5, 5))
    plt.imshow(img)
    plt.title(caption)
    plt.show()

## Data preprocessing


Prepare the images

In [None]:
# Image pipeline: resize to 224x224, apply random augmentation, convert to a
# tensor and normalise per channel with the mean/std listed below.
# NOTE(review): every step between Resize and ToTensor is random, so this
# pipeline yields different tensors for the same image on each call.
# NOTE(review): RandomHorizontalFlip may invalidate questions about left/right
# positions - confirm this is acceptable for the dataset.
transform = transforms.Compose([
    transforms.Resize((224, 224)),

    transforms.RandomRotation(30),  
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
])

Prepare the questions and answers

In [None]:
import spacy
from collections import Counter

In [None]:
# Load the spaCy pipeline used by spacy_tokenizer for tokenisation.
nlp = spacy.load("en_core_web_sm")

In [None]:
def spacy_tokenizer(text):
    """Split *text* into lower-cased tokens using the module-level `nlp` pipeline.

    Tokens flagged by spaCy as punctuation or whitespace are dropped.

    Returns:
        A list of lower-cased token strings in their original order.
    """
    return [
        tok.text.lower()
        for tok in nlp(text)
        if not (tok.is_punct or tok.is_space)
    ]


In [None]:
def build_vocab(texts, min_freq = 1):
    """Build a word-to-index vocabulary from an iterable of raw texts.

    Indices 0-3 are reserved for the special tokens "<unk>", "<pad>", "<sos>"
    and "<eos>". Every token produced by spacy_tokenizer that occurs at least
    *min_freq* times receives the next free index, in first-seen order.

    Args:
        texts: Iterable of strings to tokenise and count.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        dict mapping token string to integer index.
    """
    token_counts = Counter()
    for sentence in texts:
        token_counts.update(spacy_tokenizer(sentence))

    vocab = {"<unk>": 0, "<pad>": 1, "<sos>": 2, "<eos>": 3}

    frequent_tokens = (tok for tok, count in token_counts.items() if count >= min_freq)
    # Numbering continues right after the four reserved indices.
    for position, tok in enumerate(frequent_tokens, start=len(vocab)):
        vocab[tok] = position
    return vocab


In [None]:
# Quick sanity check of build_vocab on one sentence (passed as a one-element set).
build_vocab({"Hello, this is Dibyajyoti from IITBHU"})

In [None]:
# Separate vocabularies for questions and answers, both built from the
# training dataframe only.
vocab_questions = build_vocab(dataframe['question'])
vocab_answers = build_vocab(dataframe['response'])

In [None]:
# Print the first ten entries of each vocabulary as "word: index".
for title, vocab in (
    ("Sample vocab from questions:", vocab_questions),
    ("\nSample vocab from answers:", vocab_answers),
):
    print(title)
    for word, index in list(vocab.items())[:10]:
        print(f"{word}: {index}")


In [None]:
# Vocabulary sizes (special tokens included); passed to the model constructors
# further down, where they size the embedding tables.
questions_vocab_size = len(vocab_questions)
answers_vocab_size = len(vocab_answers)

In [None]:
# Inverse answer vocabulary (index -> word), used to turn predicted ids into text.
idx2word_answers = {idx: word for word, idx in vocab_answers.items()}

In [None]:
# Show the first ten entries of the answer index -> word mapping.
# The original cell printed this same mapping twice, once under a "questions"
# label, and named the loop variables the wrong way round. idx2word_answers
# maps index -> word, so unpack it in that order and print it once.
print("Sample of the answer index-to-word mapping:")
for index, word in list(idx2word_answers.items())[:10]:
    print(f"{index}: {word}")

In [None]:
def decode_answer(tensor, vocab_dict):
    """Decode a sequence of token ids into a space-separated string.

    Ids 0 and 1 are skipped. Other ids, including the "<sos>" and "<eos>"
    markers, are looked up and kept.

    Args:
        tensor: Iterable of token ids - a 1-D torch tensor or a plain
            sequence of ints.
        vocab_dict: Mapping from integer index to word (an index -> word
            dictionary, despite the parameter name).

    Returns:
        The decoded words joined by single spaces.

    Raises:
        KeyError: If an id (other than 0 or 1) is missing from *vocab_dict*.
    """
    skipped_ids = {0, 1}
    # Iterating a torch tensor yields 0-dim tensors, which hash by identity.
    # Without the int() conversion the set filter and dict lookup never match.
    token_ids = (int(idx) for idx in tensor)
    return " ".join(vocab_dict[i] for i in token_ids if i not in skipped_ids)

In [None]:
def get_max_len(texts):
    """Return the largest token count among *texts* (0 when *texts* is empty).

    Token counts come from spacy_tokenizer, so punctuation and whitespace
    tokens are not counted and no "<sos>"/"<eos>" markers are included.
    """
    return max((len(spacy_tokenizer(sentence)) for sentence in texts), default=0)

In [None]:
# The measured values are kept commented out; fixed lengths are used instead.
# len_max_question = get_max_len(dataframe['question'])
# len_max_answer = get_max_len(dataframe['response'])

# Fixed sequence lengths used by text_to_tensor for padding / truncation.
# NOTE(review): get_max_len counts raw tokens only, while text_to_tensor adds
# "<sos>" and "<eos>" before padding/truncating - confirm these values leave
# room for the two extra tokens.
len_max_question = 24
len_max_answer = 36

In [None]:
def text_to_tensor(text, vocab, max_len):
    """Encode *text* as a fixed-length tensor of vocabulary indices.

    The text is tokenised with spacy_tokenizer, wrapped in "<sos>" ... "<eos>",
    mapped through *vocab* (unknown tokens become "<unk>") and then padded
    with "<pad>" or truncated so the result has exactly *max_len* entries.

    When the sequence has to be truncated, the final kept position is set to
    "<eos>" so that every encoded sequence still ends with an end marker.
    Cutting with a plain slice dropped the marker from over-long answers.

    Args:
        text: Raw input string.
        vocab: Mapping token -> index containing the four special tokens.
        max_len: Length of the returned tensor.

    Returns:
        1-D torch.long tensor of length max(max_len, 0).
    """
    tokens = ["<sos>"] + spacy_tokenizer(text) + ["<eos>"]
    unk_index = vocab["<unk>"]
    indices = [vocab.get(token, unk_index) for token in tokens]

    missing = max_len - len(indices)
    if missing > 0:
        # Pad on the right up to the requested length.
        indices += [vocab["<pad>"]] * missing
    elif missing < 0:
        # Truncate but keep the end-of-sequence marker in the last slot.
        indices = indices[:max(max_len - 1, 0)] + [vocab["<eos>"]]
        indices = indices[:max(max_len, 0)]

    return torch.tensor(indices, dtype=torch.long)

In [None]:
class VQADataset(Dataset):
    """Dataset of (image, question tensor, answer tensor) triples read from a CSV.

    The CSV must provide the columns 'image_id', 'question' and 'response'.
    Questions and answers are encoded with text_to_tensor using the
    module-level vocabularies and maximum lengths.
    """

    def __init__(self, csv_path, image_folder, transform=None):
        """Read *csv_path* and remember where the images live.

        Args:
            csv_path: Path of the CSV file to load.
            image_folder: Directory containing the '<image_id>.png' files.
            transform: Optional callable applied to each loaded PIL image.
        """
        frame = pd.read_csv(csv_path)
        frame['image_id'] = frame['image_id'] + '.png'
        self.df = frame
        self.image_folder = image_folder
        self.transform = transform

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return (image, question, answer) for row *idx*.

        A missing image file is reported on stdout and replaced by an
        all-zero tensor of shape (3, 224, 224).
        """
        record = self.df.iloc[idx]
        question = text_to_tensor(record['question'], vocab_questions, len_max_question)
        answer = text_to_tensor(record['response'], vocab_answers, len_max_answer)

        image_file = os.path.join(self.image_folder, record['image_id'])
        if not os.path.exists(image_file):
            placeholder = torch.zeros((3, 224, 224))
            print(f"Image not found: {image_file}")
            return placeholder, question, answer

        img = Image.open(image_file).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, question, answer


In [None]:
# Training data: augmented images, reshuffled every epoch, batches of 16
# loaded by 4 worker processes.
train_dataset = VQADataset(full_ans_train, image_path, transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers = 4)

In [None]:
# Evaluation data must be deterministic so that loss and BLEU are comparable
# between epochs. Use a pipeline without random augmentation (the same steps
# test_model applies) and keep the sample order fixed.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

eval_dataset = VQADataset(full_ans_val, image_path, eval_transform)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False, num_workers = 4)

In [None]:
# Inspect one random training item to verify the tensors built by the dataset.
sample_idx = torch.randint(len(train_dataset), size=(1,)).item()
sample_image, sample_question, sample_answer = train_dataset[sample_idx]

print(f"Sample index: {sample_idx}")
print(f"Question Tensor: {sample_question}")
print(f"Answer Tensor: {sample_answer}")
print(f"Image Shape: {sample_image.shape}")

In [None]:
# Peek at the first training batch only (the loop exits after one iteration).
for images, questions, answers in train_loader:
    # Despite the label, this prints the full shape of the image batch.
    print(f"Batch size: {images.shape}")  
    print(f"First Question Tensor: {questions[0]}")
    print(f"First Answer Tensor: {answers[0]}")
    break  

In [None]:
# Peek at the first evaluation batch only (the loop exits after one iteration).
for images, questions, answers in eval_loader:
    # Despite the label, this prints the full shape of the image batch.
    print(f"Batch size: {images.shape}") 
    print(f"First Question Tensor: {questions[0]}")
    print(f"First Answer Tensor: {answers[0]}")
    break  

## Build Model

In [None]:
class CNN_Feature_Extractor_pretrained(nn.Module):
    """Image encoder: ImageNet-pretrained ResNet-50 trunk plus a 2048 -> 512 projection."""

    def __init__(self):
        super().__init__()

        backbone = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        # Keep every child module except the last one (the classification layer).
        trunk_layers = list(backbone.children())[:-1]
        self.feature_extractor = nn.Sequential(*trunk_layers)

        self.fc = nn.Linear(2048, 512)

    def forward(self, x):
        """Map a batch of images to a (batch, 512) feature matrix."""
        pooled = self.feature_extractor(x)
        flat = pooled.flatten(start_dim=1)
        return self.fc(flat)

In [None]:
class Question_Encoder(nn.Module):
    """Encode a tokenised question into one vector with an embedding + 2-layer LSTM."""

    def __init__(self, questions_vocab_size, embedding_dim = 256, hidden_dim = 512):
        """
        Args:
            questions_vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: Size of the LSTM hidden state (and of the output vector).
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=questions_vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            dropout=0.2,
            batch_first=True,
        )

    def forward(self, x):
        """Return the final hidden state of the top LSTM layer, shape (batch, hidden_dim)."""
        embedded = self.embedding(x)
        _, state = self.lstm(embedded)
        final_hidden = state[0]
        return final_hidden[-1]

In [None]:
class Attention(nn.Module):
    """Gate the combined question/image feature by the decoder state.

    An energy vector is computed from the decoder hidden state and the
    combined feature; `v` reduces it to one score per sample.

    The score used to be normalised with a softmax over dim=1. That axis has
    size 1 (one score per sample), so every weight was exactly 1.0, the
    context was just `combined_feat`, and `attn` and `v` received no
    gradient. The score now goes through a sigmoid, which gives a learnable
    gate in (0, 1). Parameter names and shapes are unchanged, so existing
    state dicts still load.
    """

    def __init__(self, hidden_dim=512):
        """
        Args:
            hidden_dim: Size of the decoder hidden state. The combined feature
                is expected to have 2 * hidden_dim columns.
        """
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, combined_feat):
        """Compute the gated context.

        Args:
            hidden: Decoder state, (batch, hidden_dim). A leading singleton
                dimension or a single un-batched vector is also accepted.
            combined_feat: Concatenated features, (batch, 2 * hidden_dim).

        Returns:
            (context, attention_weights): context has the shape of
            combined_feat, attention_weights is (batch, 1) with values in (0, 1).
        """
        if hidden.dim() > 2:
            hidden = hidden.squeeze(0)

        if hidden.dim() == 1:
            hidden = hidden.unsqueeze(0)

        energy = torch.tanh(self.attn(torch.cat((hidden, combined_feat), dim=1)))  # (batch, hidden_dim)
        # One score per sample, so a sigmoid is used; softmax over this
        # single-element axis would always return 1.0.
        attention_weights = torch.sigmoid(self.v(energy))  # (batch, 1)

        context = attention_weights * combined_feat  # (batch, 2 * hidden_dim)
        return context, attention_weights

In [None]:
class Answer_Decoder(nn.Module):
    """LSTM answer decoder with teacher-forced training and beam-search generation.

    Each decoding step feeds the LSTM with the embedding of one answer token
    concatenated with a context vector produced by `Attention` from the
    combined question/image feature.
    """

    def __init__(self, answer_vocab_size, embedding_size=256, hidden_dim=512, k_beam = 3):
        """
        Args:
            answer_vocab_size: Size of the answer vocabulary (embedding rows
                and output logits).
            embedding_size: Size of each answer-token embedding.
            hidden_dim: LSTM hidden size.
            k_beam: Beam width used when generating without a target sequence.
        """
        super(Answer_Decoder, self).__init__()
        self.embedding = nn.Embedding(answer_vocab_size, embedding_size)
        # The context appended to every token embedding is 1024 wide, i.e. the
        # question and image features are assumed to be 512 wide each.
        self.lstm = nn.LSTM(embedding_size + 1024, hidden_dim, num_layers=3, dropout = 0.2, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, answer_vocab_size)

        self.k_beam = k_beam
    
    def forward(self, question_feat, image_feat, answer_seq=None, answer_maxlength=36):
        """Decode an answer.

        Args:
            question_feat: Question feature matrix, one row per sample.
            image_feat: Image feature matrix, one row per sample.
            answer_seq: Optional (batch, seq_len) tensor of target token ids.
                When given, the decoder is teacher-forced on it.
            answer_maxlength: Maximum number of beam-search steps.

        Returns:
            With answer_seq: logits of shape (batch, seq_len, answer_vocab_size).
            Without answer_seq: token ids of shape (batch, longest_sequence),
            each row starting with token 2 and right-padded with token 3.
        """
        # Concatenate along the feature axis: (batch_size, question_dim + image_dim).
        combined_feat = torch.cat((question_feat, image_feat), dim=1)

        if answer_seq is not None:
            # Teacher forcing: feed the given tokens one step at a time.
            x = self.embedding(answer_seq)
            hidden_state = None
            outputs = []
            
            for i in range(x.size(1)):
                # The first step has no LSTM state yet, so the question feature
                # serves as the query; later steps use the top-layer hidden state.
                context, _ = self.attention(hidden_state[0][-1] if hidden_state else question_feat, combined_feat)
                lstm_input = torch.cat((x[:, i, :], context), dim=1).unsqueeze(1)
                output, hidden_state = self.lstm(lstm_input, hidden_state)
                outputs.append(self.fc(output.squeeze(1)))
            
            return torch.stack(outputs, dim=1)


        else:
            # Generation: beam search, run independently for every sample.
            batch_size = combined_feat.size(0)
            device = image_feat.device
            # Index of "<eos>" in the vocabularies built by build_vocab.
            end_token = 3 
            
            all_results = []
            
            for b in range(batch_size):
                # Slicing with b:b+1 keeps the batch dimension (size 1).
                b_question_feat = question_feat[b:b+1]
                b_combined_feat = combined_feat[b:b+1]
                
                # Each beam is a tuple (token sequence, cumulative log-prob, LSTM state).
                beams = [(torch.tensor([[2]], dtype=torch.long, device=device),  # Start token
                          0.0,  # Log probability score
                          None)]  # Initial hidden state
                
                completed_beams = []
                
                for _ in range(answer_maxlength):
                    candidates = []
                    
                    for seq, score, hidden_state in beams:
                        # A beam that already emitted the end token is set aside
                        # and not expanded any further.
                        if seq[0, -1].item() == end_token:
                            completed_beams.append((seq, score, hidden_state))
                            continue
                        
                        # Embed only the most recent token of the beam.
                        x = self.embedding(seq[:, -1])
                        
                        prev_hidden = hidden_state[0][-1] if hidden_state else b_question_feat
                        context, _ = self.attention(prev_hidden, b_combined_feat)
                        
                        lstm_input = torch.cat((x, context), dim=1).unsqueeze(1)
                        
                        output, new_hidden = self.lstm(lstm_input, hidden_state)
                        
                        logits = self.fc(output.squeeze(1))
                        log_probs = F.log_softmax(logits, dim=1)
                        
                        topk_log_probs, topk_indices = log_probs.topk(self.k_beam)
                        
                        # Expand the beam with each of its k most likely next tokens.
                        for i in range(self.k_beam):
                            next_token = topk_indices[:, i:i+1]
                            next_score = score + topk_log_probs[:, i].item()
                            next_seq = torch.cat([seq, next_token], dim=1)
                            candidates.append((next_seq, next_score, new_hidden))
                    
                    # Every beam was already finished: nothing left to expand.
                    if not candidates:
                        break
                    
                    # Keep the k best candidates by cumulative log-probability
                    # (scores are not length-normalised).
                    candidates.sort(key=lambda x: x[1], reverse=True)
                    beams = candidates[:self.k_beam]
                    
                    if all(beam[0][0, -1].item() == end_token for beam in beams):
                        completed_beams.extend(beams)
                        break
                
                # Prefer finished hypotheses; fall back to the best open beam.
                if completed_beams:
                    completed_beams.sort(key=lambda x: x[1], reverse=True)
                    best_seq = completed_beams[0][0]
                else:
                    beams.sort(key=lambda x: x[1], reverse=True)
                    best_seq = beams[0][0]
                
                all_results.append(best_seq)
            
            # Right-pad every result with the end token so the rows can be
            # concatenated into one (batch, max_len) tensor.
            max_len = max(seq.size(1) for seq in all_results)
            padded_results = []
            
            for seq in all_results:
                if seq.size(1) < max_len:
                    padding = torch.full((1, max_len - seq.size(1)), end_token, dtype=torch.long, device=device)
                    padded_seq = torch.cat([seq, padding], dim=1)
                    padded_results.append(padded_seq)
                else:
                    padded_results.append(seq)
            
            return torch.cat(padded_results, dim=0)
        

In [None]:
# Smoke test of the individual components on ONE training batch.
# The batch is drawn once and unpacked. train_loader shuffles, so calling
# next(iter(train_loader)) once per field would take the images, questions
# and answers from three unrelated batches and start three sets of workers.
first_image_batch, first_questions_batch, first_answers_batch = (
    field.to(device) for field in next(iter(train_loader))
)

cnn_test = CNN_Feature_Extractor_pretrained().to(device)

image_feat = cnn_test(first_image_batch)

print("Input shape:", first_image_batch.shape)
print("Output shape:", image_feat.shape)


encoder_test = Question_Encoder(questions_vocab_size).to(device)

question_feat = encoder_test(first_questions_batch)

print("Input shape:", first_questions_batch.shape)
print("Output shape:", question_feat.shape)

decoder_test = Answer_Decoder(answers_vocab_size).to(device)

# Teacher-forced pass (logits per position) ...
predicted_training = decoder_test(question_feat, image_feat, first_answers_batch)
# ... and free generation; no gradients are needed for the beam search.
with torch.no_grad():
    predicted_generate = decoder_test(question_feat, image_feat)


In [None]:
class VQA_Model(nn.Module):
    """Full VQA network: image encoder + question encoder + answer decoder."""

    def __init__(self, questions_vocab_size, answers_vocab_size, k_beam = 3):
        """
        Args:
            questions_vocab_size: Vocabulary size passed to the question encoder.
            answers_vocab_size: Vocabulary size passed to the answer decoder.
            k_beam: Beam width used by the decoder during generation.
        """
        super().__init__()

        # Each sub-module is moved to the module-level `device` on creation.
        image_encoder = CNN_Feature_Extractor_pretrained()
        self.image_encoder_resnet50_pretrained = image_encoder.to(device)

        question_encoder = Question_Encoder(questions_vocab_size)
        self.question_encoder = question_encoder.to(device)

        answer_decoder = Answer_Decoder(answers_vocab_size, k_beam = k_beam)
        self.answer_decoder = answer_decoder.to(device)

    def forward(self, image, question, answer_seq = None):
        """Encode image and question, then hand both features to the decoder.

        The decoder's result is returned unchanged; `answer_seq` is forwarded
        to it as given (None by default).
        """
        image_feat = self.image_encoder_resnet50_pretrained(image)
        question_feat = self.question_encoder(question)
        return self.answer_decoder(question_feat, image_feat, answer_seq)

In [None]:
# Component smoke test on ONE training batch, now also printing decoder shapes.
# The batch is drawn once and unpacked. train_loader shuffles, so calling
# next(iter(train_loader)) once per field would take the images, questions
# and answers from three unrelated batches and start three sets of workers.
first_image_batch, first_questions_batch, first_answers_batch = (
    field.to(device) for field in next(iter(train_loader))
)

cnn_test = CNN_Feature_Extractor_pretrained().to(device)

image_feat = cnn_test(first_image_batch)

print("Input shape:", first_image_batch.shape)
print("Output shape:", image_feat.shape)


encoder_test = Question_Encoder(questions_vocab_size).to(device)

question_feat = encoder_test(first_questions_batch)

print("Input shape:", first_questions_batch.shape)
print("Output shape:", question_feat.shape)

decoder_test = Answer_Decoder(answers_vocab_size).to(device)

# Teacher-forced pass (logits per position) ...
predicted_training = decoder_test(question_feat, image_feat, first_answers_batch)
# ... and free generation; no gradients are needed for the beam search.
with torch.no_grad():
    predicted_generate = decoder_test(question_feat, image_feat)

print("Input shape:", first_answers_batch.shape)
print("Output shape:", predicted_training.shape)
print("Output shape:", predicted_generate.shape)

In [None]:
# Most likely token id at every position of the teacher-forced output.
predicted_training.argmax(dim = -1)

In [None]:
# Token ids produced by beam-search generation.
predicted_generate

In [None]:
# End-to-end check of the full model in teacher-forcing mode.
demo_model = VQA_Model(questions_vocab_size, answers_vocab_size, k_beam = 3)

# The last answer token is dropped from the decoder input, the same input
# shift the training loop uses.
output_train = demo_model(first_image_batch, first_questions_batch, first_answers_batch[:, :-1])
print('Output train shape:', output_train.shape)

In [None]:
# End-to-end check of the full model in generation (beam-search) mode.
# Inference should run in eval mode (dropout off, fixed batch-norm
# statistics) and without autograd bookkeeping.
demo_model.eval()
with torch.no_grad():
    output_generate = demo_model(first_image_batch, first_questions_batch)
print('Output predict shape:', output_generate.shape)

## Utils

In [None]:
def tensor_to_text(tensor, idx2word):
    """Convert a batch of token-id sequences into plain-text sentences.

    For every row, ids missing from *idx2word* are skipped, the first "<sos>"
    word is removed, and the text is cut at the first "<eos>" marker.

    Args:
        tensor: Iterable of sequences whose elements expose `.item()`
            (e.g. a 2-D torch tensor).
        idx2word: Mapping from integer index to word.

    Returns:
        A list with one stripped sentence string per row.
    """
    def _decode(row):
        ids = (element.item() for element in row)
        words = [idx2word[i] for i in ids if i in idx2word]
        try:
            words.remove("<sos>")  # drops the first occurrence only
        except ValueError:
            pass
        text_before_eos, _, _ = " ".join(words).partition("<eos>")
        return text_before_eos.strip()

    return [_decode(row) for row in tensor]


In [None]:
# Decode the reference answers of one training batch into text.
# NOTE(review): this draws a fresh batch from the shuffling train_loader, so
# these answers presumably do not belong to the images/questions used for
# output_generate - confirm before comparing the two.
first_answers_batch = next(iter(train_loader))[2].to(device)
answer_text = tensor_to_text(first_answers_batch, idx2word_answers)
answer_text


In [None]:
# Decode the generated token ids of the demo model into text.
predicted_text = tensor_to_text(output_generate, idx2word_answers)
predicted_text


In [None]:
import math
from collections import Counter

def ngram_precision(reference, candidate, n):
    """Return the clipped n-gram precision of *candidate* against *reference*.

    Both arguments are token sequences. Each candidate n-gram is credited at
    most as often as it occurs in the reference.

    Returns:
        overlap / number of candidate n-grams, or 0 when the candidate has no
        n-grams of order *n*.
    """
    ref_ngrams = Counter(tuple(reference[i:i + n]) for i in range(len(reference) - n + 1))
    cand_ngrams = Counter(tuple(candidate[i:i + n]) for i in range(len(candidate) - n + 1))

    total = sum(cand_ngrams.values())
    if total == 0:
        return 0

    # Counter intersection keeps the minimum count per n-gram (clipping).
    overlap = sum((cand_ngrams & ref_ngrams).values())
    return overlap / total


def brevity_penalty(reference, candidate):
    """Return the BLEU brevity penalty for one sentence pair.

    1 when the candidate is longer than the reference,
    exp(1 - ref_len / cand_len) otherwise, and 0 for an empty candidate.
    """
    ref_len = len(reference)
    cand_len = len(candidate)

    if cand_len == 0:
        return 0
    if cand_len > ref_len:
        return 1
    return math.exp(1 - ref_len / cand_len)


def _as_tokens(sentence):
    """Split a plain string on whitespace; pass token sequences through unchanged."""
    return sentence.split() if isinstance(sentence, str) else sentence


def _sentence_bleu(ref, cand, max_n):
    """BLEU of one tokenised sentence pair (no smoothing)."""
    # Only score n-gram orders that both sentences can actually contain, so a
    # perfect match shorter than max_n tokens still scores 1.
    effective_n = min(max_n, len(ref), len(cand))
    if effective_n < 1:
        return 0

    precisions = [ngram_precision(ref, cand, n) for n in range(1, effective_n + 1)]
    # A zero precision makes the geometric mean zero. Skipping the zero term
    # while still dividing by the number of orders would count it as perfect.
    if min(precisions) == 0:
        return 0

    geometric_mean = math.exp(sum(math.log(p) for p in precisions) / effective_n)
    return brevity_penalty(ref, cand) * geometric_mean


def compute_bleu(reference_sentences, candidate_sentences, max_n=4):
    """Average sentence-level BLEU over paired references and candidates.

    Args:
        reference_sentences: Ground-truth sentences, each either a list of
            tokens or a plain string (split on whitespace).
        candidate_sentences: Predicted sentences in the same format and order.
        max_n: Highest n-gram order to score.

    Returns:
        Mean BLEU in [0, 1], or 0 when no sentence pairs are given.

    Raises:
        AssertionError: If the two lists differ in length.
    """
    assert len(reference_sentences) == len(candidate_sentences), "Số lượng câu tham chiếu và câu dự đoán phải bằng nhau."

    bleu_scores = [
        # Strings are tokenised first; otherwise n-grams would be built from
        # single characters instead of words.
        _sentence_bleu(_as_tokens(ref), _as_tokens(cand), max_n)
        for ref, cand in zip(reference_sentences, candidate_sentences)
    ]

    return sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0


In [None]:
# Toy example for compute_bleu: two tokenised references and their candidates.
reference_sentences = [
    "the cat is on the mat".split(),
    "there is a cat on the mat".split()
]

candidate_sentences = [
    "the cat is mat".split(),
    "there is cat on mat".split()
]

bleu_score = compute_bleu(reference_sentences, candidate_sentences)
bleu_score

In [None]:
# BLEU between the decoded reference answers and the demo model's generated text.
# NOTE(review): both arguments are lists of plain strings, not token lists -
# confirm compute_bleu handles that input as intended.
bleu_score = compute_bleu(answer_text, predicted_text)
bleu_score

In [None]:
import time

def train_model(model, train_loader, eval_loader, criterion, optimizer, best_model_path, num_epochs=10, patience=5, device=device):
    """Train *model* with teacher forcing, evaluate each epoch, and early-stop.

    After every epoch the model is evaluated on *eval_loader* twice per batch:
    once teacher-forced (for the loss) and once in generation mode (for BLEU).
    The weights with the lowest evaluation loss are saved to *best_model_path*.

    Args:
        model: Callable as model(images, questions, answer_seq) for logits and
            model(images, questions) for generated token ids.
        train_loader: Iterable of (images, questions, answers) training batches.
        eval_loader: Iterable of (images, questions, answers) evaluation batches.
        criterion: Loss taking (logits [N, vocab], targets [N]).
        optimizer: Optimizer over the model parameters.
        best_model_path: File path for the best state_dict.
        num_epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without a lower
            evaluation loss.
        device: Device the model and batches are moved to.

    Returns:
        dict with the per-epoch lists "train_loss", "eval_loss" and "bleu_score".
    """
    model.to(device)
    best_loss = float('inf')
    no_improve_epochs = 0

    history = {
        "train_loss": [],
        "eval_loss": [],
        "bleu_score": []
    }

    def _teacher_forced_loss(images, questions, answers):
        # Input is the answer without its last token; the target is the answer
        # shifted left by one, so position t predicts token t + 1.
        logits = model(images, questions, answers[:, :-1])
        return criterion(logits.reshape(-1, logits.size(-1)), answers[:, 1:].reshape(-1))

    for epoch in range(num_epochs):
        start_time = time.time()

        model.train()
        total_loss = 0
        for images, questions, answers in train_loader:
            images, questions, answers = images.to(device), questions.to(device), answers.to(device)

            optimizer.zero_grad()
            loss = _teacher_forced_loss(images, questions, answers)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_train_loss = total_loss / max(len(train_loader), 1)

        model.eval()
        eval_loss = 0
        bleu_scores = []

        with torch.no_grad():
            for images, questions, answers in eval_loader:
                images, questions, answers = images.to(device), questions.to(device), answers.to(device)

                eval_loss += _teacher_forced_loss(images, questions, answers).item()

                predicted_answers_test = tensor_to_text(model(images, questions), idx2word_answers)
                answers_text = tensor_to_text(answers, idx2word_answers)

                # compute_bleu expects (references, candidates). Passing the
                # predictions first would invert both the precision direction
                # and the brevity penalty.
                bleu = compute_bleu(answers_text, predicted_answers_test)
                bleu_scores.append(bleu)

        avg_eval_loss = eval_loss / max(len(eval_loader), 1)
        avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

        end_time = time.time()
        epoch_time = end_time - start_time

        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Evaluation Loss: {avg_eval_loss:.4f}, BLEU Score: {avg_bleu_score:.4f} -- Time: {epoch_time:.2f}s")

        history["train_loss"].append(avg_train_loss)
        history["eval_loss"].append(avg_eval_loss)
        history["bleu_score"].append(avg_bleu_score)

        if avg_eval_loss < best_loss:
            best_loss = avg_eval_loss
            no_improve_epochs = 0
            torch.save(model.state_dict(), best_model_path)
            print("Best model saved!")
        else:
            no_improve_epochs += 1
            print(f"No improvement for {no_improve_epochs}/{patience} epochs.")

            if no_improve_epochs >= patience:
                print("Early stopping triggered! Training stopped.")
                break

    return history


## Train model

In [None]:
# Build the model, loss and optimizer, then start training.
VQA_model = VQA_Model(questions_vocab_size, answers_vocab_size)

# Index 1 is "<pad>" in the vocabularies built by build_vocab, so padded
# target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=1)
optimizer = AdamW(VQA_model.parameters(), lr=1e-4, weight_decay=1e-2)

def train_vqa(train_loader, eval_loader, num_epochs=10):
    """Run train_model on the global VQA_model/criterion/optimizer and return its history."""
    return train_model(VQA_model, train_loader, eval_loader, criterion, optimizer, r'/kaggle/working/VAQ_model.pth', num_epochs)

# Up to 50 epochs; train_model stops early once the eval loss stops improving.
VQA_model_history = train_vqa(train_loader, eval_loader, num_epochs=50)


## Result

In [None]:
def plot_training_history(history):
    """Plot loss curves and BLEU score per epoch in two stacked panels.

    Args:
        history: dict with equally long lists under the keys 'train_loss',
            'eval_loss' and 'bleu_score'.
    """
    epochs = range(1, len(history['train_loss']) + 1)

    _, (loss_ax, bleu_ax) = plt.subplots(2, 1, figsize=(10, 8))

    # Top panel: training vs. evaluation loss.
    loss_ax.plot(epochs, history['train_loss'], label='Train Loss', marker='o', linestyle='-')
    loss_ax.plot(epochs, history['eval_loss'], label='Eval Loss', marker='s', linestyle='--')
    loss_ax.set_title('Training and Evaluation Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Bottom panel: BLEU score.
    bleu_ax.plot(epochs, history['bleu_score'], label='BLEU Score', marker='d', color='g')
    bleu_ax.set_title('BLEU Score Over Epochs')
    bleu_ax.set_xlabel('Epochs')
    bleu_ax.set_ylabel('BLEU Score')
    bleu_ax.legend()
    bleu_ax.grid(True)

    plt.tight_layout()
    plt.show()


In [None]:
# Visualise the loss and BLEU curves recorded during training.
plot_training_history(VQA_model_history)

## Test Model

In [None]:
def test_model(model, question, image_path, ground_truth, idx2word):
    """Answer one question about one image and display the result.

    Args:
        model: Model callable as model(image_batch, question_batch) that
            returns generated token ids.
        question: Question text.
        image_path: Path of the image file.
        ground_truth: Reference answer, shown in the plot title only.
        idx2word: Mapping index -> word used to decode the prediction.

    Returns:
        The predicted answer as a string.
    """
    # Deterministic preprocessing (no augmentation) for inference.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    image = Image.open(image_path).convert("RGB")
    # unsqueeze(0) adds the batch dimension expected by the model.
    image_batch = preprocess(image).unsqueeze(0).to(device)

    encoded_question = text_to_tensor(question, vocab_questions, len_max_question)
    question_batch = encoded_question.unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        generated_ids = model(image_batch, question_batch)

    predicted_answer = tensor_to_text(generated_ids, idx2word)[0]

    caption = f"Q: {question}\nPredicted answer: {predicted_answer}\nGround Truth: {ground_truth}"
    plt.imshow(image)
    plt.axis("off")
    plt.title(caption, fontsize=12)
    plt.show()

    return predicted_answer


In [None]:
def test_random_samples(model, eval_dataframe, idx2word, n_samples=15,
                        image_dir=r'/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/images'):
    """Run test_model on random rows of *eval_dataframe* and display each result.

    Args:
        model: Model passed through to test_model.
        eval_dataframe: DataFrame with the columns 'question', 'image_id'
            (without file extension) and 'response'.
        idx2word: Mapping index -> word used to decode predictions.
        n_samples: Number of rows to draw. It is capped at the number of
            available rows, because DataFrame.sample raises when asked for
            more rows than exist.
        image_dir: Directory containing the '<image_id>.png' files.
    """
    sample_count = min(n_samples, len(eval_dataframe))
    samples = eval_dataframe.sample(n=sample_count)

    for _, row in samples.iterrows():
        image_file = os.path.join(image_dir, row['image_id'] + '.png')
        test_model(model, row['question'], image_file, row['response'], idx2word)


In [None]:
# Qualitative check: show predictions of the trained model on random evaluation rows.
test_random_samples(VQA_model, eval_dataframe, idx2word_answers)