In [1]:
# Load model directly
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import os
from sklearn.metrics import accuracy_score
import torch
from transformers import AutoProcessor, BlipForQuestionAnswering

processor = AutoProcessor.from_pretrained("dineshcr7/Final-BLIP-LORA")
model = BlipForQuestionAnswering.from_pretrained("dineshcr7/Final-BLIP-LORA")



Downloading (…)rocessor_config.json:   0%|          | 0.00/471 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [2]:
class VQAMedDataset(Dataset):
    def __init__(self, qa_pairs_path, image_dir, transform=None):
        """
        Args:
            qa_pairs_path (str): Path to the file containing QA pairs.
            image_dir (str): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on the image.
        """
        with open(qa_pairs_path, 'r', encoding="utf-8") as f:
            lines = f.readlines()
            self.data = [line.strip().split('|') for line in lines]

        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_id, question, answer = self.data[idx]
        image_path = os.path.join(self.image_dir, f"{image_id}.jpg")
        image = Image.open(image_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, question, answer

In [3]:
# the path to the training images
test_image_dir_med = "/kaggle/input/combined-all-data/Data/MED/Test_Images"
qa_pairs_path_med = "/kaggle/input/test-text/VQAMed2019_Test_Questions_w_Ref_Answers.txt"

test_image_dir_rad = "/kaggle/input/combined-all-data/Data/RAD/Images"
qa_pairs_path_rad = "/kaggle/input/test-text/vqa_rad_test_converted.txt"

test_image_dir_slake = "/kaggle/input/combined-all-data/Data/SLAKE/Train_Images"
qa_pairs_path_slake = "/kaggle/input/test-text/slake_test.txt"

# Instantiate the dataset (without image transformations for now)
sample_dataset_med = VQAMedDataset(qa_pairs_path_med, test_image_dir_med)
sample_dataset_rad = VQAMedDataset(qa_pairs_path_rad, test_image_dir_rad)
sample_dataset_slake = VQAMedDataset(qa_pairs_path_slake, test_image_dir_slake)

# Check a sample from the dataset
sample_dataset_med[0]

(<PIL.Image.Image image mode=RGB size=555x711>,
 'what modality is shown?',
 'cta - ct angiography')

In [4]:
sample_dataset_rad[1]

(<PIL.Image.Image image mode=RGB size=986x1200>,
 'are 12 ribs present in the image',
 'yes')

In [5]:
sample_dataset_slake[2]

(<PIL.Image.Image image mode=RGB size=512x512>,
 'What is the main organ in the image?',
 'Lung, Spinal Cord')

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokenizer = processor.tokenizer

In [7]:
# Transformations for the images
transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
])

In [8]:
test_dataset_med = VQAMedDataset(qa_pairs_path_med, test_image_dir_med, transform=transform)
test_data_loader_med = DataLoader(test_dataset_med, batch_size=16, shuffle=False)

test_dataset_med[0]

(tensor([[[-1.6317, -1.3105, -0.8872,  ..., -1.2959, -0.9018, -1.6171],
          [ 0.0617, -0.0259, -0.6682,  ..., -0.0259, -0.5952, -1.5587],
          [-0.6244, -0.9456, -1.6317,  ..., -0.9164, -1.6755, -1.7777],
          ...,
          [-1.7193, -1.7047, -1.1645,  ..., -1.3397, -1.7777, -1.7777],
          [-1.7193, -1.6609, -0.9018,  ..., -1.5879, -1.7777, -1.7777],
          [-1.7047, -1.6755, -1.6025,  ..., -1.7777, -1.7777, -1.7777]],
 
         [[-1.5870, -1.2568, -0.8216,  ..., -1.2418, -0.8366, -1.5720],
          [ 0.1539,  0.0638, -0.5965,  ...,  0.0638, -0.5215, -1.5120],
          [-0.5515, -0.8816, -1.5870,  ..., -0.8516, -1.6320, -1.7371],
          ...,
          [-1.6771, -1.6621, -1.1068,  ..., -1.2869, -1.7371, -1.7371],
          [-1.6771, -1.6170, -0.8366,  ..., -1.5420, -1.7371, -1.7371],
          [-1.6621, -1.6320, -1.5570,  ..., -1.7371, -1.7371, -1.7371]],
 
         [[-1.3238, -1.0110, -0.5986,  ..., -0.9967, -0.6128, -1.3096],
          [ 0.3257,  0.2404,

In [9]:
def predict_answers(model, data_loader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for images, questions, _ in data_loader:
            inputs = tokenizer(questions, return_tensors="pt", padding=True, truncation=True)
            images = images.to(device)
            inputs = {key: val.to(device) for key, val in inputs.items()}
            
            outputs = model.generate(pixel_values=images, input_ids=inputs["input_ids"])
            decoded_predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            predictions.extend(decoded_predictions)
    return predictions

predicted_answers_med = predict_answers(model, test_data_loader_med, device)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [10]:
def classify_answer(answer):
    return 'OPEN' if answer.lower() not in ['yes', 'no'] else 'CLOSED'

predicted_answer_types_med = [classify_answer(ans) for ans in predicted_answers_med]

In [13]:
test_dataset_slake = VQAMedDataset(qa_pairs_path_slake, test_image_dir_slake, transform=transform)
test_data_loader_slake = DataLoader(test_dataset_slake, batch_size=16, shuffle=False)

predicted_answers_slake = predict_answers(model, test_data_loader_slake, device)
predicted_answer_types_slake = [classify_answer(ans) for ans in predicted_answers_slake]

ground_truth_slake = []
with open("/kaggle/input/combined-all-data/Data/SLAKE/slake_test.txt", 'r', encoding="utf-8") as file:
    for line in file:
        _, _, _, answer_type = line.strip().split('|')
        ground_truth_slake.append(answer_type)



In [14]:
open_accuracy_slake = accuracy_score(
    [gt for gt, pred in zip(ground_truth_slake, predicted_answer_types_slake) if gt == 'OPEN'],
    [pred for gt, pred in zip(ground_truth_slake, predicted_answer_types_slake) if gt == 'OPEN']
)
closed_accuracy_slake = accuracy_score(
    [gt for gt, pred in zip(ground_truth_slake, predicted_answer_types_slake) if gt == 'CLOSED'],
    [pred for gt, pred in zip(ground_truth_slake, predicted_answer_types_slake) if gt == 'CLOSED']
)
overall_accuracy_slake = accuracy_score(ground_truth_slake, predicted_answer_types_slake)

print("SLAKE English Dataset")
print(f"Open Accuracy: {open_accuracy_slake}")
print(f"Closed Accuracy: {closed_accuracy_slake}")
print(f"Overall Accuracy: {overall_accuracy_slake}")

SLAKE English Dataset
Open Accuracy: 0.7782945736434108
Closed Accuracy: 0.8581730769230769
Overall Accuracy: 0.8096135721017907
