In [None]:
import nltk
# Download the NLTK resources named "wordnet" and "omw-1.4"; the evaluation
# cells later import `wordnet` from `nltk.corpus` to compute Wu-Palmer similarity.
nltk.download("wordnet")
nltk.download("omw-1.4")

In [None]:
# Use the %pip magic explicitly so the package is installed into the environment
# of the running kernel rather than relying on automagic resolving a bare `pip`.
%pip install monai

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torch.nn.utils.rnn import pad_sequence
from PIL import Image
import pandas as pd
import ast
import monai
from monai.transforms import Compose, LoadImage, EnsureChannelFirst, Resize, ScaleIntensity, ToTensor
from monai.networks.nets import DenseNet121
from monai.data import Dataset

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem.
drive.mount('/content/drive')

# Switch into the dataset folder so relative file names resolve against it,
# then report where we ended up.
dataset_dir = '/content/drive/My Drive/Handxray_Dataset/handxray'
os.chdir(dataset_dir)
print("Current Working Directory:", os.getcwd())

In [None]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

In [None]:
# Read the encoded question/answer spreadsheet (path is relative to the
# current working directory) into a DataFrame.
file_path = 'VQA_QAEncoded.xlsx'
df = pd.read_excel(io=file_path)

In [None]:
# Ensure question_encoded and answer_encoded are properly formatted
# NOTE(review): the literal_eval conversion below is disabled, so `question_encoded`
# keeps whatever type pd.read_excel produced. If those cells are strings such as
# "[3, 7, 1]" (presumably, since they come from a spreadsheet — TODO confirm), every
# later `for token in q` iterates characters rather than token ids. Re-enabling the
# conversion changes the vocabulary size, so the model's vocab_size must be revisited too.
#df['question_encoded'] = df['question_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# Cast the answer labels to integers.
df['answer_encoded'] = df['answer_encoded'].astype(int)

In [None]:
# Sanity check: every answer should be an integer in [0, answer_vocab_size - 1].
answer_labels = df['answer_encoded'].unique()
print("Unique answer labels:", answer_labels)
print("Answer vocabulary size:", len(answer_labels))

In [None]:
# Collect every distinct token that occurs in any question, in sorted order,
# and number the tokens consecutively starting from 0.
unique_tokens = sorted({token for q in df['question_encoded'] for token in q})
question_vocab = dict(zip(unique_tokens, range(len(unique_tokens))))  # token -> 0-based index


In [None]:
def _remap_question(tokens):
    """Translate one question's tokens into their 0-based vocabulary indices."""
    return [question_vocab[token] for token in tokens]


# Replace every question by its re-indexed form.
df['question_encoded'] = df['question_encoded'].apply(_remap_question)


In [None]:
# # Ensure correct token mapping
# unique_tokens = set(token for q in df['question_encoded'] for token in q)
# question_vocab = {token: idx for idx, token in enumerate(sorted(unique_tokens))}


In [None]:
# Build question_vocab from unique tokens in questions
#question_vocab = set(token for q in df['question_encoded'] for token in q)
#question_vocab = {token: idx for idx, token in enumerate(question_vocab)}

In [None]:
# 🔹 MONAI image preprocessing pipeline, defined ahead of the dataset that receives it
_preprocessing_steps = [
    LoadImage(image_only=True),  # read the image file (image only)
    EnsureChannelFirst(),        # put the channel axis first
    Resize((224, 224)),          # adjust size based on model input requirements
    ScaleIntensity(),            # normalize pixel values
    ToTensor(),                  # convert to a PyTorch tensor
]
monai_transforms = Compose(_preprocessing_steps)

In [None]:
class VQADataset(Dataset):
    """Dataset yielding ``(image, question, attention_mask, answer)`` per DataFrame row.

    Every row must provide the columns ``image_path``, ``question_encoded``
    (a sequence of integer token ids) and ``answer_encoded`` (an integer label).

    Args:
        dataframe: table holding one VQA sample per row.
        transform: optional callable applied to the image *path*. It must load the
            file itself and return the preprocessed image (a pipeline that starts
            with ``LoadImage`` does this). When omitted, the built-in pipeline
            LoadImage -> EnsureChannelFirst -> ScaleIntensity -> Resize((224, 224))
            is used instead.
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        # Fallback pipeline, used only when no `transform` is supplied.
        self.image_loader = LoadImage(image_only=True)
        self.monai_transforms = Compose([
            EnsureChannelFirst(),
            ScaleIntensity(),
            Resize((224, 224))
        ])

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load and encode the sample stored at positional index ``idx``."""
        # Fetch the row once instead of re-indexing the DataFrame per column.
        row = self.dataframe.iloc[idx]

        # Image: honour the caller-supplied transform, else use the built-in pipeline.
        img_path = row['image_path']
        if self.transform is not None:
            image = self.transform(img_path)
        else:
            image = self.monai_transforms(self.image_loader(img_path))

        # Convert without torch.tensor(<tensor>), which copy-constructs and warns.
        # MONAI tensor wrappers expose as_tensor() to obtain a plain torch.Tensor.
        if hasattr(image, "as_tensor"):
            image = image.as_tensor()
        image = torch.as_tensor(image, dtype=torch.float32)

        # Question tokens plus an all-ones mask marking the real (unpadded) positions.
        question_encoded = row['question_encoded']
        question = torch.tensor(question_encoded, dtype=torch.long)
        attention_mask = torch.ones(len(question_encoded), dtype=torch.long)

        # Answer label as a 0-dim long tensor.
        answer = torch.tensor(int(row['answer_encoded']), dtype=torch.long)

        return image, question, attention_mask, answer

In [None]:
def collate_fn(batch):
    """Merge a list of ``(image, question, attention_mask, answer)`` samples into a batch.

    Images are stacked along a new leading axis; questions and attention masks are
    right-padded with 0 to the longest sequence in the batch; answers become one
    1-D long tensor.

    Returns:
        Tuple ``(images, input_ids, attention_masks, answers)``.
    """
    image_list, question_list, mask_list, answer_list = (list(column) for column in zip(*batch))

    return (
        torch.stack(image_list),
        pad_sequence(question_list, batch_first=True, padding_value=0),
        pad_sequence(mask_list, batch_first=True, padding_value=0),
        torch.tensor(answer_list, dtype=torch.long),
    )

In [None]:
# Build the dataset over the full DataFrame, handing it the MONAI pipeline as `transform`.
dataset = VQADataset(df, transform=monai_transforms)

In [None]:
from torch.utils.data import DataLoader, random_split

# Split dataset (80% train, 20% test). A seeded generator makes the split
# reproducible, so train/test membership does not change between runs.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Define DataLoaders: shuffle only the training data; both use the padding collate_fn.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [None]:
# #Create DataLoader
# dataset = VQADataset(df, transform=monai_transforms)
# data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Peek at the first training batch to confirm the tensor shapes.
for images, input_ids, attention_masks, answers in train_loader:
    shape_report = (
        f"Images shape: {images.shape}",
        f"Input IDs shape: {input_ids.shape}",
        f"Attention masks shape: {attention_masks.shape}",
        f"Answers shape: {answers.shape}",
    )
    print("\n".join(shape_report))
    break  # only the first batch is needed

In [None]:
class MONAI_LSTM_Model(nn.Module):
    """VQA classifier: a small CNN encodes the image, an LSTM encodes the question,
    and an MLP classifies the concatenated features.

    Args:
        vocab_size: number of rows in the question embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state (the question feature).
        image_feature_dim: size of the projected image feature.
        num_classes: number of answer classes (size of the output logits).
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, image_feature_dim=256, num_classes=10):
        super().__init__()

        self.image_feature_dim = image_feature_dim

        # 🔹 Image feature extractor (expects 3-channel input); the adaptive pool
        # fixes the spatial output to 8x8 regardless of the input resolution.
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((8, 8))  # output: [batch_size, 128, 8, 8]
        )

        # 🔹 Project the flattened CNN output to `image_feature_dim`
        self.image_fc = nn.Linear(128 * 8 * 8, image_feature_dim)

        # 🔹 Question encoder
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # 🔹 Classifier over [question feature, image feature]
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim + image_feature_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, questions, attention_masks=None):
        """Compute answer logits.

        Args:
            images: float tensor of shape (batch, 3, H, W).
            questions: long tensor of token ids, shape (batch, seq_len).
            attention_masks: optional tensor of shape (batch, seq_len) with 1 for real
                tokens and 0 for padding; padding is assumed to be on the right.
                When given, the LSTM stops at each question's true length, so the
                result does not depend on how much padding the batch added.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized class scores.
        """
        # 🟢 Image branch: CNN -> flatten -> linear projection
        image_features = self.cnn(images).flatten(start_dim=1)
        image_features = self.image_fc(image_features)

        # 🟢 Question branch
        embedded = self.embedding(questions)
        if attention_masks is not None:
            # True lengths per question; clamp so an empty question still has length 1
            # (pack_padded_sequence rejects zero lengths). Lengths must live on the CPU.
            lengths = attention_masks.sum(dim=1).long().clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            _, (hidden, _) = self.lstm(packed)
        else:
            _, (hidden, _) = self.lstm(embedded)
        # Final hidden state of the last LSTM layer: (batch, hidden_dim)
        question_features = hidden[-1]

        # 🟢 Fuse both modalities and classify
        combined = torch.cat((question_features, image_features), dim=1)
        return self.fc(combined)


In [None]:
# Model Initialization
# Size the embedding table and the classifier head from the data instead of
# hard-coding them, so an out-of-range token id or label cannot slip through.
vocab_size = len(question_vocab)
num_classes = int(df['answer_encoded'].max()) + 1  # labels are 0-based class indices

model = MONAI_LSTM_Model(
    vocab_size=vocab_size,
    embedding_dim=128,
    hidden_dim=256,
    image_feature_dim=256,
    num_classes=num_classes,
)

# Move Model to Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define Loss & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
# # 🔹 Define Model, Loss, and Optimizer (vocab_size=len(question_vocab))
# #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = MONAI_LSTM_Model(vocab_size=len(question_vocab), num_classes=df['answer_encoded'].nunique()).to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:

# 🔹 Training Loop
def train(model, dataloader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``dataloader``.

    Batches are moved to the device the model's parameters already live on, so the
    function does not depend on a notebook-level ``device`` variable. After each
    epoch it prints the summed batch loss, the accuracy, and the predictions of the
    last batch.

    Args:
        model: module called as ``model(images, input_ids, attention_masks)``.
        dataloader: iterable of ``(images, input_ids, attention_masks, answers)``.
        criterion: loss called as ``criterion(outputs, answers)``.
        optimizer: optimizer over ``model``'s parameters.
        num_epochs: number of passes over ``dataloader``.
    """
    # Follow the model's own placement; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0
        predicted = None  # stays None when the dataloader yields no batch

        for images, input_ids, attention_masks, answers in dataloader:
            images = images.to(target_device)
            input_ids = input_ids.to(target_device)
            attention_masks = attention_masks.to(target_device)
            answers = answers.to(target_device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_masks)
            loss = criterion(outputs, answers)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == answers).sum().item()
            total += answers.size(0)

        # Guard against an empty dataloader instead of dividing by zero.
        accuracy = correct / total if total else 0.0
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")
        print(f"Predicted:[{predicted}]")

In [None]:
import torch
import matplotlib.pyplot as plt

# 🔹 Train Function with Loss Tracking
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: a training pass if ``optimizer`` is given, else evaluation.

    Returns:
        ``(mean loss per batch, accuracy)``; both are 0.0 when ``loader`` yields nothing.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    loss_sum, n_correct, n_seen, n_batches = 0.0, 0, 0, 0
    with torch.set_grad_enabled(is_training):
        for images, input_ids, attention_masks, answers in loader:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            answers = answers.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(images, input_ids, attention_masks)
            loss = criterion(outputs, answers)
            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, predicted = torch.max(outputs, 1)
            n_correct += (predicted == answers).sum().item()
            n_seen += answers.size(0)
            n_batches += 1

    # Guard the averages so an empty loader reports 0.0 instead of dividing by zero.
    mean_loss = loss_sum / n_batches if n_batches else 0.0
    accuracy = n_correct / n_seen if n_seen else 0.0
    return mean_loss, accuracy


def train(model, train_loader, test_loader, criterion, optimizer, num_epochs, device):
    """Train ``model``, evaluate it after every epoch, then save weights and a loss plot.

    Per epoch it prints the mean train/test batch loss and the train/test accuracy.
    When all epochs are done it writes the model's state dict to ``vqa_model.pth``,
    plots both loss curves, saves the figure as ``loss_curve.png`` and shows it.

    Args:
        model: module called as ``model(images, input_ids, attention_masks)``.
        train_loader: iterable of training batches
            ``(images, input_ids, attention_masks, answers)``.
        test_loader: iterable of held-out batches with the same layout.
        criterion: loss called as ``criterion(outputs, answers)``.
        optimizer: optimizer over ``model``'s parameters.
        num_epochs: number of epochs to run.
        device: device the model and every batch are moved to.
    """
    model.to(device)
    train_losses, test_losses = [], []

    for epoch in range(num_epochs):
        # 🔹 Training pass, then validation pass (leaves the model in eval mode)
        avg_train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, device, optimizer)
        avg_test_loss, test_accuracy = _run_epoch(model, test_loader, criterion, device)
        train_losses.append(avg_train_loss)
        test_losses.append(avg_test_loss)

        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}\n")

    # 🔹 Save Model
    torch.save(model.state_dict(), "vqa_model.pth")
    print("Model saved as vqa_model.pth")

    # 🔹 Plot Training & Validation Loss
    epochs = range(1, num_epochs + 1)
    plt.plot(epochs, train_losses, label="Train Loss", marker="o")
    plt.plot(epochs, test_losses, label="Test Loss", marker="s")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss Curve")
    plt.legend()
    plt.grid()
    plt.savefig("loss_curve.png")
    plt.show()


In [None]:
# Training configuration: epoch count and compute device.
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train on the training split and evaluate on the test split after every epoch.
train(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
    device=device,
)


In [None]:
# 🔹 Run Training
# `train` is bound to the loss-tracking version at this point, which also requires
# the test loader and the device; the shorter call shape raised a TypeError.
train(model, train_loader, test_loader, criterion, optimizer, num_epochs=10, device=device)

In [None]:
from nltk.corpus import wordnet
import numpy as np

def wup_similarity(pred, gt, threshold=0.8):
    """Thresholded Wu-Palmer similarity between a predicted and a ground-truth answer.

    Args:
        pred: predicted answer string.
        gt: ground-truth answer string.
        threshold: similarity at or above which the prediction counts as fully correct.

    Returns:
        1 when the strings are identical or their best synset-pair similarity
        reaches ``threshold``; 0 when either string has no WordNet synsets;
        otherwise the best similarity found.
    """
    # An exact match is fully correct even when WordNet has no entry for the string
    # (previously such a pair fell through to the "no synsets" branch and scored 0).
    if pred == gt:
        return 1

    pred_synsets = wordnet.synsets(pred)
    gt_synsets = wordnet.synsets(gt)

    if not pred_synsets or not gt_synsets:
        return 0  # no synsets for one side: nothing to compare

    # Best similarity over all synset pairs; `or 0` covers pairs with no score (None).
    max_sim = max(wordnet.wup_similarity(p, g) or 0 for p in pred_synsets for g in gt_synsets)
    return 1 if max_sim >= threshold else max_sim


In [None]:
from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def evaluate_model(model, test_loader, device, threshold=0.8, idx_to_answer=None):
    """Evaluate ``model`` on ``test_loader`` and report classification and WUPS metrics.

    Args:
        model: module called as ``model(images, input_ids, attention_masks)``.
        test_loader: iterable of ``(images, input_ids, attention_masks, answers)``.
        device: device every batch is moved to before the forward pass.
        threshold: WUPS threshold forwarded to ``wup_similarity``.
        idx_to_answer: optional mapping (dict or sequence) from class index to the
            answer text compared by WUPS. When omitted, the class index itself is
            stringified, as before.
            NOTE(review): WUPS over stringified indices is probably not meaningful —
            pass the answer vocabulary here if one is available.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1`` (weighted averages),
        ``wups`` (mean per-sample score) and ``results_per_image`` (one dict per sample).
    """
    model.eval()
    all_results = []  # per-sample result records
    all_preds, all_labels, all_wups = [], [], []
    samples_seen = 0  # running offset, so indices stay unique across batches

    def _as_text(class_idx):
        """Return the string handed to WUPS for one class index."""
        if idx_to_answer is None:
            return str(class_idx)
        return str(idx_to_answer[class_idx])

    with torch.no_grad():
        for images, input_ids, attention_masks, answers in test_loader:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            answers = answers.to(device)

            # 🔹 Class probabilities via softmax over the logits
            outputs = model(images, input_ids, attention_masks)
            probabilities = softmax(outputs, dim=1).cpu().numpy()
            preds = np.argmax(probabilities, axis=1)
            labels = answers.cpu().numpy()

            # 🔹 Store results per image
            for offset, (pred, label, probs) in enumerate(zip(preds, labels, probabilities)):
                pred, label = int(pred), int(label)
                wups = wup_similarity(_as_text(pred), _as_text(label), threshold)
                all_results.append({
                    # batch_idx * len(images) collided whenever the last batch was smaller
                    "image_index": samples_seen + offset,
                    "true_label": label,
                    "predicted_label": pred,
                    "class_probabilities": probs.tolist(),
                    "wups_score": wups,
                })
                all_preds.append(pred)
                all_labels.append(label)
                all_wups.append(wups)

            samples_seen += len(labels)

    # Compute Overall Metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="weighted", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="weighted", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="weighted", zero_division=0)
    wups_score = np.mean(all_wups)

    # Print Overall Metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"WUPS Score (Threshold {threshold}): {wups_score:.4f}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "wups": wups_score,
        "results_per_image": all_results
    }

# 🔹 Evaluate the model on the held-out loader and keep the returned metrics
metrics = evaluate_model(model=model, test_loader=test_loader, device=device)


In [None]:
import pandas as pd

# Turn the per-image result records into a DataFrame and print it without the index.
per_image_records = metrics["results_per_image"]
df_results = pd.DataFrame(per_image_records)
print(df_results.to_string(index=False))

In [None]:
from tabulate import tabulate

# Render the per-image results as a grid table (dict keys become the headers) and print it.
table = tabulate(
    metrics["results_per_image"],
    headers="keys",
    tablefmt="grid",
)
print(table)


In [None]:
# 🔹 Run Evaluation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bind the result: a bare call as the cell's last expression echoes the whole
# returned dict (including every per-image probability list) into the output.
metrics = evaluate_model(model, test_loader, device)

In [None]:
# Debugging: find the highest token index used by any encoded question
per_question_max = [max(q) for q in df['question_encoded']]
max_token_idx = max(per_question_max)
print("Max token index:", max_token_idx)
print("Vocab size:", len(question_vocab))

# Every token index has to fall inside the vocabulary
assert len(question_vocab) > max_token_idx, "ERROR: Some question tokens exceed vocab size!"
