# Model Evaluation and Visualisation

This notebook evaluates the performance of each trained model on the test set. For each model, it:

- Computes and displays the confusion matrix, AUC-ROC curve, and precision-recall curve.
- Provides key metrics such as accuracy, recall, precision, and F1-score.
- Generates sample Grad-CAM heatmaps to visually interpret the predictions.


In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, ConcatDataset, Subset, DataLoader
import torchvision.transforms as T
import matplotlib.pyplot as plt

import PIL
from PIL import Image
from tqdm import tqdm
import os
import cv2
import pandas as pd
import numpy as np
from torchvision.io import read_image

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc as calc_auc, precision_recall_curve, average_precision_score

import torch.nn.functional as F
from matplotlib import colormaps
from torchvision.transforms.functional import to_pil_image

In [None]:
class CustomImageDataset(Dataset):
    """
    Image dataset driven by a CSV annotation file.

    The CSV is read with pandas; column index 1 of each row is used as the
    image file name (relative to ``img_dir``) and column index 2 as the label.
    Images are opened with PIL and converted to RGB before any transform runs.
    """

    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        # Keep the callables and the folder first, then parse the annotations.
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform
        self.img_labels = pd.read_csv(annotations_file)

    def __len__(self):
        # One sample per annotation row.
        return self.img_labels.shape[0]

    def __getitem__(self, idx):
        file_name = self.img_labels.iloc[idx, 1]
        picture = Image.open(os.path.join(self.img_dir, file_name)).convert("RGB")
        target = self.img_labels.iloc[idx, 2]

        # Optional transforms are applied only when a truthy callable was given.
        picture = self.transform(picture) if self.transform else picture
        target = self.target_transform(target) if self.target_transform else target
        return picture, target

In [None]:
# Evaluation-time preprocessing: fixed 224x224 resize, tensor conversion and
# per-channel normalisation (no augmentation).
eval_steps = [
    T.Resize(size=(224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
test_transform = T.Compose(eval_steps)

In [None]:
# Root folder that holds the cropped PNG test sets (one sub-folder per lesion type).
data_root = "/Users/giulia/Desktop/dissertation-mammogram-classification/mammogram-ai-project/Data/Data png cropped"

# Fixed seed so the shuffled test order (and therefore the sample images picked
# for Grad-CAM) is the same on every run of the notebook.
TEST_SHUFFLE_SEED = 42

# cropped calcification data (label CSV + image folder)
calc_test_label_dir = f"{data_root}/Calc-Test-png-cropped/labels/calc-test_labels.csv"
calc_test_img_dir = f"{data_root}/Calc-Test-png-cropped/images"

# now apply the transformations to the calcification images
calc_test_data = CustomImageDataset(calc_test_label_dir, calc_test_img_dir, test_transform)

# cropped mass data (label CSV + image folder)
mass_test_label_dir = f"{data_root}/Mass-Test-png-cropped/labels/mass-test_labels.csv"
mass_test_img_dir = f"{data_root}/Mass-Test-png-cropped/images"

# now apply the transformations to the mass images
mass_test_data = CustomImageDataset(mass_test_label_dir, mass_test_img_dir, test_transform)

# Merge test datasets
combined_test_data = ConcatDataset([calc_test_data, mass_test_data])

# Shuffle once, reproducibly, so calcification and mass cases are interleaved.
# The DataLoader itself does not shuffle, so every model sees the same order.
test_indices = np.random.default_rng(TEST_SHUFFLE_SEED).permutation(len(combined_test_data))

shuffled_test_data = Subset(combined_test_data, test_indices)

print(f"Total testing samples: {len(shuffled_test_data)}")

# Create DataLoaders
batch_size = 32  # Adjust based on your GPU memory
num_workers = 0

test_dataloader = DataLoader(shuffled_test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)


In [None]:
labels_map = {
    0: "Benign",
    1: "Malignant",
}

# Channel statistics matching the Normalize step of test_transform. They are
# used to undo the normalisation so previews show real pixel intensities
# instead of values that matplotlib would clip.
preview_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
preview_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

# Show a 3x3 grid of randomly drawn test samples with their labels.
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(shuffled_test_data), size=(1,)).item()
    img, label = shuffled_test_data[sample_idx]

    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[int(label)])
    plt.axis("off")
    if torch.is_tensor(img):
        # Undo normalisation, clamp to the displayable range, and move the
        # channel axis last (C, H, W) -> (H, W, C) as imshow expects.
        display_img = (img * preview_std + preview_mean).clamp(0, 1)
        plt.imshow(display_img.permute(1, 2, 0).numpy())
    else:
        plt.imshow(img)
plt.show()

In [None]:
# Function to unnormalize images
def unnormalise(img_tensor, mean, std):
    """
    Reverses per-channel normalisation (e.g. ImageNet) to recover original image values.

    Args:
        img_tensor: Tensor whose last three dimensions are (channels, height, width);
            leading batch dimensions are allowed through broadcasting.
        mean: Per-channel means that were subtracted during normalisation.
        std: Per-channel standard deviations that were divided by.

    Returns:
        A new tensor equal to ``img_tensor * std + mean``; the input is not modified.
    """
    # Build the statistics on the same device as the image so the function also
    # works for tensors that live on an accelerator. Integer images fall back to
    # the default float dtype so the statistics are not truncated.
    stat_dtype = img_tensor.dtype if img_tensor.is_floating_point() else torch.get_default_dtype()
    # view(-1, 1, 1) supports any channel count, not only RGB.
    mean = torch.as_tensor(mean, dtype=stat_dtype, device=img_tensor.device).view(-1, 1, 1)
    std = torch.as_tensor(std, dtype=stat_dtype, device=img_tensor.device).view(-1, 1, 1)
    return img_tensor * std + mean  # Reverse normalisation


In [None]:
# Define global variables for Grad-CAM
# `gradients` is overwritten by backward_hook with the hooked layer's
# grad_output tuple each time a backward pass runs through that layer.
gradients = None
# `activations` is overwritten by forward_hook with the hooked layer's output
# each time a forward pass runs through that layer.
activations = None

def backward_hook(module, grad_input, grad_output):
    """
    Backward hook: stores the layer's ``grad_output`` in the global ``gradients``.

    ``module`` and ``grad_input`` are accepted to satisfy the hook signature but
    are not used. Returns None.
    """
    global gradients
    gradients = grad_output

def forward_hook(module, args, output):
    """
    Forward hook: stores the layer's ``output`` in the global ``activations``.

    ``module`` and ``args`` are accepted to satisfy the hook signature but are
    not used. Returns None.
    """
    global activations
    activations = output

def generate_gradcam(model, image):
    """
    Generates Grad-CAM heatmap for a given image.

    Relies on the module-level ``gradients`` and ``activations`` variables being
    filled by hooks registered on the target layer of ``model`` (see
    ``backward_hook`` / ``forward_hook``). Gradients must be enabled when this is
    called.

    Args:
        model: Network with the Grad-CAM hooks already registered.
        image: Input batch for the model (callers in this file pass a single
            image with a leading batch dimension of 1).

    Returns:
        CPU tensor with the heatmap, scaled so its maximum is 1. If the heatmap
        is zero everywhere after ReLU it is returned as all zeros rather than NaN.
    """
    model.zero_grad()
    output = model(image)

    # Backward pass to get gradients; this triggers the backward hook.
    output.backward(torch.ones_like(output))

    # Pool gradients across batch and spatial dimensions: one weight per channel.
    # `gradients` is the grad_output tuple, so element 0 is the gradient tensor.
    pooled_gradients = torch.mean(gradients[0], dim=[0, 2, 3])

    # Weight each channel by its pooled gradient in one broadcasted multiply.
    # Working on a detached copy leaves the cached activations untouched.
    weighted_activations = activations.detach() * pooled_gradients.view(1, -1, 1, 1)

    # Compute heatmap
    heatmap = torch.mean(weighted_activations, dim=1).squeeze()
    heatmap = F.relu(heatmap)

    # Normalise to [0, 1]; skip the division when the map is all zeros, which
    # would otherwise produce NaNs (0 / 0).
    peak = torch.max(heatmap)
    if peak > 0:
        heatmap = heatmap / peak

    return heatmap.detach().cpu()

def overlay_heatmap(img_tensor, heatmap):
    """
    Overlays the Grad-CAM heatmap on the original image.

    Args:
        img_tensor: Normalised image tensor laid out as (channels, height, width).
        heatmap: 2-D Grad-CAM heatmap tensor (values expected in [0, 1]).

    Returns:
        Tuple ``(original_img, overlay)``: the un-normalised image as a PIL RGB
        image, and the heatmap resized to the image size and colour-mapped with
        'jet' as a uint8 array of shape (height, width, 3).
    """
    unnorm_img = unnormalise(img_tensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    original_img = to_pil_image(unnorm_img.clamp(0, 1), mode='RGB')

    # Resize the heatmap to match the actual image size (PIL expects (width, height)).
    height, width = img_tensor.shape[-2:]
    resized = to_pil_image(heatmap, mode='F').resize((width, height), resample=PIL.Image.BICUBIC)

    # Bicubic interpolation can overshoot outside [0, 1]; clip so that small
    # negative values are not turned into positive intensities by the squaring
    # below. NaNs are mapped to 0 so they do not reach the colormap.
    intensity = np.clip(np.nan_to_num(np.asarray(resized), nan=0.0), 0.0, 1.0)

    # Apply colormap (squaring emphasises the strongest activations).
    cmap = colormaps['jet']
    overlay = (255 * cmap(intensity ** 2)[:, :, :3]).astype(np.uint8)

    return original_img, overlay

In [None]:
def final_test(dataloader, model, model_code, num_rows=2):
    """
    Evaluates a binary classifier on a test dataloader and visualises the results.

    Prints accuracy, precision, recall, F1 and AUC; plots the confusion matrix,
    the ROC and precision-recall curves; and shows up to ``num_rows * 5`` sample
    images with their Grad-CAM overlays.

    Args:
        dataloader: Yields ``(images, labels)`` batches; labels are 0 or 1.
        model: Network producing one logit per image. It is evaluated on the
            device its parameters already live on.
        model_code: Selects the layer hooked for Grad-CAM. One of
            "densenet121", "densenet169", "mobilenet_v3", "efficientnet".
        num_rows: Number of rows of sample images (5 images per row).

    Returns:
        Tuple ``(accuracy, precision, recall, f1, auc)`` with accuracy in percent.

    Raises:
        ValueError: If ``model_code`` is not supported or the dataloader yields
            no samples.
    """
    print("\nEvaluating on Test Set...")
    num_batches = len(dataloader)

    model.eval()

    # Use the model's own device instead of depending on a module-level
    # `device` variable having been defined before this function is called.
    device = next(model.parameters()).device

    correct = 0
    total = 0
    all_labels = []
    all_preds = []
    all_probs = []

    sample_images = []
    sample_heatmaps = []
    sample_labels = []
    sample_preds = []
    sample_confidences = []

    num_examples = num_rows * 5

    # Identify the last convolutional layer
    if model_code == "densenet121":
        last_conv_layer = model.features.denseblock4.denselayer16.conv2
    elif model_code == "densenet169":
        last_conv_layer = model.features.denseblock4.denselayer32.conv2
    elif model_code == "mobilenet_v3":
        last_conv_layer = model.features[-1]
    elif model_code == "efficientnet":
        last_conv_layer = model.features[-1][0]
    else:
        raise ValueError(
            f"Unsupported model_code {model_code!r}; expected one of "
            "'densenet121', 'densenet169', 'mobilenet_v3', 'efficientnet'"
        )

    # Register hooks for Grad-CAM. The handles are kept so the hooks can be
    # removed afterwards and do not pile up on the model across calls.
    backward_handle = last_conv_layer.register_full_backward_hook(backward_hook)
    forward_handle = last_conv_layer.register_forward_hook(forward_hook)

    try:
        with torch.no_grad():
            for X, y in tqdm(dataloader, desc="Testing", total=num_batches, leave=True):
                X, y = X.to(device), y.to(device).view(-1, 1).float()
                pred = model(X)
                prob = pred.sigmoid()
                pred_labels = (prob > 0.5).float()

                correct += (pred_labels == y).sum().item()
                total += y.size(0)

                # Store flat Python scalars so scikit-learn receives 1-D inputs.
                all_labels.extend(y.cpu().numpy().ravel().astype(int).tolist())
                all_preds.extend(pred_labels.cpu().numpy().ravel().astype(int).tolist())
                all_probs.extend(prob.cpu().numpy().ravel().tolist())

                # Store sample images and Grad-CAM visualisations
                if len(sample_images) < num_examples:
                    for i in range(min(num_examples - len(sample_images), X.shape[0])):
                        img_tensor = X[i].cpu()
                        # Enable gradients only for Grad-CAM
                        with torch.set_grad_enabled(True):
                            heatmap = generate_gradcam(model, X[i].unsqueeze(0))

                        original_img, overlay_img = overlay_heatmap(img_tensor, heatmap)

                        sample_images.append(original_img)
                        sample_heatmaps.append(overlay_img)
                        sample_labels.append(int(y[i].cpu().item()))
                        sample_preds.append(int(pred_labels[i].cpu().item()))

                        # Compute adjusted confidence score: distance of the
                        # probability from the 0.5 threshold, rescaled to 0-100%.
                        prob_value = prob[i].cpu().item()
                        if pred_labels[i] == 1:
                            confidence = (prob_value - 0.5) * 200
                        else:
                            confidence = (0.5 - prob_value) * 200

                        sample_confidences.append(confidence)
    finally:
        backward_handle.remove()
        forward_handle.remove()

    if total == 0:
        raise ValueError("dataloader yielded no samples; nothing to evaluate")

    # Compute final metrics
    accuracy = correct / total * 100
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)

    print(f"Test Accuracy: {accuracy:.2f}%")
    print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}, AUC: {auc:.3f}")

    # Compute confusion matrix; fixing the label order keeps it 2x2 even if a
    # class is missing from the labels or the predictions.
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    # Plot confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Benign", "Malignant"])
    disp.plot(cmap="Blues", values_format="d")
    plt.title("Confusion Matrix")
    plt.show()

    # Compute ROC Curve and AUC
    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    roc_auc = calc_auc(fpr, tpr)

    # Compute Precision-Recall Curve and Average Precision
    precision_vals, recall_vals, _ = precision_recall_curve(all_labels, all_probs)
    avg_precision = average_precision_score(all_labels, all_probs)

    # Plot side-by-side
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # ROC Curve
    axes[0].plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
    axes[0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
    axes[0].set_xlim([0.0, 1.0])
    axes[0].set_ylim([0.0, 1.05])
    axes[0].set_xlabel('False Positive Rate')
    axes[0].set_ylabel('True Positive Rate')
    axes[0].set_title('ROC Curve')
    axes[0].legend(loc='lower right')
    axes[0].grid(True)

    # Precision-Recall Curve
    axes[1].plot(recall_vals, precision_vals, color='green', lw=2, label=f'AP = {avg_precision:.2f}')
    axes[1].set_xlim([0.0, 1.0])
    axes[1].set_ylim([0.0, 1.05])
    axes[1].set_xlabel('Recall')
    axes[1].set_ylabel('Precision')
    axes[1].set_title('Precision-Recall Curve')
    axes[1].legend(loc='lower left')
    axes[1].grid(True)

    plt.tight_layout()
    plt.show()

    # Plot original images and Grad-CAM heatmaps
    fig, axes = plt.subplots(num_rows * 2, 5, figsize=(15, 6 * num_rows))
    fig.suptitle("Sample Predictions with Grad-CAM", fontsize=16)

    # Hide every panel up front so unused cells stay blank when the test set
    # holds fewer images than the grid can show.
    for ax in axes.ravel():
        ax.axis("off")

    # Only iterate over the samples that were actually collected.
    for i in range(len(sample_images)):
        row = (i // 5) * 2
        col = i % 5

        # Plot original image
        axes[row, col].imshow(sample_images[i], cmap="gray")
        true_label = "Malignant" if sample_labels[i] == 1 else "Benign"
        pred_label = "Malignant" if sample_preds[i] == 1 else "Benign"
        confidence = sample_confidences[i]

        axes[row, col].set_title(f"True: {true_label}\nPred: {pred_label}\nConf: {confidence:.1f}%")

        # Plot Grad-CAM heatmap on top of the same image
        axes[row + 1, col].imshow(sample_images[i], cmap="gray")
        axes[row + 1, col].imshow(sample_heatmaps[i], alpha=0.4, interpolation="nearest")
        axes[row + 1, col].set_title("Grad-CAM")

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

    return accuracy, precision, recall, f1, auc


In [None]:
# Report whether the Apple Metal (MPS) backend can be used and whether this
# PyTorch build was compiled with it.
mps_backend = torch.backends.mps
print("MPS Available:", mps_backend.is_available())
print("MPS Built:", mps_backend.is_built())

In [None]:
# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
from torchvision.models import densenet121

# Checkpoint with the trained DenseNet-121 weights
model_d121_path = "/Users/giulia/Desktop/dissertation-mammogram-classification/mammogram-ai-project/webapp/static/models/best_model_run13.pth"

# Build the architecture without pretrained weights and swap the classifier
# for a dropout + single-output head (binary classification).
model_d121 = densenet121(weights=None)
model_d121.classifier = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(1024, 1))
model_d121 = model_d121.to(device)

# The checkpoint is deserialised onto the CPU and then loaded into the model.
d121_state = torch.load(model_d121_path, map_location=torch.device('cpu'))
model_d121.load_state_dict(d121_state)
model_d121.eval()

# Evaluate on the test set and keep the returned metrics
d121_metrics = final_test(test_dataloader, model_d121, model_code="densenet121", num_rows=3)
test_accuracy, test_precision, test_recall, test_f1, test_auc = d121_metrics

In [None]:
from torchvision.models import densenet169

# Checkpoint with the trained DenseNet-169 weights
model_d169_path = "/Users/giulia/Desktop/dissertation-mammogram-classification/mammogram-ai-project/webapp/static/models/best_model_run25.pth"

# Build the architecture without pretrained weights and swap the classifier
# for a single-output linear head (binary classification, no dropout here).
model_d169 = densenet169(weights=None)
model_d169.classifier = nn.Sequential(nn.Linear(1664, 1))
model_d169 = model_d169.to(device)

# The checkpoint is deserialised onto the CPU and then loaded into the model.
d169_state = torch.load(model_d169_path, map_location=torch.device('cpu'))
model_d169.load_state_dict(d169_state)
model_d169.eval()

# Evaluate on the test set and keep the returned metrics
d169_metrics = final_test(test_dataloader, model_d169, model_code="densenet169", num_rows=3)
test_accuracy, test_precision, test_recall, test_f1, test_auc = d169_metrics

In [None]:
from torchvision.models import mobilenet_v3_small

# Checkpoint with the trained MobileNetV3-Small weights
model_mns_path = "/Users/giulia/Desktop/dissertation-mammogram-classification/mammogram-ai-project/webapp/static/models/best_model_run15.pth"

# Build the architecture without pretrained weights
model_mns = mobilenet_v3_small(weights=None)

# Input width of the stock classifier, reused for the replacement head
num_features = model_mns.classifier[0].in_features

# Dropout (50%) + single-output head for binary classification
model_mns.classifier = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(num_features, 1))
model_mns = model_mns.to(device)

# The checkpoint is deserialised onto the CPU and then loaded into the model.
mns_state = torch.load(model_mns_path, map_location=torch.device('cpu'))
model_mns.load_state_dict(mns_state)
model_mns.eval()

# Evaluate on the test set and keep the returned metrics
mns_metrics = final_test(test_dataloader, model_mns, model_code="mobilenet_v3", num_rows=3)
test_accuracy, test_precision, test_recall, test_f1, test_auc = mns_metrics

In [None]:
from torchvision.models import mobilenet_v3_large

# Checkpoint with the trained MobileNetV3-Large weights
model_mnl_path = "/Users/giulia/Desktop/dissertation-mammogram-classification/mammogram-ai-project/webapp/static/models/best_model_run39.pth"

# Build the architecture without pretrained weights
model_mnl = mobilenet_v3_large(weights=None)

# Input width of the stock classifier, reused for the replacement head
num_features = model_mnl.classifier[0].in_features

# Dropout (40%) + single-output head for binary classification
model_mnl.classifier = nn.Sequential(nn.Dropout(p=0.4), nn.Linear(num_features, 1))
model_mnl = model_mnl.to(device)

# The checkpoint is deserialised onto the CPU and then loaded into the model.
mnl_state = torch.load(model_mnl_path, map_location=torch.device('cpu'))
model_mnl.load_state_dict(mnl_state)
model_mnl.eval()

# Evaluate on the test set and keep the returned metrics
mnl_metrics = final_test(test_dataloader, model_mnl, model_code="mobilenet_v3", num_rows=3)
test_accuracy, test_precision, test_recall, test_f1, test_auc = mnl_metrics

In [None]:
from torchvision.models import efficientnet_b0

# Checkpoint with the trained EfficientNet-B0 weights
model_eb0_path = "/Users/giulia/Desktop/dissertation-mammogram-classification/mammogram-ai-project/webapp/static/models/best_model_run20.pth"

# Build the architecture without pretrained weights and swap the classifier
# for a dropout + single-output head (binary classification).
model_eb0 = efficientnet_b0(weights=None)
model_eb0.classifier = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(1280, 1))
model_eb0 = model_eb0.to(device)

# The checkpoint is deserialised onto the CPU and then loaded into the model.
eb0_state = torch.load(model_eb0_path, map_location=torch.device('cpu'))
model_eb0.load_state_dict(eb0_state)
model_eb0.eval()

# Evaluate on the test set and keep the returned metrics
eb0_metrics = final_test(test_dataloader, model_eb0, model_code="efficientnet", num_rows=3)
test_accuracy, test_precision, test_recall, test_f1, test_auc = eb0_metrics