<h1>Evaluation</h1>
Here we will evaluate the results of the segmentation on three metrics: Dice score, HD95 and ASSD. HD95 and ASSD depend on the target spacing, which is why we have to specify the target spacing of the model we want to evaluate.

In [None]:
# Target spacing of the model that is going to be evaluated.
target_spacing = (1.0, 1.0, 1.0)

<h3>Native Spacing</h3>
First, we evaluate the model on the images at their native resolution.

In [None]:
import os
from panoptica import Panoptica_Statistic, InputType, Panoptica_Evaluator,Panoptica_Aggregator, ConnectedComponentsInstanceApproximator, NaiveThresholdMatching
from panoptica.metrics import Metric
import nibabel as nib
import numpy as np
import sys
# Make the project's helper scripts importable from this notebook.
module_path = r"KiTS23/scripts"
if module_path not in sys.path:
    sys.path.append(module_path)

# `importlib` has to be imported explicitly before `importlib.reload` can be
# called; without this import the reload below raises a NameError.
import importlib

import resample_to_target

# Reload the module so that edits made to the script after its first import
# are picked up without restarting the kernel.
importlib.reload(resample_to_target)

from resample_to_target import resample_dataset

In [None]:
# --- Settings for the native-resolution evaluation ---

# Folder containing the ground-truth segmentations.
gt_dir = "KiTS23/dataset/nnUNet_data/test/nnUNet_raw/Dataset220_KiTS2023/labelsTr"

# Folder containing the predictions that should be scored.
pred_dir = "KiTS23/predictions/original_resolution"

# Voxel spacing of the model that produced the predictions.
# NOTE(review): this single value is passed to the evaluator for every case;
# if the native-resolution images do not all share this spacing, HD95/ASSD
# may need a per-case spacing instead -- confirm.
voxel_spacing = (1.0, 1.0, 1.0)

# File the evaluation results are written to.
# NOTE(review): the target-resolution run writes below
# "KiTS23/evaluation/evaluation_tsv/" -- confirm this location is intended.
output_file = "KiTS23/evaluation/evaluation_native.tsv"

In [None]:
# === Collect all case names from the ground truth folder ===
# Only the trailing extension is cut off; `str.replace` would also remove a
# ".nii.gz" that happens to occur elsewhere in a file name.
nifti_suffix = ".nii.gz"
case_ids = sorted(
    f[: -len(nifti_suffix)]
    for f in os.listdir(gt_dir)
    if f.endswith(nifti_suffix)
)

# === Create PAIR ===
# Each entry is a (prediction array, ground-truth array, case id) tuple.
PAIR = []

for case_id in case_ids:
    pred_path = os.path.join(pred_dir, case_id + nifti_suffix)
    gt_path = os.path.join(gt_dir, case_id + nifti_suffix)

    # Cases without a prediction are reported and left out of the evaluation.
    if not os.path.exists(pred_path):
        print(f"[Warning] Prediction for {case_id} not found, skipping.")
        continue

    # Load prediction and GT
    pred_img = nib.load(pred_path)
    gt_img = nib.load(gt_path)

    # Read the voxels through `dataobj` rather than `get_fdata()`: the latter
    # always builds (and caches on the image) a floating-point copy of the
    # whole volume, which is wasted on label maps that are cast to uint8
    # right away.
    pred = np.asanyarray(pred_img.dataobj).astype(np.uint8)
    mask = np.asanyarray(gt_img.dataobj).astype(np.uint8)

    # Prediction and ground truth must have the same shape to be compared;
    # mismatching cases are reported and left out.
    if pred.shape != mask.shape:
        print(f"[Error] Shape mismatch in {case_id}: pred {pred.shape}, gt {mask.shape}")
        continue

    PAIR.append((pred, mask, case_id))

print(f"Loaded {len(PAIR)} pairs for evaluation.")

# The evaluator is built from the project's YAML config and wrapped in an
# aggregator that writes to `output_file`.
# NOTE(review): `continue_file=True` presumably lets a rerun extend an
# existing output file -- confirm against the panoptica documentation.
evaluator = Panoptica_Aggregator(
    Panoptica_Evaluator.load_from_config("KiTS23/scripts/panoptica_evaluator_kits23.yaml"),
    output_file=output_file,
    log_times=True,
    continue_file=True,
)

# Score every loaded pair with the configured voxel spacing.
for pred, gt, case in PAIR:
    evaluator.evaluate(pred, gt, case, voxelspacing=voxel_spacing)

<h3>Target Spacing</h3>
Now we evaluate the model on the images at the target resolution. We have to resample the ground truths to the target resolution as well.

In [None]:
# Put the helper-script folder on the import path unless it is already there.
module_path = r"KiTS23/scripts"
if module_path not in sys.path:
    sys.path.append(module_path)

# Spacing to resample to, and the folder with the ground truths to resample.
target_spacing = (2.0, 2.0, 2.0)
input_folder = "KiTS23/dataset/nnUNet_data/test/nnUNet_raw/Dataset220_KiTS2023/labelsTr"

# Resample the ground truths to the target spacing.
# NOTE(review): `seg=True` presumably marks the inputs as segmentations so
# that labels are resampled appropriately -- confirm in resample_to_target.
resample_dataset(input_folder=input_folder, target_spacing=target_spacing, seg=True)

Now we run the evaluation at the target resolution.

In [None]:
# --- Settings for the target-resolution evaluation ---

# Folder containing the resampled ground-truth segmentations.
gt_dir = "KiTS23/dataset/nnUNet_data/test/nnUNet_raw_resampled/Dataset220_KiTS2023/labelsTr"

# Folder containing the predictions that should be scored.
pred_dir = "KiTS23/predictions/target_resolution"

# Voxel spacing of the model that produced the predictions; keep it equal to
# the spacing the ground truths were resampled to.
voxel_spacing = (2.0, 2.0, 2.0)

# File the evaluation results are written to.
output_file = "KiTS23/evaluation/evaluation_tsv/evaluation_target.tsv"

In [None]:
# === Collect all case names from the ground truth folder ===
# Only the trailing extension is cut off; `str.replace` would also remove a
# ".nii.gz" that happens to occur elsewhere in a file name.
nifti_suffix = ".nii.gz"
case_ids = sorted(
    f[: -len(nifti_suffix)]
    for f in os.listdir(gt_dir)
    if f.endswith(nifti_suffix)
)

# === Create PAIR ===
# Each entry is a (prediction array, ground-truth array, case id) tuple.
PAIR = []

for case_id in case_ids:
    pred_path = os.path.join(pred_dir, case_id + nifti_suffix)
    gt_path = os.path.join(gt_dir, case_id + nifti_suffix)

    # Cases without a prediction are reported and left out of the evaluation.
    if not os.path.exists(pred_path):
        print(f"[Warning] Prediction for {case_id} not found, skipping.")
        continue

    # Load prediction and GT
    pred_img = nib.load(pred_path)
    gt_img = nib.load(gt_path)

    # Read the voxels through `dataobj` rather than `get_fdata()`: the latter
    # always builds (and caches on the image) a floating-point copy of the
    # whole volume, which is wasted on label maps that are cast to uint8
    # right away.
    pred = np.asanyarray(pred_img.dataobj).astype(np.uint8)
    mask = np.asanyarray(gt_img.dataobj).astype(np.uint8)

    # Prediction and ground truth must have the same shape to be compared;
    # mismatching cases are reported and left out.
    if pred.shape != mask.shape:
        print(f"[Error] Shape mismatch in {case_id}: pred {pred.shape}, gt {mask.shape}")
        continue

    PAIR.append((pred, mask, case_id))

print(f"Loaded {len(PAIR)} pairs for evaluation.")

# The evaluator is built from the project's YAML config and wrapped in an
# aggregator that writes to `output_file`.
# NOTE(review): `continue_file=True` presumably lets a rerun extend an
# existing output file -- confirm against the panoptica documentation.
evaluator = Panoptica_Aggregator(
    Panoptica_Evaluator.load_from_config("KiTS23/scripts/panoptica_evaluator_kits23.yaml"),
    output_file=output_file,
    log_times=True,
    continue_file=True,
)

# Score every loaded pair with the configured voxel spacing.
for pred, gt, case in PAIR:
    evaluator.evaluate(pred, gt, case, voxelspacing=voxel_spacing)


<h3>Plots</h3>
Now we can create plots from the TSV files we just created.

In [None]:
import os

from panoptica import Panoptica_Statistic
# NOTE(review): this is imported as a top-level module, which only resolves
# if a `panoptica_statistics` module is on sys.path (e.g. in the scripts
# folder) -- confirm it is not meant to be `panoptica.panoptica_statistics`.
from panoptica_statistics import make_curve_over_setups

# Add additional tsv files here to compare models
tsv1 = "KiTS23/evaluation/evaluation_tsv/example.tsv"

stat1 = Panoptica_Statistic.from_file(tsv1)

# Also add new models in this dict (display name -> loaded statistic).
# Plain string keys: an f-string without placeholders adds nothing.
statistics_dict = {
    "Model_Name": stat1,
}

metric = "global_bin_dsc"  # or any metric name found in your TSVs
groups = ["kidney", "masses", "tumor"]  # optional: choose groups to compare

# Build a bar chart (with standard deviation) of the metric per model/group.
fig = make_curve_over_setups(
    statistics_dict=statistics_dict,
    metric=metric,
    groups=groups,
    plot_as_barchart=True,
    plot_std=True,
    figure_title=None,
    xaxis_title="Model",
    yaxis_title="Dice",
    height=600,
    width=1200,
)

fig.show()

# Create the plot folder first so that saving the image does not fail when
# the folder does not exist yet.
plot_path = "KiTS23/evaluation/plots/compare_models.png"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.write_image(plot_path)