# Evaluation

## 0. Initialize

In [1]:
import json
import os

import numpy as np
import pandas as pd
import torch
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader

# Project-local imports. Their relative order is preserved from the original
# cell because the star import may rebind names that were imported before it.
from main_classification import get_args_parser
from trainer import test_classification
from evaluation.plex_metrics import plex_evaluate
from dataloader import *
from utils import metric_AUROC

In [2]:
# Parameters
# Take the default option values from the project's argument parser, then pin
# the evaluation device to the CPU.
_parser = get_args_parser(main_args=False)
args = _parser.get_default_values()
args.device = "cpu"

## 1. Define parameters

In [3]:
# Data path
# Alternative values for args.data_dir:
#   /Users/felixkrones/python_projects/data/NIH/images/
#   /Users/felixkrones/python_projects/data/ChestXpert/
#   /Users/felixkrones/python_projects/data/Padchest/0_224_224/
#   /Users/felixkrones/python_projects/data/VinDrCXR/
args.data_dir = "/Users/felixkrones/python_projects/data/NIH/images/"

# Alternative values for args.test_list:
#   dataset/Xray14_test_official.txt
#   dataset/CheXpert_valid_official_frontal.csv
#   dataset/CheXpert_test_Glocker.csv
#   dataset/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv
#   dataset/VinDrCXR_test_pe_global_one.txt
args.test_list = "dataset/Xray14_test_official.txt"

# A metadata CSV is only configured for the official CheXpert frontal
# validation list; every other test list runs without metadata ("").
args.metadata_file = (
    "dataset/chestxpert_valid_metadata.csv"
    if "CheXpert_valid_official_frontal.csv" in args.test_list
    else ""
)

In [4]:
# Benchmark models
# NOTE: this cell and the two GMML cells that follow all assign `args.nc`,
# `model_list` and `diseases_model`; whichever of them runs last decides what
# is evaluated. Run only the cell you need (or run it last).
args.nc = 3  # forwarded as `nc=` to the dataset and transform; presumably the channel count — confirm

# Checkpoint paths follow one pattern, so they are generated from the variant
# name and run number instead of being copied by hand.
_BENCHMARK_ROOT = "/Users/felixkrones/python_projects/models/BenchmarkTransferLearning_f/Classification/ChestXray14"
_BENCHMARK_VARIANTS = ("random", "imagenet")
_BENCHMARK_RUNS = range(3)

# List of (model name, checkpoint path) pairs consumed by the model loop.
model_list = [
    ("ResNet50", f"{_BENCHMARK_ROOT}/ResNet50_{variant}/ResNet50_{variant}_run_{run}_best.pth.tar")
    for variant in _BENCHMARK_VARIANTS
    for run in _BENCHMARK_RUNS
]

# Disease name for each prediction column of these models; the position in
# this list is used to select columns of the prediction matrix.
diseases_model = [
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
    "Consolidation",
    "Edema",
    "Emphysema",
    "Fibrosis",
    "Pleural_Thickening",
    "Hernia",
]

In [5]:
# GMML models, nc = 1
# NOTE: this cell overwrites `args.nc`, `model_list` and `diseases_model` set by
# any model-definition cell that ran before it.
args.nc = 1  # forwarded as `nc=` to the dataset and transform; presumably the channel count — confirm

_GMML_ROOT = "/Users/felixkrones/python_projects/models/GMML/Finetune/CXR8"
_GMML_SEEDS = (0, 11, 21, 42, 100)

# (experiment label, checkpoint folder below _GMML_ROOT).
# Comment a line in or out to include or skip that experiment.
_gmml_experiments_nc1 = [
    # ("2.1", "2.1_random_scratch_1D"),
    # ("2.3", "supervised_scratch_MIMIC_1D_1000e_100k"),
    ("0.2.1", "0.2.1_scratch_NIH_1D_v1"),
    # ("3.1", "3.1_scratch_MIMIC_1D"),
    # ("3.3", "3.3_scratch_OCT_1D"),
    # ("3.5", "3.5_scratch_CovidxCT_1D"),
]

# Each experiment contributes one header row ("", label) followed by one
# ("vit_small", checkpoint path) row per seed. The model loop treats a row
# with an empty model name as an experiment header, not as a model.
model_list = [
    entry
    for label, folder in _gmml_experiments_nc1
    for entry in (
        [("", label)]
        + [
            ("vit_small", f"{_GMML_ROOT}/{folder}/seed_{seed}/best_checkpoint.pth")
            for seed in _GMML_SEEDS
        ]
    )
]

# Disease name for each prediction column of these models; the position in
# this list is used to select columns of the prediction matrix.
diseases_model = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Effusion",
    "Emphysema",
    "Fibrosis",
    "Hernia",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pleural_Thickening",
    "Pneumonia",
    "Pneumothorax",
]

In [6]:
# GMML models, nc = 3
# NOTE: this cell overwrites `args.nc`, `model_list` and `diseases_model` set by
# any model-definition cell that ran before it.
args.nc = 3  # forwarded as `nc=` to the dataset and transform; presumably the channel count — confirm

_GMML_ROOT = "/Users/felixkrones/python_projects/models/GMML/Finetune/CXR8"
_GMML_SEEDS = (0, 11, 21, 42, 100)

# (experiment label, checkpoint folder below _GMML_ROOT).
# Comment a line in or out to include or skip that experiment.
_gmml_experiments_nc3 = [
    # ("2.2", "2.2_imagenet_scratch_3D"),
    # ("2.4", "supervised_imagenet_MIMIC_3D_1000e_100k"),
    ("0.2.4", "0.2.4_scratch_NIH_3D"),
    ("0.2.5", "0.2.5_sit_imagenet_3D"),
    ("0.2.6", "0.2.6_timm_imagenet_3D"),
    # ("3.2", "3.2_timm_MIMIC_3D"),
    # ("3.4", "3.4_timm_OCT_3D"),
    # ("3.6", "3.6_timm_CovidxCT_3D"),
]

# Each experiment contributes one header row ("", label) followed by one
# ("vit_small", checkpoint path) row per seed. The model loop treats a row
# with an empty model name as an experiment header, not as a model.
model_list = [
    entry
    for label, folder in _gmml_experiments_nc3
    for entry in (
        [("", label)]
        + [
            ("vit_small", f"{_GMML_ROOT}/{folder}/seed_{seed}/best_checkpoint.pth")
            for seed in _GMML_SEEDS
        ]
    )
]

# Disease name for each prediction column of these models; the position in
# this list is used to select columns of the prediction matrix.
diseases_model = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Effusion",
    "Emphysema",
    "Fibrosis",
    "Hernia",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pleural_Thickening",
    "Pneumonia",
    "Pneumothorax",
]

In [7]:
# Define eval params
# Passed to plex_evaluate as `eval_args`; the model loop adds a
# "decision_threshold" entry before each call.
eval_params = {
    "selective_threshold": [0, 0.05, 0.1, 0.15, 0.2, 0.25],
    "independent_reg_variable": "StudyDate",
    "subpopulation_groups": ["sex_label", "race_label"],
    "ece_num_bins": 15,
}
# plex_evaluate is run once per value in this list for every model.
decision_thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]

# Diseases to evaluate. The default is every class the model predicts.
# To evaluate a subset instead, replace the assignment below, e.g. with:
#   diseases_to_test = ["Atelectasis", "Cardiomegaly", "Consolidation", "Edema", "Effusion"]
# (Previously that subset was assigned and then immediately overwritten, so it
# never took effect.)
# A copy is taken so later edits to this list cannot alter `diseases_model`.
diseases_to_test = list(diseases_model)

# Column of each tested disease in the model's prediction matrix.
index_to_test_model = [diseases_model.index(disease) for disease in diseases_to_test]

## 2. Load data

In [8]:
# Get data


def _label_indices(dataset_labels, wanted):
    """Return the position of every name in `wanted` within `dataset_labels`.

    Args:
        dataset_labels: sequence of label names, in the dataset's order.
        wanted: label names to look up.

    Returns:
        List of integer positions, one per entry of `wanted`, in the same order.

    Raises:
        ValueError: if any requested name is absent; the message lists all of
            the missing names instead of only the first one.
    """
    dataset_labels = list(dataset_labels)
    missing = [name for name in wanted if name not in dataset_labels]
    if missing:
        raise ValueError(
            f"Diseases {missing} not found in dataset labels {dataset_labels}"
        )
    return [dataset_labels.index(name) for name in wanted]


# The dataset type is inferred from the image directory name (case-insensitive).
# The tuple order is the precedence used when more than one key matches.
_data_dir_lower = args.data_dir.lower()
_dataset_key = next(
    (key for key in ("nih", "chestxpert", "padchest", "vindr") if key in _data_dir_lower),
    None,
)
if _dataset_key is None:
    raise ValueError(f"Dataset {args.data_dir} not supported")

# Every dataset uses the same test-time transform, so build it once.
test_transform = build_transform_classification(
    normalize=args.normalization, mode="test", test_augment=args.test_augment, nc=args.nc
)

if _dataset_key == "nih":
    dataset_test = ChestXray14Dataset(
        images_path=args.data_dir,
        file_path=args.test_list,
        augment=test_transform,
        nc=args.nc,
    )
    # Presumably the column order of the labels ChestXray14Dataset yields — confirm.
    diseases = [
        "Atelectasis",
        "Cardiomegaly",
        "Effusion",
        "Infiltration",
        "Mass",
        "Nodule",
        "Pneumonia",
        "Pneumothorax",
        "Consolidation",
        "Edema",
        "Emphysema",
        "Fibrosis",
        "Pleural_Thickening",
        "Hernia",
    ]
    index_to_test_dataset = _label_indices(diseases, diseases_to_test)
elif _dataset_key == "chestxpert":
    # Rename by exact match. A substring replace would turn an already renamed
    # "Pleural Effusion" into "Pleural Pleural Effusion" when this cell is re-run.
    diseases_to_test = [
        "Pleural Effusion" if disease == "Effusion" else disease
        for disease in diseases_to_test
    ]
    dataset_test = CheXpertDataset(
        images_path=args.data_dir,
        file_path=args.test_list,
        augment=test_transform,
        uncertain_label=args.uncertain_label,
        unknown_label=args.unknown_label,
        nc=args.nc,
    )
    # Presumably the column order of the labels CheXpertDataset yields — confirm.
    diseases = [
        "No Finding",
        "Enlarged Cardiomediastinum",
        "Cardiomegaly",
        "Lung Opacity",
        "Lung Lesion",
        "Edema",
        "Consolidation",
        "Pneumonia",
        "Atelectasis",
        "Pneumothorax",
        "Pleural Effusion",
        "Pleural Other",
        "Fracture",
        "Support Devices",
    ]
    index_to_test_dataset = _label_indices(diseases, diseases_to_test)
elif _dataset_key == "padchest":
    # Exact-match rename keeps the cell idempotent on re-run (see CheXpert branch).
    diseases_to_test = [
        "Pleural Effusion" if disease == "Effusion" else disease
        for disease in diseases_to_test
    ]
    dataset_test = PadchestDataset(
        images_path=args.data_dir,
        file_path=args.test_list,
        augment=test_transform,
        diseases_to_test=diseases_to_test,
        nc=args.nc,
    )
    diseases = dataset_test.possible_labels
    # The dataset's labels are looked up by lower-cased name.
    index_to_test_dataset = _label_indices(
        diseases, [disease.lower() for disease in diseases_to_test]
    )
else:  # "vindr"
    dataset_test = VinDrCXR(
        images_path=args.data_dir,
        file_path=args.test_list,
        augment=test_transform,
        nc=args.nc,
    )
    diseases = dataset_test.possible_labels
    # Only the lookup name differs here; `diseases_to_test` itself is left unchanged.
    index_to_test_dataset = _label_indices(
        diseases,
        [
            "Pleural effusion" if disease == "Effusion" else disease
            for disease in diseases_to_test
        ],
    )

print(f"Dataset size: {len(dataset_test)}")
print(f"index_to_test_dataset: {index_to_test_dataset}")
print(f"index_to_test_model: {index_to_test_model}")
# Label columns and prediction columns are compared position by position later,
# so both index lists must cover exactly the diseases under test.
if not (len(diseases_to_test) == len(index_to_test_dataset) == len(index_to_test_model)):
    print(f"len(index_to_test_dataset): {len(index_to_test_dataset)}")
    print(f"len(index_to_test_model): {len(index_to_test_model)}")
    print(f"len(diseases_to_test): {len(diseases_to_test)}")
    raise ValueError("Number of classes does not match the number of diseases to test")

# Get dataloader and model
device = torch.device(args.device)
cudnn.benchmark = True
data_loader_test = DataLoader(
    dataset=dataset_test,
    # Sequential order keeps predictions aligned with the rows of the test list.
    sampler=torch.utils.data.SequentialSampler(dataset_test),
    batch_size=args.batch_size,
    num_workers=args.workers,
    # Pinned host memory only helps host-to-GPU copies; skip it on other devices.
    pin_memory=(device.type == "cuda"),
    drop_last=False,
)

Dataset size: 25596
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
index_to_test_model: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


## 3. Loop through models

In [9]:
# Results, filled while looping over `model_list`:
#   metrics  - checkpoint path -> list of plex_evaluate results, one per decision threshold
#   mean_auc - mean AUROC per model, interleaved with "-- <experiment> --" marker strings
metrics = {}
mean_auc = []

# Read the optional metadata once instead of once per model and threshold.
# This also surfaces a wrong path before any inference has been run.
meta_data = pd.read_csv(args.metadata_file) if args.metadata_file else None

for (model_name, model_path) in model_list:
    # Rows with an empty model name are experiment headers, not checkpoints.
    if len(model_name) == 0:
        print(f"-------------- Starting with experiment {model_path} --------------")
        mean_auc.append(f"-- {model_path} --")
        continue

    print(f"-------------- Model {model_name} from path: {model_path} --------------")

    # Load model
    saved_model = model_path
    args.model_name = model_name

    # Get predictions
    y_test, p_test = test_classification(saved_model, data_loader_test, device, args)
    print("Finished calculating predictions")

    # Filter predictions: pick the tested diseases from the label columns and
    # from the prediction columns so that both line up position by position.
    y_test_filtered = y_test[:, index_to_test_dataset].type(torch.int64)
    p_test_filtered = p_test[:, index_to_test_model]

    # For padchest combine all atelectasis labels: the target is positive when
    # any dataset label whose name contains "atelectasis" is positive.
    # Skipped when "Atelectasis" is not under test, where the lookup would raise.
    if "padchest" in args.data_dir.lower() and "Atelectasis" in diseases_to_test:
        index_atelectasis = [i for i, d in enumerate(diseases) if "atelectasis" in d.lower()]
        y_test_filtered[:, diseases_to_test.index("Atelectasis")] = torch.max(
            y_test[:, index_atelectasis], dim=1
        ).values.type(torch.int64)

    # Default metrics
    all_results = metric_AUROC(y_test_filtered, p_test_filtered)
    # Entries that are not > 0 are left out of the mean (presumably the
    # placeholder metric_AUROC uses for classes it could not score — confirm).
    valid_aucs = [auc for auc in all_results if auc > 0]
    mean_over_all_classes = np.array(valid_aucs).mean() if valid_aucs else float("nan")

    # Print results
    print(f"diseases_to_test: {diseases_to_test}")
    print(f"index_to_test_dataset: {index_to_test_dataset}")
    try:
        print(f"Count from dataset: {sum(dataset_test.img_label[:, index_to_test_dataset])}")
        print(f"Count from __get__: {sum(y_test_filtered)}")
    except Exception:
        # Best-effort fallback for label containers that do not support 2-D
        # indexing (e.g. a plain list). `Exception` rather than a bare except
        # so that interrupting the kernel is not swallowed here.
        print(
            f"Count: {sum(torch.from_numpy(np.array(dataset_test.img_label))[:, index_to_test_dataset])}"
        )
        print(f"Count from __get__: {sum(y_test_filtered.cpu().numpy())}")
    print(f"AUC: {all_results}")
    print(f"Mean AUC: {round(mean_over_all_classes, 4)}")
    mean_auc.append(mean_over_all_classes)

    # Evaluate
    eval_metrics = []
    for decision_threshold in decision_thresholds:
        eval_params["decision_threshold"] = decision_threshold
        eval_metrics.append(
            plex_evaluate(
                preds=p_test_filtered.cpu().numpy(),
                target_labels=y_test_filtered.cpu().numpy(),
                eval_args=eval_params,
                # Hand over a copy in case plex_evaluate modifies the frame;
                # each call then sees the data exactly as read from disk.
                meta_data=meta_data.copy() if meta_data is not None else None,
            )
        )

    # Save metrics
    metrics[model_path] = eval_metrics

print("Mean AUCs for all models")
print(mean_auc)

-------------- Starting with experiment 0.2.4 --------------
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.4_scratch_NIH_3D/seed_0/best_checkpoint.pth --------------
Loading  weights for vit_small from timm.
Creating empty model:
state_dict to load: odict_keys(['cls_token', 'pos_embed', 'patch_embed.proj.weight', 'patch_embed.proj.bias', 'blocks.0.norm1.weight', 'blocks.0.norm1.bias', 'blocks.0.attn.qkv.weight', 'blocks.0.attn.qkv.bias', 'blocks.0.attn.proj.weight', 'blocks.0.attn.proj.bias', 'blocks.0.norm2.weight', 'blocks.0.norm2.bias', 'blocks.0.mlp.fc1.weight', 'blocks.0.mlp.fc1.bias', 'blocks.0.mlp.fc2.weight', 'blocks.0.mlp.fc2.bias', 'blocks.1.norm1.weight', 'blocks.1.norm1.bias', 'blocks.1.attn.qkv.weight', 'blocks.1.attn.qkv.bias', 'blocks.1.attn.proj.weight', 'blocks.1.attn.proj.bias', 'blocks.1.norm2.weight', 'blocks.1.norm2.bias', 'blocks.1.mlp.fc1.weight', 'blocks.1.mlp.fc1.bias', 'blocks.1.mlp.fc2.weight', 'blo

100%|██████████| 400/400 [10:40<00:00,  1.60s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7703688416661402, 0.8992600239754108, 0.7593741004202372, 0.8459370505035455, 0.8274840067391506, 0.8863832197256201, 0.8219885546277376, 0.8427132998459338, 0.7067871159353736, 0.82488615445495, 0.7347167802430437, 0.7937632351225389, 0.7270363450787556, 0.8600412707246463]
Mean AUC: 0.8072
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.4_scratch_NIH_3D/seed_11/best_checkpoint.pth --------------
Loading  weights for 

100%|██████████| 400/400 [11:12<00:00,  1.68s/it] 


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7701098137577724, 0.8957463993309067, 0.7583570721029578, 0.8435406270848693, 0.8324221686460191, 0.8738176616273325, 0.8227718778423275, 0.8356071034614789, 0.7053020741075239, 0.8181584651879908, 0.7282259727447025, 0.7790895770589099, 0.7186727640543382, 0.8566241509420995]
Mean AUC: 0.8027
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.4_scratch_NIH_3D/seed_21/best_checkpoint.pth --------------
Loading  weights fo

100%|██████████| 400/400 [10:38<00:00,  1.60s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7718750435158004, 0.8999686224261055, 0.7597705902911358, 0.8454848509082226, 0.8302197872957109, 0.8862267514043782, 0.8136460961522737, 0.8297065446291012, 0.7055054309899599, 0.8233230031619019, 0.7291555330821317, 0.783999830553222, 0.7130752053119371, 0.8559973500728959]
Mean AUC: 0.8034
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.4_scratch_NIH_3D/seed_42/best_checkpoint.pth --------------
Loading  weights for

100%|██████████| 400/400 [10:34<00:00,  1.59s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.772944450928683, 0.8985539236784663, 0.7629778987623869, 0.8457266491898245, 0.8342068340883824, 0.8813823196733869, 0.8129618132788063, 0.823703882654317, 0.704800840580968, 0.8260677775607068, 0.7324766471337557, 0.7854162818246255, 0.7211862995138424, 0.8528474402733447]
Mean AUC: 0.8039
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.4_scratch_NIH_3D/seed_100/best_checkpoint.pth --------------
Loading  weights for 

100%|██████████| 400/400 [10:36<00:00,  1.59s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7728407646055631, 0.897510172157882, 0.7622360976880056, 0.8473905789377395, 0.8351914058480371, 0.8865126360724581, 0.8329707488372582, 0.8754072730256259, 0.7061501278132896, 0.820911599646733, 0.7282052701566938, 0.7845562750245717, 0.711208500941339, 0.8557794191776734]
Mean AUC: 0.8083
-------------- Starting with experiment 0.2.5 --------------
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.5_sit_imagenet_3D/see

100%|██████████| 400/400 [10:39<00:00,  1.60s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7731294942896543, 0.8958618102201796, 0.7575871563554626, 0.8503422882977827, 0.8354773253352913, 0.8915220307060258, 0.8088371119873075, 0.8674806049611188, 0.7022846675913974, 0.8262851391190738, 0.7331667334007073, 0.7892290668917276, 0.7250487938519566, 0.8498781522804812]
Mean AUC: 0.8076
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.5_sit_imagenet_3D/seed_11/best_checkpoint.pth --------------
Loading  weights f

100%|██████████| 400/400 [10:37<00:00,  1.59s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7742986797174642, 0.8969081171041416, 0.7664941674506223, 0.8549446937919233, 0.8353187984595876, 0.8946937020128498, 0.8126770266152643, 0.8929740275131504, 0.6952185890409743, 0.8267351790170699, 0.7364099203923165, 0.7890009434421648, 0.7212344008079002, 0.8610400988429027]
Mean AUC: 0.8113
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.5_sit_imagenet_3D/seed_21/best_checkpoint.pth --------------
Loading  weights f

100%|██████████| 400/400 [10:39<00:00,  1.60s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7734191258910208, 0.8947596858093005, 0.7648079936954554, 0.8492616235058779, 0.836583296629797, 0.8863422030328905, 0.8151343965551503, 0.8687249870091983, 0.7074932094571356, 0.8227380868306291, 0.7393823802445239, 0.7895500175511225, 0.724188726884306, 0.8581460917543396]
Mean AUC: 0.8093
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.5_sit_imagenet_3D/seed_42/best_checkpoint.pth --------------
Loading  weights for

100%|██████████| 400/400 [10:38<00:00,  1.60s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7756238558376521, 0.9007752400392031, 0.7594788904214688, 0.8523937832688998, 0.8364008230806436, 0.8983893302980358, 0.8133642331888385, 0.9071467641508573, 0.69802840644653, 0.8321158311372484, 0.7314227941636642, 0.792059983730104, 0.7259814264965817, 0.8525448030198761]
Mean AUC: 0.8126
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.5_sit_imagenet_3D/seed_100/best_checkpoint.pth --------------
Loading  weights for

100%|██████████| 400/400 [14:43<00:00,  2.21s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7722252060652078, 0.9049816732771121, 0.760777656260299, 0.8463291949076879, 0.8349928806965348, 0.892168776390844, 0.8160335257036637, 0.8537652812850409, 0.7024614428901793, 0.8241538036089744, 0.7360807094055982, 0.7848847212709625, 0.7193995720891613, 0.8567315782079905]
Mean AUC: 0.8075
-------------- Starting with experiment 0.2.6 --------------
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.6_timm_imagenet_3D/s

100%|██████████| 400/400 [13:23:12<00:00, 120.48s/it]      


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7729182406660479, 0.8972912881216832, 0.7577640922916564, 0.8470833794355338, 0.8350892620840011, 0.8944055210073985, 0.8225155972548284, 0.8694779976844467, 0.7021057551853529, 0.8140901385740507, 0.7163872151405493, 0.7816140871811544, 0.7201501249662265, 0.859232669867012]
Mean AUC: 0.8064
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.6_timm_imagenet_3D/seed_11/best_checkpoint.pth --------------
Loading  weights f

100%|██████████| 400/400 [10:37<00:00,  1.59s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7722280963001126, 0.8976817629017151, 0.7603328490010371, 0.8496133659499555, 0.8319981366811934, 0.8970952228378853, 0.8361790528764869, 0.8725413198654428, 0.7076669281467086, 0.8181303264496655, 0.7289908376333932, 0.7881155160475508, 0.73187147852297, 0.8664293394090419]
Mean AUC: 0.8113
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.6_timm_imagenet_3D/seed_21/best_checkpoint.pth --------------
Loading  weights fo

100%|██████████| 400/400 [10:34<00:00,  1.59s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7699973132430988, 0.8997749869056698, 0.7602557334761425, 0.8509581771792463, 0.8335258431925681, 0.9043490725541421, 0.8354039068856336, 0.9090333931973782, 0.7014993331553558, 0.8169653514977006, 0.7261838185744955, 0.7887463976012118, 0.7243325990420755, 0.8559284918954596]
Mean AUC: 0.8126
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.6_timm_imagenet_3D/seed_42/best_checkpoint.pth --------------
Loading  weights 

100%|██████████| 400/400 [10:27<00:00,  1.57s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7717195721091267, 0.8968249152353549, 0.7580126181247779, 0.8504730250091199, 0.8325980492981363, 0.9028852414919859, 0.8296999507082435, 0.8992415195135515, 0.7035885339467691, 0.8234954338959867, 0.7248238217470933, 0.7879142622201054, 0.7263887944491755, 0.8591263553283228]
Mean AUC: 0.8119
-------------- Model vit_small from path: /Users/felixkrones/python_projects/models/GMML/Finetune/CXR8/0.2.6_timm_imagenet_3D/seed_100/best_checkpoint.pth --------------
Loading  weights

100%|██████████| 400/400 [10:28<00:00,  1.57s/it]


Finished calculating predictions
diseases_to_test: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
index_to_test_dataset: [0, 1, 8, 9, 2, 10, 11, 13, 3, 4, 5, 12, 6, 7]
Count: tensor([3279, 1069, 1815,  925, 4658, 1093,  435,   86, 6112, 1748, 1623, 1143,
         555, 2665])
Count from __get__: [3279 1069 1815  925 4658 1093  435   86 6112 1748 1623 1143  555 2665]
AUC: [0.7737738390230443, 0.897192143836599, 0.759032889997258, 0.8503933604067365, 0.8324703542130827, 0.8940179627350373, 0.8296206910256569, 0.8805247372211534, 0.7059382854813884, 0.817135071509338, 0.7315291985266131, 0.7907199731346714, 0.7248861416825955, 0.860287985254401]
Mean AUC: 0.8105
Mean AUCs for all models
['-- 0.2.4 --', 0.8071957142187917, 0.802746123424945, 0.8034253314139125, 0.8039466470815356, 0.8083479192809194, '-- 0.2.5 --', 0.807580741092012, 0.81128202458630

## 4. Print and save

In [None]:
# Print
# For every evaluated model: dump each per-threshold metric dict in full,
# then print a compact summary. AUC and oracle AUC are read from the first
# entry only; the remaining summary values are listed once per entry.
summary_fields = (
    ("Mean acc", "mean_acc"),
    ("Subset acc", "subset_acc"),
    ("Disparity", "disparity"),
    ("Underdiagnosis", "underdiagnosis_mean"),
    ("Calibration", "calibration_error"),
)
for checkpoint, threshold_results in metrics.items():
    print(
        f"---------------------------------------- Model path: {checkpoint} ----------------------------------------"
    )
    for result in threshold_results:
        print(
            f"--------------- Decision threshold: {result['decision_threshold']} ---------------"
        )
        for name, val in result.items():
            print(f"{name}: {val}")
    print("--------------- Summary ---------------")
    first_result = threshold_results[0]
    print(f"AUC: {first_result['auc']}")
    for label, field in summary_fields:
        print(f"{label}: {[entry[field] for entry in threshold_results]}")
    print(f"Oracle AUC: {first_result['oracle_auc_mean']}")

In [None]:
# Export
# Serialise `metrics` as JSON (the file keeps its .txt suffix) under
# ./Outputs/Evaluation/<test-list file stem>/<number of diseases>_diseases/.
# `os` is not imported explicitly in the notebook's import cell; it is presumably
# pulled in by a star import — TODO confirm.
output_path = f"./Outputs/Evaluation/{args.test_list.split('/')[-1].split('.')[0]}/{len(diseases_to_test)}_diseases/"
# NOTE(review): the file name is built from the architecture name of the SECOND
# entry of model_list (index 1) — confirm this is intended rather than index 0.
output_file = os.path.join(output_path, f"{args.nc}D_{model_list[1][0]}_results.txt")
# exist_ok=True replaces the manual exists() check: it avoids re-deriving the
# directory from output_file by string splitting and removes the
# check-then-create race.
os.makedirs(output_path, exist_ok=True)
with open(output_file, "w") as f:
    json.dump(metrics, f, indent=4)

## 5. Plot experiment results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Input table with the size-experiment results (read by the next cell).
csv_file = (
    "/Users/felixkrones/Library/CloudStorage/OneDrive-Personal/Dokumente/"
    "Studium_und_Schule/07_Oxford/PhD/08_Reliable/size_experiments.csv"
)
# Directory for the exported figure; the trailing slash matters because the
# plotting cell builds the file path by plain string concatenation.
output_dir = (
    "/Users/felixkrones/Library/CloudStorage/OneDrive-Personal/Dokumente/"
    "Studium_und_Schule/07_Oxford/PhD/08_Reliable/"
)

In [None]:
# Load the experiment results table and preview its first two rows.
df = pd.read_csv(filepath_or_buffer=csv_file)
df.head(n=2)

In [None]:
# Row-wise mean and standard deviation over the five repeated runs.
# The run columns are selected once, before 'Mean' is added, so both
# statistics are computed from Run1..Run5 only.
run_columns = [f"Run{run}" for run in range(1, 6)]
run_values = df[run_columns]
df['Mean'] = run_values.mean(axis=1)
df['Std'] = run_values.std(axis=1)

# Distinct experiment names; the plotting cell draws one line per entry.
experiments = df['Experiment'].unique()

df.head(2)

In [None]:
# Plot the mean of the runs against 'Size' for every experiment, with a shaded
# band of +/- one standard deviation, and save the figure as a PDF.
# The explicit Figure/Axes objects are used instead of pyplot's implicit
# "current figure", so the cell cannot accidentally draw into or save a figure
# left over from another cell.
fig, ax = plt.subplots(figsize=(12, 8))

# set font sizes
title_font_size = 20
axis_font_size = 18
tick_font_size = 16
legend_font_size = 16

# iterate over experiments
for experiment in experiments:
    # sort by size so the line is drawn from left to right
    subset = df[df['Experiment'] == experiment].sort_values(by='Size')
    ax.plot(subset['Size'], subset['Mean'], 'o-', label=experiment)  # 'o-' adds round markers
    ax.fill_between(
        subset['Size'],
        subset['Mean'] - subset['Std'],
        subset['Mean'] + subset['Std'],
        alpha=0.1,
    )

ax.set_xlabel('Size', fontsize=axis_font_size)
ax.set_ylabel('Mean of Runs', fontsize=axis_font_size)
# tick_params sets the label size on the axis itself, so it also applies to
# ticks created later, not only to the tick labels that exist at call time.
ax.tick_params(axis='both', labelsize=tick_font_size)
ax.legend(title='Experiments', fontsize=legend_font_size)
ax.set_title('Mean of Runs with Standard Deviation for each Experiment', fontsize=title_font_size)

# output_dir is expected to end with a path separator (plain concatenation).
fig.savefig(output_dir + 'experiment_size.pdf', format='pdf', bbox_inches='tight')

plt.show()