### This notebook aims to visualize the performance and robustness metrics of different model variants and OoD detectors.

In [None]:
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from adjustText import adjust_text
from matplotlib.lines import Line2D
os.chdir(os.path.dirname(os.getcwd()))
print("Current working directory: ", os.getcwd())
from utils.visualize import *

# Read the experiment configuration (benchmarks, model variants, OoD datasets and save directory).
with open('config.yaml') as f:
    configs = yaml.safe_load(f)

# Settings pulled out of the configuration for the cells below.
score_functions = configs["score_functions"]
rand_seed = configs["rand_seed"]

# Display order used in the visualizations: the configured perturbation
# functions followed by an extra "average" entry, and the fixed variant order.
perturb_function_sorter = [*configs["perturb_functions"], "average"]
variant_sorter = ["NT", "DA", "AT", "PAT"]

# Apply seaborn's "whitegrid" style to the figures.
sns.set_style("whitegrid")


#### 1. Performance - Robustness Scatter Plot
The performance and robustness metrics are visualized as scatter plots. All the statistics can be inspected in the tables generated in the `results/eval/performance/` and `results/eval/robustness/` folders. The generated figures are saved in the `results/eval/performance_robustness/` folder.

Specifically, we visualize the performance and robustness metrics in the following 2 types of figures:
- Model accuracy $(\uparrow)$ - MAE rate $(\downarrow)$ plot
- OoD detectors' FPR95 $(\downarrow)$ - DAE rate $(\downarrow)$ plot.

\* The $\uparrow$ and $\downarrow$ arrows indicate the direction of better performance/robustness (higher or lower values, respectively).

In [None]:
# Output directory for the FPR95 - DAE rate scatter plots.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
save_dir = os.path.join("results", "eval", "performance_robustness")
os.makedirs(save_dir, exist_ok=True)

# Load DNN model and OoD detectors' performance metrics.
performance_dir = os.path.join("results", "eval", "performance")
robustness_dir = os.path.join("results", "eval", "robustness")
df_model_perf = None
df_detector_perf = None
file_path = os.path.join(performance_dir, "model_performance.csv")
if os.path.exists(file_path):
    # Indexed by (benchmark, model) so one model's rows can be selected with .loc.
    df_model_perf = pd.read_csv(file_path).set_index(["benchmark", "model"])
file_path = os.path.join(performance_dir, "detector_performance.csv")
if os.path.exists(file_path):
    # Indexed by (benchmark, model, dataset); the loop below selects the "average" dataset rows.
    df_detector_perf = pd.read_csv(file_path).set_index(["benchmark", "model", "dataset"])
# Both tables are required; fail early and say where they were expected.
assert (df_model_perf is not None) and (df_detector_perf is not None), (
    "model_performance.csv and detector_performance.csv must both exist in " + performance_dir
)

def _load_sorted_rate_table(csv_path):
    """Load a robustness rate table, or return None when the CSV file is missing.

    The "perturb_function" and "variant" columns are converted to ordered
    categoricals following `perturb_function_sorter` and `variant_sorter`,
    and the rows are sorted by model and variant.
    """
    if not os.path.exists(csv_path):
        print("File "+csv_path+" does not exist!")
        return None
    table = pd.read_csv(csv_path)
    for column, order in (("perturb_function", perturb_function_sorter), ("variant", variant_sorter)):
        table[column] = table[column].astype("category").cat.set_categories(order, ordered=True)
    return table.sort_values(by=["model", "variant"])


def _melt_dae_rates(table, value_name):
    """Reshape the "dae_*_mean" columns of `table` into long format.

    Returns one row per (variant, score function) with the rate stored in the
    `value_name` column; the "dae_" and "_mean" parts are stripped from the
    score-function names.
    """
    dae_cols = [col for col in table.columns if ("dae_" in col) and ("_mean" in col)]
    melted = table.melt(id_vars=["variant"], value_vars=dae_cols, var_name="score_function", value_name=value_name)
    melted["score_function"] = melted["score_function"].apply(lambda s: s.replace("_mean","").replace("dae_",""))
    return melted


# One figure per benchmark: a subplot per model showing, for every variant and
# score function, the ID and OoD DAE rates against the detector's FPR95.
for benchmark in configs["benchmark"]:

    # Load the MAE/DAE rate table on ID dataset.
    df_id = _load_sorted_rate_table(os.path.join(robustness_dir, f"{benchmark.lower()}_id_local_mae_dae_rate_mean_std.csv"))

    # Load the DAE rate table on OoD dataset.
    df_ood = _load_sorted_rate_table(os.path.join(robustness_dir, f"{benchmark.lower()}_ood_local_dae_rate_mean_std.csv"))

    models = list(configs["benchmark"][benchmark]["model"].keys())
    ncols = len(models)
    if ncols == 0:
        # plt.subplots cannot create a grid with zero columns.
        print(f"No models configured for benchmark {benchmark}; skipping.")
        continue
    # squeeze=False always returns a 2-D array, so the single-model case needs no special handling.
    fig, axes = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols*12, 8), layout="constrained", squeeze=False)
    axes = axes.flatten()
    markers = ["o", "^", "s", "d"]
    colors_id = ["lightsalmon", "indianred", "red", "darkred"]
    colors_ood = ["lightskyblue", "royalblue", "blue", "midnightblue"]

    # Defined up front so the legend call below is safe even when every model is skipped.
    legend_elements = []

    # DAE rate of different score functions.
    for ax, model_name in zip(axes, models):
        # Select relevant performance data (FPR95)
        if (benchmark, model_name, "average") not in df_detector_perf.index:
            continue
        # Both robustness tables are needed to draw anything for this model.
        if (df_id is None) or (df_ood is None):
            continue
        data_perf = df_detector_perf.loc[(benchmark, model_name, "average")].round(3).reset_index(drop=True)

        # Process robustness data (DAE rate)
        # Average ID/OoD DAE rate of different score functions.
        data_id_dae = _melt_dae_rates(
            df_id[(df_id["model"]==model_name) & (df_id["perturb_function"]=="average")],
            "id_dae_rate")
        data_ood_dae = _melt_dae_rates(
            df_ood[(df_ood["model"]==model_name) & (df_ood["perturb_function"]=="average") & (df_ood["dataset"]=="average")],
            "ood_dae_rate")

        data_perf_dae = pd.merge(data_perf, data_id_dae, on=["variant", "score_function"])
        data_perf_dae = pd.merge(data_perf_dae, data_ood_dae, on=["variant", "score_function"])
        # Shorten the score-function names used as point labels.
        data_perf_dae["score_function"] = data_perf_dae["score_function"].apply(lambda s: s.replace("Mahalanobis", "MD").replace("KLMatching", "KLM").replace("EnergyBased", "EB").replace("MaxLogit", "ML"))

        # DAE rate - FPR95 plot
        variants = data_perf_dae["variant"].unique()
        id_handles = []
        ood_handles = []
        annos = []
        for v_i, variant in enumerate(variants):
            # Wrap around so more variants than predefined styles cannot raise IndexError.
            s_i = v_i % len(markers)
            data_i = data_perf_dae[data_perf_dae["variant"]==variant]
            x = data_i["FPR95"].to_numpy()
            y1 = data_i["id_dae_rate"].to_numpy()
            y2 = data_i["ood_dae_rate"].to_numpy()
            p = data_i["score_function"].to_numpy()
            sns.scatterplot(data=data_i, x="FPR95", y="id_dae_rate", ax=ax, color=colors_id[s_i], marker=markers[s_i])
            sns.scatterplot(data=data_i, x="FPR95", y="ood_dae_rate", ax=ax, color=colors_ood[s_i], marker=markers[s_i])
            id_handles.append(Line2D([0], [0], marker=markers[s_i], label=variant, markerfacecolor=colors_id[s_i], markersize=10, color="w"))
            ood_handles.append(Line2D([0], [0], marker=markers[s_i], label=variant, markerfacecolor=colors_ood[s_i], markersize=10, color="w"))

            # Label every point with its score function, slightly above the marker.
            # Each series is filtered on its OWN y values so no label is placed at a NaN position.
            annos += [ax.text(xi, yi+0.5, name, color=colors_id[s_i]) for xi, yi, name in zip(x, y1, p) if pd.notna(xi) and pd.notna(yi)]
            annos += [ax.text(xi, yi+0.5, name, color=colors_ood[s_i]) for xi, yi, name in zip(x, y2, p) if pd.notna(xi) and pd.notna(yi)]

        if annos:
            adjust_text(annos, ax=ax, avoid_self=False, explode_radius=0.1, expand=(1.2,1.2), force_text=(0.25,0.25))
        ax.set_xlabel("FPR95 (%) \u2193")
        ax.set_ylabel("DAE rate (%) \u2193")
        ax.set_title(f"Model={model_name}")
        if id_handles:
            # All ID handles first, then all OoD handles, so the two-column legend
            # lists ID entries on the left and OoD entries on the right.
            legend_elements = id_handles + ood_handles

    if legend_elements:
        plt.legend(handles=legend_elements, loc="upper left", bbox_to_anchor=(1.02, 1.0), ncols=2, title="ID                OoD")
    plt.suptitle(f"FPR95 - DAE rate\nBenchmark={benchmark}")
    plt.savefig(os.path.join(save_dir, f"{benchmark}_fpr95_dae_rate.png"))
    plt.show()
    plt.close("all")


In [None]:
# Output directory for the per-model scatter plots and the accuracy - MAE rate plot.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
save_dir = os.path.join("results", "eval", "performance_robustness")
os.makedirs(save_dir, exist_ok=True)

# Load DNN model and OoD detectors' performance metrics.
performance_dir = os.path.join("results", "eval", "performance")
robustness_dir = os.path.join("results", "eval", "robustness")
df_model_perf = None
df_detector_perf = None
file_path = os.path.join(performance_dir, "model_performance.csv")
if os.path.exists(file_path):
    # Indexed by (benchmark, model) so a benchmark's rows can be selected with .loc.
    df_model_perf = pd.read_csv(file_path).set_index(["benchmark", "model"])
file_path = os.path.join(performance_dir, "detector_performance.csv")
if os.path.exists(file_path):
    # Indexed by (benchmark, model, dataset); the loop below selects the "average" dataset rows.
    df_detector_perf = pd.read_csv(file_path).set_index(["benchmark", "model", "dataset"])
# Both tables are required; fail early and say where they were expected.
assert (df_model_perf is not None) and (df_detector_perf is not None), (
    "model_performance.csv and detector_performance.csv must both exist in " + performance_dir
)

def _load_sorted_rate_table(csv_path):
    """Load a robustness rate table, or return None when the CSV file is missing.

    The "perturb_function" and "variant" columns are converted to ordered
    categoricals following `perturb_function_sorter` and `variant_sorter`,
    and the rows are sorted by model and variant.
    """
    if not os.path.exists(csv_path):
        print("File "+csv_path+" does not exist!")
        return None
    table = pd.read_csv(csv_path)
    for column, order in (("perturb_function", perturb_function_sorter), ("variant", variant_sorter)):
        table[column] = table[column].astype("category").cat.set_categories(order, ordered=True)
    return table.sort_values(by=["model", "variant"])


def _melt_dae_rates(table, value_name):
    """Reshape the "dae_*_mean" columns of `table` into long format.

    Returns one row per (variant, score function) with the rate stored in the
    `value_name` column; the "dae_" and "_mean" parts are stripped from the
    score-function names.
    """
    dae_cols = [col for col in table.columns if ("dae_" in col) and ("_mean" in col)]
    melted = table.melt(id_vars=["variant"], value_vars=dae_cols, var_name="score_function", value_name=value_name)
    melted["score_function"] = melted["score_function"].apply(lambda s: s.replace("_mean","").replace("dae_",""))
    return melted


# For every benchmark: one FPR95 - DAE rate figure per model (a subplot per
# variant), followed by a single accuracy - MAE rate figure.
for benchmark in configs["benchmark"]:

    # Load the MAE/DAE rate table on ID dataset.
    df_id = _load_sorted_rate_table(os.path.join(robustness_dir, f"{benchmark.lower()}_id_local_mae_dae_rate_mean_std.csv"))

    # Load the DAE rate table on OoD dataset.
    df_ood = _load_sorted_rate_table(os.path.join(robustness_dir, f"{benchmark.lower()}_ood_local_dae_rate_mean_std.csv"))

    # DAE rate of different score functions.
    for model_name in configs["benchmark"][benchmark]["model"]:
        # Select relevant performance data (FPR95)
        if (benchmark, model_name, "average") not in df_detector_perf.index:
            continue
        # Both robustness tables are needed to draw anything for this model.
        if (df_id is None) or (df_ood is None):
            continue
        data_perf = df_detector_perf.loc[(benchmark, model_name, "average")].round(3).reset_index(drop=True)

        # Process robustness data (DAE rate)
        # Average ID/OoD DAE rate of different score functions.
        data_id_dae = _melt_dae_rates(
            df_id[(df_id["model"]==model_name) & (df_id["perturb_function"]=="average")],
            "id_dae_rate")
        data_ood_dae = _melt_dae_rates(
            df_ood[(df_ood["model"]==model_name) & (df_ood["perturb_function"]=="average") & (df_ood["dataset"]=="average")],
            "ood_dae_rate")

        data_perf_dae = pd.merge(data_perf, data_id_dae, on=["variant", "score_function"])
        data_perf_dae = pd.merge(data_perf_dae, data_ood_dae, on=["variant", "score_function"])
        # Shorten the score-function names used as point labels.
        data_perf_dae["score_function"] = data_perf_dae["score_function"].apply(lambda s: s.replace("Mahalanobis", "MD").replace("KLMatching", "KLM").replace("EnergyBased", "EB").replace("MaxLogit", "ML"))

        # DAE rate - FPR95 plot
        # Distinct variants in order of first appearance.
        variants = list(dict.fromkeys(data_perf_dae["variant"]))
        if not variants:
            # plt.subplots cannot create a grid with zero columns.
            print(f"No FPR95/DAE rate data to plot for benchmark={benchmark}, model={model_name}.")
            continue
        ncols = len(variants)
        nrows = 1
        # squeeze=False always returns a 2-D array, so a single variant needs no special handling.
        fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(ncols*6, nrows*6), layout="constrained", sharey=True, squeeze=False)

        # Exactly one subplot is created per variant.
        for ax, variant in zip(axes.flatten(), variants):
            data_i = data_perf_dae[data_perf_dae["variant"]==variant]
            x = data_i["FPR95"].to_numpy()
            y1 = data_i["id_dae_rate"].to_numpy()
            y2 = data_i["ood_dae_rate"].to_numpy()
            p = data_i["score_function"].to_numpy()
            sns.scatterplot(data=data_i, x="FPR95", y="id_dae_rate", ax=ax, color="red", label="ID", marker="v")
            sns.scatterplot(data=data_i, x="FPR95", y="ood_dae_rate", ax=ax, color="blue", label="OOD", marker="^")

            # Label every point with its score function. Each series is filtered on
            # its OWN y values so no label is placed at a NaN position.
            annos1 = [ax.text(xi, yi, name, color=[0.5, 0, 0]) for xi, yi, name in zip(x, y1, p) if pd.notna(xi) and pd.notna(yi)]
            annos2 = [ax.text(xi, yi, name, color=[0, 0, 0.5]) for xi, yi, name in zip(x, y2, p) if pd.notna(xi) and pd.notna(yi)]
            if annos1 or annos2:
                adjust_text(annos1+annos2, ax=ax, avoid_self=False, explode_radius=0.1, expand=(1.2,1.2), force_text=(0.25,0.25))

            ax.set_xlabel("FPR95 (%) \u2193")
            ax.set_ylabel("DAE rate (%) \u2193")
            ax.set_title(f"Variant={variant}")
            ax.legend()

        plt.suptitle(f"FPR95 - DAE rate\nBenchmark={benchmark}, model={model_name}")
        plt.savefig(os.path.join(save_dir, f"{benchmark}_{model_name}_fpr95_dae_rate.png"))
        plt.show()
        plt.close("all")

    # MAE rate of different model variants.
    # Select relevant performance data (accuracy)
    if benchmark not in df_model_perf.index:
        continue
    data_perf = df_model_perf.loc[benchmark].reset_index()
    # Process robustness data (MAE rate)
    if df_id is not None:
        data_id_mae = df_id[df_id["perturb_function"]=="average"]
        data_id_mae = data_id_mae.melt(id_vars=["model", "variant"], value_vars="mae_mean", var_name="score_function", value_name="mae_rate")
        data_perf_mae = pd.merge(data_perf, data_id_mae, on=["model", "variant"])

        # Accuracy - MAE rate plot
        fig, ax = plt.subplots(figsize=(6, 4), layout="constrained")
        sns.scatterplot(data=data_perf_mae, x="accuracy", y="mae_rate", hue="model", style="variant", ax=ax)
        ax.set_xlabel("Accuracy (%) \u2191")
        ax.set_ylabel("MAE rate (%) \u2193")
        ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1))

        plt.suptitle(f"Model accuracy - MAE rate\nBenchmark={benchmark}")
        plt.savefig(os.path.join(save_dir, f"{benchmark}_accuracy_mae_rate.png"))
        plt.show()
        plt.close("all")

#### 2. Local MAE/DAE rate decomposition.
The local MAE/DAE rates are visualized as radar plots and line charts. All the statistics can be inspected in the tables saved in the `results/eval/robustness/` folder; the generated figures are saved in its `figures/` subfolder.

Specifically, we demonstrate the local MAE/DAE rate in the following 4 types of figures:
- MAE rate
    - Different model variants (color) under various perturbations (theta).
- ID DAE rate
    - Different model variants (color) under various perturbations (theta).
- OoD DAE rate
    - Different model variants (color) under various perturbations (theta).
    - Different model variants (color) tested on various OoD datasets (theta).
    - For each model variant, we also demonstrate the details of DAE rate on different OoD datasets (color) under various perturbations (theta).
- Compare the relationship among the three robustness metrics for each model variant
    - MAE rate, ID DAE rate, and OoD DAE rate (color).
    - Under different perturbations (theta).

In [None]:
# Set the refresh flag to True to regenerate all the figures.
refresh = True

# Load DNN model and OoD detectors' performance metrics.
performance_dir = os.path.join("results", "eval", "performance")
robustness_dir = os.path.join("results", "eval", "robustness")
df_model_perf = None
# Reset explicitly: without this, a missing CSV would either raise NameError
# later or silently reuse a frame left over from an earlier cell, which is
# indexed without the "score_function" level.
df_detector_perf = None
file_path = os.path.join(performance_dir, "model_performance.csv")
if os.path.exists(file_path):
    # Indexed by (benchmark, model) for per-model accuracy lookups.
    df_model_perf = pd.read_csv(file_path).set_index(["benchmark", "model"])
file_path = os.path.join(performance_dir, "detector_performance.csv")
if os.path.exists(file_path):
    # Indexed down to the score function for per-detector lookups.
    df_detector_perf = pd.read_csv(file_path).set_index(["benchmark", "model", "dataset", "score_function"])

# Figures are written below the robustness results folder, which must already exist.
save_dir = os.path.join("results", "eval", "robustness")
assert os.path.exists(save_dir), f"Robustness results directory {save_dir} does not exist."

def _read_rate_table(csv_path):
    """Read a robustness rate table from `csv_path`.

    The "perturb_function" and "variant" columns are converted to ordered
    categoricals following `perturb_function_sorter` and `variant_sorter`.
    """
    table = pd.read_csv(csv_path)
    for column, order in (("perturb_function", perturb_function_sorter), ("variant", variant_sorter)):
        table[column] = table[column].astype("category").cat.set_categories(order, ordered=True)
    return table


def _variant_accuracies(benchmark, model_name):
    """Return a {variant: accuracy} dict for one model, or {} when no data is available."""
    if df_model_perf is None or (benchmark, model_name) not in df_model_perf.index:
        return dict()
    model_perf = df_model_perf.loc[[(benchmark, model_name)]].copy().round(3).reset_index(drop=True).set_index("variant")
    model_perf.index.name = None
    return model_perf.to_dict()["accuracy"]


def _detector_performances(benchmark, model_name):
    """Return {score function: {variant: {column: value}}} for the "average" dataset rows.

    A score function without rows in `df_detector_perf` maps to an empty dict.
    """
    performances = dict()
    for score_func in score_functions:
        key = (benchmark, model_name, "average", score_func)
        detector_perf = dict()
        if (df_detector_perf is not None) and (key in df_detector_perf.index):
            detector_perf = df_detector_perf.loc[[key]].copy().round(3).reset_index(drop=True).set_index("variant").T.to_dict()
        performances[score_func] = detector_perf
    return performances


def _rate_names(data, marker):
    """Return the "*_mean" column names containing `marker`, with "_mean" removed."""
    return [col.replace("_mean", "") for col in data.columns if (marker in col) and ("_mean" in col)]


def _detector_subtitles(detector_names, detector_performances, variant=None, prefix="DAE rate - "):
    """Build one subplot subtitle per detector.

    With `variant=None` the subtitle lists the performance of every variant,
    one per line; otherwise only that variant's entry is shown. Detectors
    (or variants) without performance data get the bare name.
    """
    subtitles = []
    for name in detector_names:
        perf = detector_performances.get(name)
        if perf is None or (variant is not None and variant not in perf):
            subtitles.append(f"{prefix}{name}")
            continue
        if variant is None:
            perf_str = "\n".join([f"{k}:{v}" for k, v in perf.items()])
        else:
            perf_str = perf[variant]
        subtitles.append(f"{prefix}{name}\n Detector performance:\n {perf_str}")
    return subtitles


def _ordered_variants(column):
    """Return the distinct non-missing values of `column` in order of first appearance."""
    return list(dict.fromkeys(column.dropna()))


def _draw_radar_and_line(data, theta, r, hue, title, stem, subtitles=None):
    """Draw the radar plot and the matching line chart for the same data.

    Both figures go to `<save_dir>/figures/` as `<stem>_radar_plot.png` and
    `<stem>_line_chart.png`. `subtitles` is forwarded to the radar plot only,
    and only when given.
    """
    radar_kwargs = dict() if subtitles is None else {"subtitle": subtitles}
    draw_radar_plot(data=data, theta=theta, r=r, hue=hue, title=title,
                    save_path=os.path.join(save_dir, "figures", f"{stem}_radar_plot.png"),
                    refresh=refresh, **radar_kwargs)
    draw_line_chart(data=data, x=theta, y=r, hue=hue, errorbar=True, title=title,
                    save_path=os.path.join(save_dir, "figures", f"{stem}_line_chart.png"),
                    refresh=refresh)


for benchmark in configs["benchmark"]:
    bench_key = benchmark.lower()
    model_names = configs["benchmark"][benchmark]["model"]

    # ID dataset
    # Load the MAE/DAE rate table on ID dataset.
    df_id = None
    file_path = os.path.join(robustness_dir, f"{bench_key}_id_local_mae_dae_rate_mean_std.csv")
    if os.path.exists(file_path):
        df_id = _read_rate_table(file_path)

        for model_name in model_names:
            accuracies = _variant_accuracies(benchmark, model_name)
            detector_performances = _detector_performances(benchmark, model_name)
            stem = f"{bench_key}_{model_name}"
            context = f"Benchmark={benchmark}, model={model_name}, model accuracy (%):{accuracies}."

            data = df_id[df_id["model"]==model_name].sort_values(by=["perturb_function", "variant"]).copy()

            # MAE rate of different model variants under various perturbations.
            _draw_radar_and_line(data, theta="perturb_function", r="mae", hue="variant",
                                 title=f"Average MAE rate (%) under different functional perturbations on ID dataset.\n{context}",
                                 stem=f"{stem}_id_mae_perturb_function")

            # ID DAE rate of different model variants under various perturbations.
            dae_names = _rate_names(data, "dae_")
            detector_names = [dae_name.replace("dae_", "") for dae_name in dae_names]
            _draw_radar_and_line(data, theta="perturb_function", r=dae_names, hue="variant",
                                 title=f"Average DAE rate (%) under different functional perturbations on ID dataset.\n{context}",
                                 stem=f"{stem}_id_dae_perturb_function",
                                 subtitles=_detector_subtitles(detector_names, detector_performances))
    else:
        print("File "+file_path+" does not exist!")

    # OOD dataset
    # Load the DAE rate table on OoD dataset.
    df_ood = None
    file_path = os.path.join(robustness_dir, f"{bench_key}_ood_local_dae_rate_mean_std.csv")
    if os.path.exists(file_path):
        df_ood = _read_rate_table(file_path)

        for model_name in model_names:
            accuracies = _variant_accuracies(benchmark, model_name)
            detector_performances = _detector_performances(benchmark, model_name)
            stem = f"{bench_key}_{model_name}"
            context = f"Benchmark={benchmark}, model={model_name}, model accuracy (%):{accuracies}."

            # OoD DAE rate of different model variants under various perturbations.
            data = df_ood[(df_ood["model"]==model_name) & (df_ood["dataset"]=="average")].copy()
            data = data.sort_values(by=["perturb_function", "variant"]).copy()
            dae_names = _rate_names(data, "dae_")
            detector_names = [dae_name.replace("dae_", "") for dae_name in dae_names]
            _draw_radar_and_line(data, theta="perturb_function", r=dae_names, hue="variant",
                                 title=f"Average DAE rate (%) under different functional perturbations on OoD dataset.\n{context}",
                                 stem=f"{stem}_ood_dae_perturb_function",
                                 subtitles=_detector_subtitles(detector_names, detector_performances))

            # OoD DAE rate of different model variants on various OoD datasets.
            data = df_ood[(df_ood["model"]==model_name) & (df_ood["perturb_function"]=="average")].copy()
            data = data.sort_values(by=["variant"]).copy()
            dae_names = _rate_names(data, "dae_")
            detector_names = [dae_name.replace("dae_", "") for dae_name in dae_names]
            _draw_radar_and_line(data, theta="dataset", r=dae_names, hue="variant",
                                 title=f"Average DAE rate (%) on different OoD datasets.\n{context}",
                                 stem=f"{stem}_ood_dae_dataset",
                                 subtitles=_detector_subtitles(detector_names, detector_performances))

            # Demonstrate the details of DAE rate on each OoD dataset for each model variant.
            # Missing variants (values outside variant_sorter become NaN) are skipped.
            for variant in _ordered_variants(data["variant"]):
                details = df_ood[(df_ood["model"]==model_name) & (df_ood["dataset"]!="average") & (df_ood["variant"]==variant)].copy()
                details = details.sort_values(by=["perturb_function"]).copy()
                dae_names = _rate_names(details, "dae_")
                detector_names = [dae_name.replace("dae_", "") for dae_name in dae_names]
                _draw_radar_and_line(details, theta="perturb_function", r=dae_names, hue="dataset",
                                     title=f"Average DAE rate (%) under different functional perturbations on various OoD datasets.\nBenchmark={benchmark}, model={model_name}, variant={variant}, model accuracy (%):{accuracies.get(variant, 'N/A')}.",
                                     stem=f"{stem}_{variant}_ood_dae_details",
                                     subtitles=_detector_subtitles(detector_names, detector_performances, variant=variant))
    else:
        print("File "+file_path+" does not exist!")


    # Compare the relationship among the three: MAE rate, ID DAE rate, and OoD DAE rate.
    # Under different perturbations for each model variant.
    if (df_id is None) or (df_ood is None):
        continue

    # Pair every ID row with the OoD row averaged over datasets; columns present
    # in both tables get the "_id"/"_ood" suffixes.
    df_id_ood = pd.merge(df_id, df_ood[df_ood["dataset"]=="average"],
                         on=["model", "variant", "perturb_function"],
                         how='inner', suffixes=["_id", "_ood"])

    # Assemble the MAE/DAE table: one row per (model, variant, perturbation, ae_type)
    # and one "ae_<score function>_mean" (optionally "_std") column per score function.
    merge_keys = ["model", "variant", "perturb_function", "ae_type"]
    df_all = None
    for score_func in score_functions:
        mean_id, mean_ood = f"dae_{score_func}_mean_id", f"dae_{score_func}_mean_ood"
        if (mean_id not in df_id_ood.columns) or (mean_ood not in df_id_ood.columns):
            continue
        df = df_id_ood.melt(id_vars=["model", "variant", "perturb_function"],
                            value_vars=["mae_mean", mean_id, mean_ood],
                            var_name="ae_type", value_name=f"ae_{score_func}_mean")
        df = df.replace("mae_mean", "mae").replace(mean_id, "dae_id").replace(mean_ood, "dae_ood")

        std_id, std_ood = f"dae_{score_func}_std_id", f"dae_{score_func}_std_ood"
        if (std_id in df_id_ood.columns) and (std_ood in df_id_ood.columns):
            df_std = df_id_ood.melt(id_vars=["model", "variant", "perturb_function"],
                                    value_vars=["mae_std", std_id, std_ood],
                                    var_name="ae_type", value_name=f"ae_{score_func}_std")
            df_std = df_std.replace("mae_std", "mae").replace(std_id, "dae_id").replace(std_ood, "dae_ood")
            df = pd.merge(df, df_std, on=merge_keys).copy()

        if df_all is None:
            df_all = df.copy()
        else:
            df_all = pd.merge(df_all, df, on=merge_keys).copy()

    if df_all is None:
        continue

    # Re-apply the ordered categories on the assembled table.
    df_all["perturb_function"] = df_all["perturb_function"].astype("category")
    df_all["perturb_function"] = df_all["perturb_function"].cat.set_categories(perturb_function_sorter, ordered=True)
    df_all["variant"] = df_all["variant"].astype("category")
    df_all["variant"] = df_all["variant"].cat.set_categories(variant_sorter, ordered=True)
    variants = _ordered_variants(df_all["variant"])

    for model_name in model_names:
        accuracies = _variant_accuracies(benchmark, model_name)
        detector_performances = _detector_performances(benchmark, model_name)

        for variant in variants:
            data = df_all[(df_all["model"]==model_name) & (df_all["variant"]==variant)].copy()
            data = data.sort_values(by=["perturb_function", "ae_type"]).copy()
            # MAE, ID/OoD DAE rate under different perturbations for each model variant.
            dae_names = _rate_names(data, "ae_")
            if not dae_names:
                continue
            detector_names = [dae_name.replace("ae_", "") for dae_name in dae_names]
            # The variant is named in the title because one figure is drawn per
            # variant, matching the per-variant detail plots above.
            _draw_radar_and_line(data, theta="perturb_function", r=dae_names, hue="ae_type",
                                 title=f"Average MAE vs ID/OoD DAE rate (%) under different functional perturbations.\nBenchmark={benchmark}, model={model_name}, variant={variant}, model accuracy (%):{accuracies.get(variant, 'N/A')}.",
                                 stem=f"{bench_key}_{model_name}_{variant}_id_ood_mae_dae_perturb_function",
                                 subtitles=_detector_subtitles(detector_names, detector_performances, variant=variant, prefix=""))


