### This notebook evaluates the effectiveness of randomized smoothing methods applied to OoD detectors.

The raw experiment results must be stored in the `results/benchmark/rand_seed/model/variant/dataset/rs_scores/` folder beforehand (the notebook also reads the unperturbed scores from the sibling `scores/` folder). To regenerate the OoD scores, please run `randomized_smoothing_test.py`.

Specifically, this notebook analyses the raw experiment results and summarises the OoD detectors' performance (FPR95) and robustness (average DAE rate) under three evaluation methods:
- Normal test
- Randomized smoothing - Majority voting 
- Randomized smoothing - Averaging

The result files are saved in the `results/eval/randomized_smoothing/` folder.

In [237]:
import numpy as np
import yaml
import pandas as pd
import os
import time
import torch
from torch.utils.data import DataLoader

# Everything below uses paths relative to the directory that holds `config.yaml`
# (and the `utils` package imported right after this), presumably the parent of
# the notebook folder. Only climb one level when that directory has not been
# reached yet, so that re-running this cell does not keep moving further up.
# NOTE(review): assumes the notebook folder itself contains no `config.yaml` - confirm.
if not os.path.exists("config.yaml"):
    os.chdir(os.path.dirname(os.getcwd()))
print("Current working directory: ", os.getcwd())
from utils.eval import get_thr_tpr

def df_sorted(df, sorter_dict):
    """Return a copy of `df` sorted by custom per-column orderings.

    Args:
        df: DataFrame to sort. It is left unmodified.
        sorter_dict: Mapping of column name -> list giving the desired order
            of that column's values. The columns are used as sort keys in the
            mapping's iteration order. Values that do not appear in a
            column's list become NaN in the returned frame.

    Returns:
        A new DataFrame, sorted by the given columns, in which every key
        column is an ordered categorical.
    """
    # Work on a copy: converting the key columns to categoricals must not
    # leak into the caller's frame.
    df = df.copy()
    for col, sorter in sorter_dict.items():
        df[col] = df[col].astype("category").cat.set_categories(sorter, ordered=True)
    return df.sort_values(list(sorter_dict.keys()))

# Read the experiment settings from `config.yaml` in the working directory.
with open('config.yaml', 'r') as f:
    configs = yaml.safe_load(f)

# Expose the individual settings as module-level names.
(score_functions, perturb_functions, rand_seed,
 batch_size, n_seeds, n_sampling) = (
    configs[key]
    for key in ("score_functions", "perturb_functions", "rand_seed",
                "batch_size", "n_seeds", "n_sampling")
)
n_rs = configs["n_randomized_smoothing"]

# Row order of the "method" column in the saved result tables.
method_sorter = ["seed", "perb", "voting", "avg"]

# ---------------------------------------------------------------------------
# For every benchmark / model / variant in the config, combine the "seed"
# decisions (scores/temp_scores.csv) with the randomized-smoothing decisions
# (rs_scores/perb_<perturbation>_scores.csv) and write up to four CSV
# summaries per variant: accuracy, FPR95, DAE rate and MAE.
# ---------------------------------------------------------------------------


def _with_dataset_average(df):
    """Append per-(method, detector) mean rows, labelled dataset="average", to `df`."""
    # numeric_only=True keeps the string "dataset" column out of the mean;
    # without it pandas >= 2.0 raises a TypeError.
    df_mean = df.groupby(["method", "detector"]).mean(numeric_only=True).reset_index()
    df_mean["dataset"] = "average"
    return pd.concat([df, df_mean], axis=0, ignore_index=True)


def _split_detector_method(df, name_col):
    """Replace `name_col` by a "method" (last "_" token) and a "detector" (first "_" token) column."""
    # NOTE(review): taking the first token assumes detector names contain no
    # underscore - confirm against `score_functions`.
    names = df[name_col]
    df = df.drop(name_col, axis=1)
    df["method"] = names.apply(lambda x: x.split("_")[-1])
    df["detector"] = names.apply(lambda x: x.split("_")[0])
    return df


# The output folder does not depend on any loop variable: create it once.
save_dir = os.path.join("results", "eval", "randomized_smoothing")
os.makedirs(save_dir, exist_ok=True)

for benchmark in configs["benchmark"]:
    print("========================================")
    print("Benchmark:", benchmark)

    ood_datasets = configs["benchmark"][benchmark]["ood_datasets"]
    dataset_sorter = ["average", benchmark] + ood_datasets
    table_sorters = {"detector": score_functions, "dataset": dataset_sorter, "method": method_sorter}

    for model_name in configs["benchmark"][benchmark]["model"]:
        print("========================================")
        print("Model:", model_name)
        for variant, weight_name in configs["benchmark"][benchmark]["model"][model_name].items():
            print("----------------------------------------")
            print("Variant:", variant)

            # Per-dataset tables are collected in lists and concatenated once,
            # instead of growing a DataFrame from an empty one inside the loop.
            fpr95_frames = []
            dae_frames = []
            df_acc = pd.DataFrame()
            df_mae = pd.DataFrame()

            # Detection thresholds come from the in-distribution scores of this variant.
            rlt_dir = os.path.join("results", benchmark.lower(), str(rand_seed), model_name, variant, benchmark, "scores")
            filepath_all = os.path.join(rlt_dir, "temp_all_scores.csv")
            if not os.path.exists(filepath_all):
                print("Skipped, file not found:", filepath_all)
                continue
            thr_dict = get_thr_tpr(filepath_all, score_functions=score_functions, tpr=0.95)

            for dataset in [benchmark] + ood_datasets:
                print("Dataset:", dataset)
                is_ood_dataset = dataset != benchmark
                dataset_dir = os.path.join("results", benchmark.lower(), str(rand_seed), model_name, variant, dataset)
                rlt_dir = os.path.join(dataset_dir, "scores")
                rs_rlt_dir = os.path.join(dataset_dir, "rs_scores")
                filepath = os.path.join(rlt_dir, "temp_scores.csv")
                if not os.path.exists(filepath):
                    print("Skipped, file not found:", filepath)
                    continue

                # Seed decisions: a sample is flagged when its score exceeds the threshold.
                df_temp = pd.read_csv(filepath)
                df_temp_ood = df_temp[["idx", "y_true", "y_pred"]].copy()
                for score_func in score_functions:
                    score_col = f"{score_func}_score"
                    if (score_col in df_temp.columns) and (score_col in thr_dict):
                        df_temp_ood[f"{score_func}_ood"] = df_temp[score_col] > thr_dict[score_col]

                # Randomized-smoothing decisions, one file per perturbation function.
                perb_frames = []
                for perb_func in perturb_functions:
                    filepath_rs = os.path.join(rs_rlt_dir, f"perb_{perb_func}_scores.csv")
                    if not os.path.exists(filepath_rs):
                        continue
                    df_perb = pd.read_csv(filepath_rs)
                    cols = [col for col in df_perb.columns if "_score" in col]
                    df_perb = df_perb.drop(cols, axis=1)
                    df_perb["perturb_function"] = perb_func
                    perb_frames.append(df_perb)

                if not perb_frames:
                    print("Skipped, no randomized smoothing scores in:", rs_rlt_dir)
                    continue
                df_perb_rs = pd.concat(perb_frames, axis=0, ignore_index=True)
                if len(df_perb_rs) == 0:
                    continue

                # Columns present on both sides get "_perb" (smoothing file) / "_seed" (seed file).
                df_ood = pd.merge(df_perb_rs, df_temp_ood, on="idx", suffixes=("_perb", "_seed"))

                # Calculate FPR95 (only for OoD datasets): share of samples that are not flagged.
                if is_ood_dataset:
                    cols = [col for col in df_ood.columns if "_ood" in col]
                    df_fpr95 = df_ood[cols].apply(lambda x: ~x).mean() * 100
                    df_fpr95.name = "fpr95"
                    df_fpr95 = df_fpr95.to_frame()
                    df_fpr95.index.name = "detector_method"
                    df_fpr95 = _split_detector_method(df_fpr95.reset_index(), "detector_method")
                    df_fpr95["dataset"] = dataset
                    fpr95_frames.append(df_fpr95)

                # Calculate model accuracy and MAE rate (only for the ID dataset).
                if not is_ood_dataset:
                    pred_cols = [col for col in df_ood.columns if "y_pred" in col]
                    # True where the prediction matches the label; kept in its own
                    # frame so that `df_ood` is not overwritten.
                    df_correct = df_ood[pred_cols].eq(df_ood["y_true"], axis=0)

                    df_acc = df_correct.mean() * 100
                    df_acc.name = "accuracy"
                    df_acc = df_acc.to_frame()
                    df_acc.index.name = "method"
                    df_acc = df_acc.reset_index()
                    df_acc["method"] = df_acc["method"].apply(lambda x: x.split("_")[-1])

                    # MAE: share of wrong predictions, reported for every method except the seed.
                    df_mae = df_correct.drop("y_pred_seed", axis=1).apply(lambda x: ~x).mean() * 100
                    df_mae.name = "mae"
                    df_mae = df_mae.to_frame()
                    df_mae.index.name = "method"
                    df_mae = df_mae.reset_index()
                    df_mae["method"] = df_mae["method"].apply(lambda x: x.split("_")[-1])

                # Calculate average DAE rate
                dae_parts = []
                for score_func in score_functions:
                    if f"{score_func}_ood_seed" in df_ood.columns:
                        # select correctly detected seeds (flagged for OoD data, not flagged for ID data)
                        df_ood_ = df_ood[df_ood[f"{score_func}_ood_seed"] == is_ood_dataset]
                        df_ood_ = df_ood_[[f"{score_func}_ood_perb",
                                           f"{score_func}_ood_voting", f"{score_func}_ood_avg"]]

                        # a detection error is a decision that differs from the (correct) seed decision
                        df_dae_ = df_ood_ == (not is_ood_dataset)

                        dae_parts.append(pd.DataFrame({"dae_mean": df_dae_.mean() * 100,
                                                       "dae_std": df_dae_.std() * 100}))

                if dae_parts:
                    df_dae = pd.concat(dae_parts, axis=0)
                    df_dae.index.name = "detector_method"
                    df_dae = _split_detector_method(df_dae.reset_index(), "detector_method")
                    df_dae["dataset"] = dataset
                    dae_frames.append(df_dae)

            # Build, sort and collect every table that actually has content.
            # NOTE(review): the file names below do not contain the benchmark, so two
            # benchmarks sharing a model/variant name write to the same files - confirm intended.
            outputs = []
            if not df_acc.empty:
                outputs.append(("acc", df_sorted(df_acc, {"method": method_sorter})))

            if fpr95_frames:
                df_fpr95_all = pd.concat(fpr95_frames, axis=0, ignore_index=True)
                # reorder the columns for better inspection
                df_fpr95_all = df_fpr95_all[["detector", "dataset", "method", "fpr95"]]
                # add the average across all OoD datasets
                df_fpr95_all = _with_dataset_average(df_fpr95_all)
                outputs.append(("fpr95", df_sorted(df_fpr95_all, table_sorters)))

            if dae_frames:
                df_dae_all = pd.concat(dae_frames, axis=0, ignore_index=True)
                df_dae_all = df_dae_all[["detector", "dataset", "method", "dae_mean", "dae_std"]]
                # NOTE(review): this average also includes the ID dataset rows - confirm intended.
                df_dae_all = _with_dataset_average(df_dae_all)
                outputs.append(("dae_rate", df_sorted(df_dae_all, table_sorters)))

            if not df_mae.empty:
                outputs.append(("mae", df_sorted(df_mae, {"method": method_sorter})))

            if not outputs:
                print("No randomized smoothing results found, nothing saved.")
                continue

            # save the results
            for suffix, table in outputs:
                table.to_csv(os.path.join(save_dir, f"{model_name}_{variant}_{suffix}.csv"), index=False)

            print("Results saved!")
            

Model: wrn_40_2 Variant: NT
Dataset: CIFAR10
Dataset: Textures
Dataset: SVHN
Dataset: LSUN-C
Dataset: LSUN-R
Dataset: iSUN
Dataset: Places365
Results saved!
