In [91]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_evaluations(base_dir="evals"):
    """Load every saved evaluation array found under ``base_dir``.

    The expected layout is ``<base_dir>/<model>/<xai_method>_evaluation.npy``.
    Every sub-directory of ``base_dir`` is treated as a model; entries of
    ``base_dir`` that are not directories are skipped, as are files whose
    name does not end in ``_evaluation.npy``.

    Parameters
    ----------
    base_dir : str, optional
        Directory that holds one sub-directory per model.

    Returns
    -------
    dict
        ``{model: {xai_method: loaded}}`` where ``loaded`` is whatever
        ``np.load`` returns for the corresponding file. A model directory
        without any matching file maps to an empty dict.

    Raises
    ------
    FileNotFoundError
        If ``base_dir`` does not exist (propagated from ``os.listdir``).
    """
    suffix = '_evaluation.npy'
    evaluations = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting dict order (and anything derived from it) reproducible.
    for model in sorted(os.listdir(base_dir)):
        model_path = os.path.join(base_dir, model)
        if not os.path.isdir(model_path):
            continue
        evaluations[model] = {}
        for file in sorted(os.listdir(model_path)):
            if not file.endswith(suffix):
                continue
            # Strip the whole suffix instead of splitting on the first '_':
            # a method name that itself contains an underscore would
            # otherwise be truncated, and two such methods sharing a first
            # token would silently overwrite each other.
            xai_method = file[:-len(suffix)]
            file_path = os.path.join(model_path, file)
            evaluations[model][xai_method] = np.load(file_path)
    return evaluations

# Load all evaluations using load_evaluations' default base_dir ("evals");
# the result is a nested dict keyed first by model, then by XAI method.
all_evaluations = load_evaluations()

# Compute averages and standard deviations
def compute_stats(eval_array):
    """Return the mean and standard deviation of ``eval_array`` along axis 0.

    Parameters
    ----------
    eval_array : array_like
        Values to summarise; the reduction runs over the first axis.

    Returns
    -------
    tuple
        ``(mean, std)``, each computed with ``axis=0``. ``std`` uses
        ``np.std``'s default (population, ``ddof=0``) normalisation.
    """
    axis0_mean = np.mean(eval_array, axis=0)
    axis0_std = np.std(eval_array, axis=0)
    return axis0_mean, axis0_std

# Nested mapping model -> XAI method -> compute_stats(...) result,
# mirroring the structure of all_evaluations.
stats = {
    model_name: {
        method_name: compute_stats(scores)
        for method_name, scores in per_method.items()
    }
    for model_name, per_method in all_evaluations.items()
}

# TODO: create the summary tables (placeholder, not yet implemented)
# ... (code to create pandas DataFrames for the tables)

# TODO: visualizations (placeholder, not yet implemented)
# ... (code for heatmaps, bar plots, etc.)

In [100]:
# Spot-check: display the loaded evaluation array for one model / XAI method pair.
all_evaluations["scibert"]["integrated-gradient"]

array([[[-5.17497247e-04, -2.75085494e-02,  1.65611715e-01],
        [-6.61310740e-04, -1.57911591e-02,  5.92347662e-02],
        [-1.63039868e-03, -5.39106578e-02,  1.26877657e-01],
        ...,
        [-9.11259616e-04, -4.04553488e-03,  1.09594218e-01],
        [ 1.71331479e-03, -1.60302203e-02,  1.44260746e-01],
        [-7.89176207e-03, -5.85657405e-03, -1.25743977e-01]],

       [[ 1.32402265e-02,  3.88212688e-02,  9.92081975e-02],
        [-2.91739497e-03,  3.79507616e-03,  1.25756870e-02],
        [ 2.72252291e-01, -1.41619295e-01,  2.12234125e-01],
        ...,
        [ 8.27364996e-03, -1.00795096e-02,  8.18195932e-02],
        [-2.22803769e-03, -2.19531707e-03, -1.87859028e-02],
        [ 1.72087329e-03, -1.35288411e-03, -7.34358019e-02]],

       [[-1.05526822e-03, -3.97718288e-02,  6.42270351e-02],
        [ 6.49397913e-03, -4.46628556e-02,  2.12471994e-01],
        [ 2.61562946e-03, -3.27241048e-02,  6.16131441e-02],
        ...,
        [-1.20847579e-03, -1.43496273e-02,

In [99]:
# Verify no score is NaN
def has_nan(arr):
    """Report whether at least one element of ``arr`` is NaN."""
    nan_mask = np.isnan(arr)
    return np.any(nan_mask)

# Walk every (model, XAI method) pair and report the ones whose scores
# contain a NaN; pairs without NaNs produce no output.
for model, method_scores in all_evaluations.items():
    for xai_method, scores in method_scores.items():
        if not has_nan(scores):
            continue
        print(f"Model {model} with XAI method {xai_method} has NaNs")