In [26]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ccs.files import ccs_reporter_dir


In [15]:
def load_experiment_results(paths):
    """
    Load experiment results from given paths in the ccs_reporters_dir.
    Can handle paths to both individual experiment directories and sweep directories.

    Every directory below each path that contains an ``eval.csv`` is treated
    as one experiment. Its ``eval.csv``, ``lr_eval.csv`` and (optional)
    ``lm_eval.csv`` are placed side by side, with their column names prefixed
    by ``eval_``, ``lr_eval_`` and ``lm_eval_`` respectively. When
    ``lm_eval.csv`` is missing, a single all-NA row with the same metric names
    as ``eval.csv`` is used instead, so the ``lm_eval_*`` columns always exist.

    Args:
    paths (list of str): Relative paths in the ccs_reporters_dir.

    Returns:
    pd.DataFrame: DataFrame with the experiment results, plus ``model``,
        ``dataset`` and ``path`` columns derived from the directory layout.

    Raises:
    ValueError: If an experiment directory is not 4 or 5 levels below the
        root directory, or if no experiment results were found at all.
    FileNotFoundError: If an experiment directory has no ``lr_eval.csv``.
    """
    root_dir = ccs_reporter_dir().as_posix()
    all_data = []

    for path in paths:
        full_path = os.path.join(root_dir, path)

        for dirpath, _dirnames, filenames in os.walk(full_path):
            # Work on the path relative to the root so that the name of the
            # root directory itself can never influence the filter or parsing.
            rel_path = os.path.relpath(dirpath, root_dir)

            # TODO: skipping transfer results for now.
            if "transfer" in rel_path:
                continue
            if "eval.csv" not in filenames:
                continue

            # Extract model and dataset from the path
            parts = rel_path.split(os.sep)
            dataset = parts[-1]
            if len(parts) == 4:
                model = parts[-2]
            elif len(parts) == 5:
                # Two-level model names ("org/name") keep a forward slash
                # regardless of the platform's path separator.
                model = "/".join(parts[-3:-1])
            else:
                raise ValueError(f"Unexpected path: {dirpath}")

            # Load eval.csv; keep the unprefixed metric names because the
            # lm_eval placeholder below has to be derived from them.
            eval_raw = pd.read_csv(os.path.join(dirpath, "eval.csv"))
            metric_names = list(eval_raw.columns)
            eval_df = eval_raw.add_prefix("eval_")

            # Load lr_eval.csv
            lr_eval_df = pd.read_csv(
                os.path.join(dirpath, "lr_eval.csv")
            ).add_prefix("lr_eval_")

            # Load lm_eval.csv if exists
            lm_eval_file = os.path.join(dirpath, "lm_eval.csv")
            if os.path.exists(lm_eval_file):
                lm_eval_df = pd.read_csv(lm_eval_file).add_prefix("lm_eval_")
            else:
                # Prefix the *raw* metric names: prefixing the already renamed
                # eval columns would yield "lm_eval_eval_*" columns.
                placeholder_columns = ["lm_eval_" + name for name in metric_names]
                lm_eval_df = pd.DataFrame(
                    [[pd.NA] * len(placeholder_columns)],
                    columns=placeholder_columns,
                )

            # Combine all dataframes
            combined_df = pd.concat([eval_df, lr_eval_df, lm_eval_df], axis=1)
            combined_df["model"] = model
            combined_df["dataset"] = dataset
            combined_df["path"] = "/".join(parts)

            all_data.append(combined_df)

    if not all_data:
        # pd.concat([]) would raise a ValueError as well, but with a message
        # that does not point at the actual cause.
        raise ValueError(
            f"No experiment results (eval.csv) found under {root_dir} "
            f"for paths: {list(paths)}"
        )

    return pd.concat(all_data, ignore_index=True)


In [16]:
# Sweep directories to aggregate, given relative to the directory returned by
# ccs_reporter_dir() (load_experiment_results joins them onto that root).
paths = [
    "sweeps/fervent-heisenberg",
    "sweeps/thirsty-wing",
    "sweeps/gallant-davinci",
]
# One combined DataFrame covering every experiment found under these sweeps.
results_df = load_experiment_results(paths)


In [19]:
# Distinct model names present in the loaded results.
results_df["model"].unique()


array(['microsoft/deberta-v2-xxlarge-mnli', 'EleutherAI/gpt-j-6B',
       'gpt2-xl'], dtype=object)

In [1]:
def plot_accuracy_by_model(df, usetex=True):
    """
    Creates a figure with subplots for each model showing grouped bar plots
    of the average accuracy of CCS, LR, and Zero-Shot for each dataset.

    The three methods are drawn side by side (one bar per method and dataset)
    rather than on top of each other. The plot style and rc settings are only
    active while this function runs, so other figures in the session keep
    their own styling.

    Args:
    df (pd.DataFrame): DataFrame with experiment results. Must contain the
        columns ``model`` and ``dataset``; the metric columns
        ``eval_cal_acc_estimate``, ``lr_eval_cal_acc_estimate`` and
        ``lm_eval_cal_acc_estimate`` are read when present, and a metric
        column that is absent is plotted as missing.
    usetex (bool): Whether to render text with LaTeX. Defaults to True.

    Raises:
    ValueError: If ``df`` contains no rows to plot.
    """
    # Metric column -> legend label, in the order the bars are drawn.
    metric_labels = {
        "eval_cal_acc_estimate": "CCS",
        "lr_eval_cal_acc_estimate": "LR",
        "lm_eval_cal_acc_estimate": "Zero-Shot",
    }
    palette = {"CCS": "blue", "LR": "orange", "Zero-Shot": "green"}
    metric_columns = list(metric_labels)
    methods = list(metric_labels.values())

    # Filter relevant columns. Metrics are coerced to float because missing
    # results may be stored as pd.NA in object-dtype columns, which cannot be
    # averaged reliably.
    summary = df[["model", "dataset"]].copy()
    for column in metric_columns:
        if column in df.columns:
            summary[column] = pd.to_numeric(df[column], errors="coerce")
        else:
            summary[column] = float("nan")

    # Group by model and dataset and calculate mean
    grouped_df = summary.groupby(["model", "dataset"], as_index=False).mean()

    # Find unique models
    models = grouped_df["model"].unique()
    if len(models) == 0:
        raise ValueError("No experiment results to plot: the DataFrame is empty.")

    # Use one fixed dataset order for every subplot; the x axis is shared, so
    # each category has to sit at the same position on all axes.
    datasets = sorted(grouped_df["dataset"].unique())

    # Long form: one row per (model, dataset, method), as needed for `hue`.
    long_df = grouped_df.melt(
        id_vars=["model", "dataset"],
        value_vars=metric_columns,
        var_name="method",
        value_name="accuracy",
    )
    long_df["method"] = long_df["method"].map(metric_labels)

    # Set latex style for plots, scoped to this figure only.
    # NOTE(review): with usetex=True, labels containing TeX special characters
    # (e.g. "_" in a model or dataset name) may fail to render - confirm for
    # the names in use, or pass usetex=False.
    rc_overrides = {"text.usetex": usetex, "font.family": "serif"}
    with plt.style.context("seaborn-v0_8-poster"), plt.rc_context(rc_overrides):
        # Create subplots; squeeze=False keeps `axes` an array even for a
        # single model.
        fig, axes = plt.subplots(
            len(models),
            1,
            figsize=(10, 5 * len(models)),
            sharex=True,
            squeeze=False,
        )
        axes = axes.ravel()

        for ax, model in zip(axes, models):
            # Plot grouped bar plot for this model
            sns.barplot(
                data=long_df[long_df["model"] == model],
                x="dataset",
                y="accuracy",
                hue="method",
                order=datasets,
                hue_order=methods,
                palette=palette,
                ax=ax,
            )

            # A single figure-level legend is added below instead.
            axis_legend = ax.get_legend()
            if axis_legend is not None:
                axis_legend.remove()

            ax.set_title(f"Model: {model}")
            ax.set_ylabel("Average Accuracy")
            ax.set_xlabel("")

        # Add legend
        handles, labels = axes[0].get_legend_handles_labels()
        fig.legend(handles, labels, loc="lower center", ncol=len(methods))

        # Adjust layout
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])

        # Show inside the context so the figure is rendered with these settings.
        plt.show()


In [2]:
# Needs `results_df` from the loading cell. The NameError recorded below means
# that cell had not been run in this kernel when this cell was executed.
plot_accuracy_by_model(results_df)


NameError: name 'results_df' is not defined