# Comparing object-level completions against meta-level predictions
This notebook compares how well each model's meta-level predictions score against object-level (base) completions, both from the same model and from other models. It is most useful for checking finetuned models.

In [1]:
# Study folders to load; each one is resolved relative to EXP_DIR when the dataframes are loaded.
STUDY_FOLDERS = ["june_3_half_heldout_sweep"]  # 🔵 within exp/

# Filter handed to `load_dfs_with_filter`. Keys are paths into the (nested) run config,
# values are lists of accepted values.
# NOTE(review): presumably a run is kept only if its config value is in the list — confirm in loading_data.py.
CONDITIONS = {
    # see `analysis/loading_data.py` for details
    ("task", "set"): ["val"],
    # ("language_model","model"): ["gpt-3.5-turbo-1106",]
    ("task", "name"): [
        "daily_dialog",
        "english_words",
        "wikipedia",
        "dear_abbie",
        "self_referential",
    ],
}

In [2]:
from pathlib import Path
import subprocess
import sys
import random
import logging
import io
import contextlib
from IPython.display import clear_output

In [3]:
# set log level
logging.basicConfig(level=logging.WARNING)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.patches as patches
import seaborn as sns

In [5]:
from evals.analysis.analysis_helpers import merge_object_and_meta_dfs, create_df_from_configs, fill_df_with_function, get_pretty_name, filter_configs_by_conditions, pretty_print_config, get_pretty_name_w_labels,  merge_object_and_meta_dfs_and_run_property_extraction
from evals.analysis.loading_data import load_dfs_with_filter, load_base_df_from_config, get_hydra_config, load_single_df, get_data_path
from evals.load.lazy_object_level_llm_extraction import lazy_add_response_property_to_object_level
from evals.utils import get_maybe_nested_from_dict
from evals.analysis.analysis_functions import *
from evals.analysis.analysis_helpers import bootstrap_ci

ModuleNotFoundError: No module named 'evals'

In [None]:
# Show up to 200 characters per cell before pandas truncates the content
pd.set_option('display.max_colwidth', 200)
# show all columns
pd.set_option('display.max_columns', None)

In [None]:
# set color palette: request 64 colors from seaborn's "Set1" palette and make it the default
palette = sns.color_palette("Set1", 64)
sns.set_palette(palette)

In [None]:
# Use "Univers Next Pro" for plots if it is installed.
# If the font is installed but not picked up, you might need to clear the matplotlib font cache.
plt.rcParams["font.family"] = fm.get_font(fm.findfont("Univers Next Pro")).family_name # falls back to default automatically

# retina plots: render figures inline at high resolution
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# silence seaborn's FutureWarning noise so it does not clutter the plot cells
import warnings
# Ignore only FutureWarnings that originate from the seaborn module
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [6]:
from evals.locations import REPO_DIR, EXP_DIR

ModuleNotFoundError: No module named 'evals'

Load dataframes in

In [None]:
# load the dataframes with configs as keys
# `dfs` maps each run config to its dataframe; results from all study folders are merged into one dict.
dfs = {}
for STUDY_FOLDER in STUDY_FOLDERS:
    # NOTE(review): exclude_noncompliant=False presumably keeps non-compliant responses in the frames — confirm in loading_data.py
    _dfs = load_dfs_with_filter(EXP_DIR / STUDY_FOLDER, CONDITIONS, exclude_noncompliant=False)
    dfs.update(_dfs)
    print(f"Loaded {len(_dfs)} dataframes from {STUDY_FOLDER}")
# wipe the loader's output (including the per-folder lines above) and leave only the summary
clear_output()
print(f"Loaded {len(dfs)} dataframes in total")

In [None]:
def is_base_config(config):
    """Return True when the config's prompt method starts with "object" or "base"."""
    method = config["prompt"]["method"]
    return method.startswith(("object", "base"))

In [None]:
# Split the loaded runs in two: object-level (base) runs and everything else,
# which is treated as meta-level (self-prediction) runs.
object_dfs = {config: df for config, df in dfs.items() if is_base_config(config)}
meta_dfs = {config: df for config, df in dfs.items() if not is_base_config(config)}
print(f"Loaded {len(object_dfs)} base and {len(meta_dfs)} self-prediction dataframes")

In [None]:
print("We have the following datasets:")
# distinct task names across all object-level configs
datasets = {get_maybe_nested_from_dict(k, ('task', 'name')) for k in object_dfs}
print(datasets)

In [None]:
print("We have the following response properties:")
# distinct response-property names across all meta-level configs
response_properties = {get_maybe_nested_from_dict(k, ('response_property', 'name')) for k in meta_dfs}
print(response_properties)

## Plots

### Making labels

In [None]:
# every model id that occurs in either the object-level or the meta-level configs
{get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in object_dfs} | {get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in meta_dfs}

In [None]:
def get_label(config):
    """Return a human-readable label for a model id or a run config.

    Lookup order:
      1. If `config` is itself a key of `MODEL_LABELS` (i.e. a bare model id), return that label.
      2. Otherwise treat `config` as a run config (or the string form of one) and build
         "<model label>[\\n predicting <response property>][\\n<note>]".
      3. If anything goes wrong while building the label, print the problem and fall back
         to `str(config)`.

    Args:
        config: a model id string, a run config, or the string representation of a run config.

    Returns:
        The label as a string.
    """
    try: # if we just pass the model name in, we can skip the rest
        return MODEL_LABELS[config]
    except (KeyError, TypeError):
        # KeyError: not a known model id.
        # TypeError: `config` is unhashable (e.g. a plain dict) and cannot be used as a key;
        # it may still be a valid config, so fall through to the slow path.
        pass
    try:
        label = ""
        if isinstance(config, str):
            # NOTE(review): eval() executes arbitrary code. It is only acceptable here because the
            # strings are produced by this notebook via str(config); never pass untrusted strings.
            config = eval(config)
        model = get_maybe_nested_from_dict(config, ('language_model', 'model'))
        if model in MODEL_LABELS:
            model = MODEL_LABELS[model]
        label += model
        response_property = get_maybe_nested_from_dict(config, ('response_property', 'name'))
        if response_property not in ["None", None]:
            label += f"\n predicting {response_property}"
        note = get_maybe_nested_from_dict(config, 'note')
        if note not in ["None", None]:
            label += f"\n{note}"
    except Exception as e:
        # best effort: a missing label should never break a plot
        print(f"Failed to get label for {config}: {e}")
        label = str(config)
    return label

In [None]:
# Maps model ids (as they appear in the run configs) to human-readable labels used in tables and plots.
# Several finetuned ids intentionally share one label (e.g. reruns of the same setup in different sweeps),
# so labels are NOT guaranteed to be unique across this dict.
MODEL_LABELS = {
    "gpt-3.5-turbo-1106": "GPT3.5 (1106)",
    "gpt-3.5-turbo-0125": "GPT3.5 (0125)",
    "gpt-4-0613": "GPT4",
    "gpt-4-0125-preview": "GPT4 preview",
    "claude-3-sonnet-20240229": "Claude 3 Sonnet",
    "claude-3-opus-20240229": "Claude 3 Opus",
    "gemini-1.0-pro-002": "Gemini 1.0 Pro",
    # some finetune
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnum:8x4lehAb": "GPT3.5 fted on GPT3.5" ,
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnumscram:8x6QzXiQ": "GPT3.5 fted on GPT3.5\n(scrambled)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on4onnum:8xMcmGZM": "GPT3.5 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:4on4onnum:8x8dNwL1": "GPT4 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:4on35onnum:8xq9fNVt": "GPT4 fted on GPT3.5",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnums:8zFjiOFt": "GPT3.5 fted on GPT3.5 (small dataset)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on4onnums:8zHmk4o8": "GPT3.5 fted on GPT4 (small dataset)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35nwvrp:8zJsJdOE": "GPT3.5 fted on GPT3.5\n(various response properties)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:97WTZlBs": "GPT3.5 fted on GPT3.5",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9EXL6W9A": "GPT3.5 fted on GPT3.5", # from training_on_everything_apr_15
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9ErgUPF1": "GPT3.5 fted on Claude 3 Sonnet", # from training_on_everything_apr_15
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9GYUm36T": "GPT3.5 fted on GPT3.5", # from training_on_everything_apr_15_reproduction
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9GYUIKU9": "GPT3.5 fted on Claude 3 Sonnet", # from training_on_everything_apr_15_reproduction
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9OwJgWbn": "GPT3.5 fted on GPT3.5",  # from everything_response_properties_only
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9OwInlu2": "GPT3.5 fted on Claude 3 Sonnet", # from everything_response_properties_only
    # learning rate/ batch size sweeps
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs1:9OxIyl5Y": "lr 02 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs1:9Oy5MhWO": "lr 05 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs32:9Owy4q3J": "lr 01 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs1:9OwZGDEY": "lr 01 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs5:9OwpdrcW": "lr 01 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs10:9OyScIxy": "lr 05 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs3:9OyFzfs4": "lr 05 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs10:9OwuNmul": "lr 01 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs32:9OyWNifb": "lr 05 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs3:9OxSxaCD": "lr 02 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs1:9Oys0czF": "lr 10 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs5:9OyNWs1I": "lr 05 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs10:9OzCDMh6": "lr 10 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs5:9Oztue51": "lr 15 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs3:9OwjMTpq": "lr 01 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs32:9P051Qau": "lr 15 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs32:9OxjoVCT": "lr 02 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs3:9OznpdVn": "lr 15 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs5:9OxbS5Di": "lr 02 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs5:9Oz5osDG": "lr 10 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs32:9OzGe8O8": "lr 10 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs1:9OzdOc1K": "lr 15 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs3:9OyzbPKI": "lr 10 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs10:9P00MlIM": "lr 15 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs10:9OxgItcg": "lr 02 bs 10",
    # may_19_sweep
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9Qqx88fF": "GPT3.5 fted on Claude 3 Sonnet", 
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9Qqh5SOc": "GPT3.5 fted on GPT3.5", 
    # may20_thrifty_sweep
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9R9L0Ddt": "GPT3.5 (1106) fted on Claude 3 Sonnet",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9R9Lqsm2": "GPT3.5 (1106) fted on GPT3.5 (1106)",
    "ft:gpt-4-0613:dcevals-kokotajlo:sweep:9RSQ9BDP": "GPT4 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:sweep:9RSQHCmp": "GPT4 fted on Claude 3 Sonnet",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9RSPteWA": "GPT3.5 (1106) fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:sweep:9RSPjTJF": "GPT4 fted on GPT3.5 (1106)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2:9RW1QKsf": "GPT3.5 (1106) fted on GPT3.5 (1106)\n(LR=2)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9Th6cCBF": "GPT3.5 (1106) fted on GPT3.5 (0125)",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9ThUFr7R": "GPT3.5 (0125) fted on GPT4",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9ThBY0oK": "GPT3.5 (1106) fted on GPT4 (preview)",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Th7D4TK": "GPT3.5 (0125) fted on GPT3.5 (0125)",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9ThVmSp2": "GPT3.5 (0125) fted on GPT3.5 (1106)",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Th9i5Mf": "GPT3.5 (0125) fted on Claude 3 Sonnet",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:scramble:9TfFZ0nD": "GPT3.5 (1106) fted on Claude 3 Sonnet (scrambled)",
    # may 20 baselines
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:baliemay20:9WBLv2YM": "GPT3.5 (1106) baseline",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:baliemay20:9WAurjLN": "GPT3.5 (0125) baseline",
    "ft:gpt-4-0613:dcevals-kokotajlo:baliemay20:9WBwUkGa": "GPT4 baseline",
    # june_3_half_heldout_sweep
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9WBVcb4d": "GPT3.5 fted on GPT3.5",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9WBVloSH": "GPT3.5 fted on Claude 3 Sonnet",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Yksmtn8": "GPT3.5 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:sweep:9YkwJzcL": "GPT4 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:sweep:9YkvvExB": "GPT4 fted on Claude 3 Sonnet",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9YLCVMGp": "GPT3.5 fted on Gemini 1.0 Pro",
    "ft:gpt-4-0613:dcevals-kokotajlo:sweep:9YkwExr8": "GPT4 fted on GPT3.5",
}

In [None]:
# Fail fast if any model that occurs in the loaded configs has no entry in MODEL_LABELS.
models_wo_labels = [
    l
    for l in (
        {get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in object_dfs.keys()}
        | {get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in meta_dfs.keys()}
    )
    if l not in MODEL_LABELS
]
if models_wo_labels:
    print("Models without labels:")
else:
    print("All models have labels")
for m in models_wo_labels:
    print(m)
if models_wo_labels:
    # stop the run here so that unlabeled models are noticed before any plot is made
    raise SystemExit("Please add labels for all models")

In [None]:
# Build the family tree of models: {parent model id: [ids of models finetuned from it]}.
MODEL_GENEALOGY = {}
# A finetuned model id embeds the id of the model it was finetuned from, so a model is a
# child whenever some *other* known id occurs as a substring of its own id.
model_ids = MODEL_LABELS.keys()
for model_id in model_ids:
    _parent_candidates = [other_id for other_id in model_ids if other_id in model_id and other_id != model_id]
    if _parent_candidates:
        # this is a model that has a parent; the first matching id is taken as the parent
        parent_id = _parent_candidates[0]
        MODEL_GENEALOGY.setdefault(parent_id, []).append(model_id)
    else:
        # not a finetuned model: register it as a root with no children (yet)
        MODEL_GENEALOGY.setdefault(model_id, [])

# Same tree keyed by display labels; children become a set because several ids can share a label.
MODEL_GENEALOGY_LABELS = {get_label(k): {get_label(v) for v in vs} for k, vs in MODEL_GENEALOGY.items()}

### Helper functions

In [None]:
def construct_mode_object_df(df: pd.DataFrame, response_property: str):
    """Return a copy of an object-level df in which every row carries the mode response.

    The most common (cleaned) value of `response_property` among compliant rows is determined,
    one row holding that value is picked, and all of its columns except "string" and
    "compliance" are broadcast over every row of the copy. Scoring against the result shows how
    well a model would do if it always predicted the most common response, i.e. if finetuning
    had only taught it the answer distribution without any connection to the inputs.

    Args:
        df: object-level dataframe; must contain "string", "compliance" and `response_property`.
        response_property: name of the column holding the response property.

    Returns:
        A new dataframe of the same length; the input is left untouched.

    Raises:
        ValueError: if no compliant row has a usable value for `response_property`.
    """
    # ensure that we're not changing the input df in-place
    df = df.copy()
    # clean once and reuse for both the mode computation and the row lookup
    cleaned = df[response_property].apply(clean_string)
    # get most common response property among compliant rows
    compliant_modes = cleaned[df['compliance'] == True].mode()
    if compliant_modes.empty:
        raise ValueError(f"No compliant rows with a value for {response_property!r}; cannot compute the mode")
    mode = compliant_modes.iloc[0] # if multiple most common answers, chooses one
    # explicit copy so that the assignment below does not write into a view of `df`
    mode_row = df[cleaned == mode].head(1).copy()
    # ensure that the mode row has the cleaned string
    mode_row[response_property] = mode
    # keep each row's own input string and compliance flag
    mode_row = mode_row.drop(columns=["string", "compliance"])
    # replace the rest of every row with mode_row
    for column in mode_row.columns:
        # a list is used (rather than scalar broadcast) so list-valued cells are copied as-is
        df[column] = [mode_row[column].item()] * len(df)
    return df
    

In [None]:
# Number of bootstrap iterations passed to `bootstrap_ci` when computing confidence intervals.
# NOTE(review): 10 is very few resamples for a 95% interval — presumably kept low for speed;
# confirm and raise before reporting results.
BOOTSTRAP_ITERATIONS = 10

def make_pairwise_tables(measure, object_dfs, meta_dfs):
    """Score every meta-level run against every object-level run.

    For each (object config, meta config) pair the two dataframes are merged and `measure` is
    applied to the joint dataframe. Alongside the raw score, two companion tables are filled:
    the score obtained by always predicting the object-level mode, and `bootstrap_ci` of the
    measure on the joint dataframe.

    Args:
        measure: callable taking a joint dataframe and returning the score.
        object_dfs: dict mapping object-level configs to their dataframes.
        meta_dfs: dict mapping meta-level configs to their dataframes.

    Returns:
        (results, baseline_results, bootstrapped_results): three dataframes with meta-level
        labels as index and object-level labels as columns. Rows and columns without any
        result are removed and the remaining ones are sorted by label; all three tables share
        the same index and columns.
    """
    object_keys = [str(config) for config in object_dfs.keys()]
    meta_keys = [str(config) for config in meta_dfs.keys()]
    results = pd.DataFrame(columns=object_keys, index=meta_keys)
    # the score the meta-level model would get by always predicting the object-level mode
    baseline_results = pd.DataFrame(columns=object_keys, index=meta_keys)
    bootstrapped_results = pd.DataFrame(columns=object_keys, index=meta_keys)

    for object_config, object_df in object_dfs.items():
        for meta_config, meta_df in meta_dfs.items():
            # compute joint df
            joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                object_df,
                meta_df,
                object_config,
                meta_config,
            )
            if len(joint_df) == 0:
                print(f"Empty dataframe for {object_config} and {meta_config}")
                continue
            row_key, col_key = str(meta_config), str(object_config)
            results.loc[row_key, col_key] = measure(joint_df)

            # what would we see under the baseline of always picking the object-level mode?
            response_property = meta_config['response_property']['name']
            # in some cases, we might not have the response property in the object_df. In this case,
            # we need to add it. The augmented frame is kept for the remaining meta configs of this
            # object config, so each property is extracted at most once.
            if response_property not in object_df.columns:
                object_df = lazy_add_response_property_to_object_level(object_df, object_config, response_property)

            # modify the object-level df to always contain the mode
            mode_object_df = construct_mode_object_df(object_df, response_property)
            # compute joint df of the real object-level data against the constant mode prediction
            mode_joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                object_df,
                mode_object_df,
                object_config,
                meta_config,
            )
            if len(mode_joint_df) == 0:
                # leave the baseline cell empty rather than scoring an empty frame
                print(f"Empty mode-baseline dataframe for {object_config} and {meta_config}")
            else:
                baseline_results.loc[row_key, col_key] = measure(mode_joint_df)

            # we want to compute the 95%CI of the measure. We do this by bootstrapping over resampling the joint_df
            bootstrapped_results.loc[row_key, col_key] = bootstrap_ci(joint_df, measure, BOOTSTRAP_ITERATIONS)

    # do we have columns that are all NaN? This happens eg. when we are reading in task.set==train
    # dataframes, and only compare against val. The masks are positional, so two configs that map
    # to the same label cannot cause each other to be dropped.
    keep_rows = ~results.isna().all(axis=1).to_numpy()
    keep_cols = ~results.isna().all(axis=0).to_numpy()

    def _finalize(table):
        # relabel with human readable names, drop the empty rows/columns, sort both axes
        table = table.copy()
        table.index = table.index.map(get_label)
        table.columns = table.columns.map(get_label)
        table = table.loc[keep_rows, keep_cols]
        return table.sort_index(axis=0).sort_index(axis=1)

    results = _finalize(results)
    baseline_results = _finalize(baseline_results)
    bootstrapped_results = _finalize(bootstrapped_results)

    assert results.shape == baseline_results.shape == bootstrapped_results.shape
    assert results.columns.equals(baseline_results.columns) and results.index.equals(baseline_results.index)
    assert results.columns.equals(bootstrapped_results.columns) and results.index.equals(bootstrapped_results.index)
    return results, baseline_results, bootstrapped_results

In [None]:
def filter_by_dataset(dfs, dataset):
    """Return the subset of `dfs` whose config has task name equal to `dataset`."""
    selected = {}
    for config, df in dfs.items():
        if get_maybe_nested_from_dict(config, ('task', 'name')) == dataset:
            selected[config] = df
    return selected

def filter_by_dataset_and_response_property(dfs, dataset, response_property):
    """Return the subset of `dfs` whose config matches both the task name and the response property name."""
    selected = {}
    for config, df in dfs.items():
        if get_maybe_nested_from_dict(config, ('task', 'name')) != dataset:
            continue
        if get_maybe_nested_from_dict(config, ('response_property', 'name')) != response_property:
            continue
        selected[config] = df
    return selected

Do we want to see debugging output in the plots?

In [None]:
suppress_output = True

### Aggregate measure
Across all tasks, how do the models compare?

In [None]:
# we need to make groups of all models that belong together
def _group_dfs_by_model(dfs):
    """Group `{config: df}` by model id into `{model_id: [{config: df}, ...]}`.

    Single pass over `dfs`; groups appear in the order their model is first seen and the
    entries inside a group keep the order of `dfs`, so the result is deterministic.
    """
    groups = {}
    for cfg, df in dfs.items():
        groups.setdefault(cfg['language_model']['model'], []).append({cfg: df})
    return groups

object_dfs_groups = _group_dfs_by_model(object_dfs)
meta_dfs_groups = _group_dfs_by_model(meta_dfs)

In [None]:
def make_pairwise_table_across_everything(measure, object_dfs, meta_dfs):
    """Score every meta-level model against every object-level model, pooled over all tasks.

    For each (object model, meta model) pair, the joint dataframes of all matching runs (same
    task name) are concatenated and `measure` is applied to the pooled frame.

    NOTE(review): the pairs are taken from the module-level `object_dfs_groups` and
    `meta_dfs_groups`; the `object_dfs` / `meta_dfs` arguments are accepted for interface
    compatibility but are not read. Confirm whether the groups should be derived from the
    arguments instead.

    Args:
        measure: callable taking a joint dataframe and returning the score.
        object_dfs: unused, see note above.
        meta_dfs: unused, see note above.

    Returns:
        (results, mode_baseline, bootstrapped_results, count_results, all_joint_df). The four
        tables have meta-level labels as index and object-level labels as columns, with empty
        rows/columns removed and both axes sorted. `all_joint_df` is the pooled joint dataframe
        of the LAST (object model, meta model) pair that was processed, not of all pairs.
    """
    object_groups = list(object_dfs_groups.keys())
    meta_groups = list(meta_dfs_groups.keys())
    results = pd.DataFrame(columns=object_groups, index=meta_groups)
    mode_baseline = pd.DataFrame(columns=object_groups, index=meta_groups)
    bootstrapped_results = pd.DataFrame(columns=object_groups, index=meta_groups)
    count_results = pd.DataFrame(columns=object_groups, index=meta_groups) # how many datapoints do we have for each comparison

    all_joint_df = pd.DataFrame() # keeps the return value defined even if there are no groups at all
    for object_group, object_group_dfs in object_dfs_groups.items():
        for meta_group, meta_group_dfs in meta_dfs_groups.items():
            # collect the per-run joint frames and concatenate once per pair
            joint_frames = []
            for object_df_by_config in object_group_dfs:
                for object_config, object_df in object_df_by_config.items():
                    for meta_df_by_config in meta_group_dfs:
                        for meta_config, meta_df in filter_by_dataset(meta_df_by_config, object_config["task"]["name"]).items():
                            # compute joint df
                            joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                                object_df,
                                meta_df,
                                object_config,
                                meta_config,
                            )
                            response_property = meta_config['response_property']['name']
                            # if we don't have the response property here, add it
                            object_df = lazy_add_response_property_to_object_level(object_df, object_config, response_property)
                            if len(joint_df) == 0:
                                # nothing to score, so skip the mode baseline computation as well
                                print(f"Empty dataframe for {object_config} and {meta_config}")
                                continue
                            # compute mode baseline
                            mode_joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                                object_df,
                                construct_mode_object_df(object_df, response_property), # what if we always predicted the mode?
                                object_config,
                                meta_config,
                            )
                            # every row of this run carries the run's baseline score, so that the
                            # pooled baseline below is weighted by the number of rows per run
                            joint_df['mode_baseline'] = measure(mode_joint_df)
                            joint_frames.append(joint_df)
            all_joint_df = pd.concat(joint_frames) if joint_frames else pd.DataFrame()
            if len(all_joint_df) == 0:
                print(f"Empty dataframe for {object_group} and {meta_group}")
                continue
            results.loc[meta_group, object_group] = measure(all_joint_df)
            # mean baseline over all rows with a compliant meta-level response
            mode_baseline.loc[meta_group, object_group] = all_joint_df[all_joint_df['compliance_meta'] == True]['mode_baseline'].mean()
            bootstrapped_results.loc[meta_group, object_group] = bootstrap_ci(all_joint_df, measure, BOOTSTRAP_ITERATIONS)
            count_results.loc[meta_group, object_group] = len(all_joint_df[['extracted_property_meta','extracted_property_object']].dropna())

    # do we have columns that are all NaN? This happens eg. when we are reading in task.set==train
    # dataframes, and only compare against val. The masks are positional, so two model ids that
    # share a display label cannot cause each other to be dropped.
    keep_rows = ~results.isna().all(axis=1).to_numpy()
    keep_cols = ~results.isna().all(axis=0).to_numpy()

    def _finalize(table):
        # add human readable labels, drop the empty rows/columns, sort both axes
        table = table.copy()
        table.index = table.index.map(get_label)
        table.columns = table.columns.map(get_label)
        table = table.loc[keep_rows, keep_cols]
        return table.sort_index(axis=0).sort_index(axis=1)

    results = _finalize(results)
    bootstrapped_results = _finalize(bootstrapped_results)
    count_results = _finalize(count_results)
    mode_baseline = _finalize(mode_baseline)

    assert results.shape == bootstrapped_results.shape
    assert results.columns.equals(bootstrapped_results.columns) and results.index.equals(bootstrapped_results.index)
    assert results.shape == count_results.shape
    assert results.columns.equals(count_results.columns) and results.index.equals(count_results.index)
    assert results.shape == mode_baseline.shape
    assert results.columns.equals(mode_baseline.columns) and results.index.equals(mode_baseline.index)
    return results, mode_baseline, bootstrapped_results, count_results, all_joint_df

In [None]:
agg_results, mode_baseline_agg_results, agg_results_bootstrapped, agg_count_results, joint_df = make_pairwise_table_across_everything(calc_accuracy_with_excluded, object_dfs, meta_dfs)

In [None]:
# save the joint_df
# NOTE(review): `make_pairwise_table_across_everything` rebuilds its joint dataframe for every
# (object model, meta model) pair and returns only the one from the final pair, so this file
# does not contain every comparison — confirm that this is intended.
joint_df.to_csv(EXP_DIR / STUDY_FOLDERS[0] / "all_joint_df.csv")

In [None]:
agg_results

In [None]:
mode_baseline_agg_results

In [None]:
agg_results_bootstrapped

In [None]:
agg_count_results

In [None]:
# Heatmap of the aggregated accuracy: rows are meta-level models, columns are object-level models
fig, ax = plt.subplots(figsize=(agg_results.shape[1] * 1, agg_results.shape[0] * 1))
sns.heatmap(agg_results.astype(float), annot=True, fmt=".2f", cmap="YlGnBu", vmin=0, vmax=1, ax=ax, cbar=False)

# Add bootstrapped 95% CI
# seaborn writes no annotation for NaN cells, so the n-th text is not necessarily the n-th cell.
# Each annotation sits at the centre of its cell, (col + 0.5, row + 0.5), so the cell is
# recovered from the text position instead of from the enumeration order.
for text in ax.texts:
    x, y = text.get_position()
    row, col = int(y), int(x)
    bootstrapped_result = agg_results_bootstrapped.iloc[row, col]
    try:
        text.set_text(f"{text.get_text()}\n({bootstrapped_result[0]:.2f}–{bootstrapped_result[1]:.2f})")
    except TypeError:
        # not a (low, high) pair, e.g. a plain number: show it as is
        text.set_text(f"{text.get_text()}\n({bootstrapped_result:.2f})")

ax.set_xlabel("Object-level model")
ax.set_ylabel("Meta-level model")

# Add text explaining the numbers in parentheses
ax.text(
    -0.15,
    -0.0,
    "(95% bootstrapped CI\nin parentheses)",
    ha="center",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)


plt.title("Aggregated Accuracy over all tasks and response properties")

plt.show()

In [None]:
# baseline corrected version: accuracy minus the accuracy of always predicting the mode
mode_baseline_corrected_agg_results = agg_results - mode_baseline_agg_results
# NOTE(review): this relies on the cells of agg_results_bootstrapped supporting `- float`
# (e.g. numpy arrays); plain tuples would raise a TypeError here — confirm what bootstrap_ci returns.
mode_baseline_corrected_bootstrapped_results = agg_results_bootstrapped - mode_baseline_agg_results
min_result = mode_baseline_corrected_agg_results.min().min()
max_result = mode_baseline_corrected_agg_results.max().max()
max_range = max(abs(min_result), abs(max_result))
fig, ax = plt.subplots(figsize=(mode_baseline_corrected_agg_results.shape[1] * 1, mode_baseline_corrected_agg_results.shape[0] * 1))
sns.heatmap(mode_baseline_corrected_agg_results.astype(float), annot=True, fmt=".2f", cmap="YlGnBu", ax=ax, cbar=False, vmin=-max_range, vmax=max_range) # centering the color scale around 0

# Add bootstrapped 95% CI
# seaborn writes no annotation for NaN cells, so the n-th text is not necessarily the n-th cell.
# Each annotation sits at the centre of its cell, (col + 0.5, row + 0.5), so the cell is
# recovered from the text position instead of from the enumeration order.
for text in ax.texts:
    x, y = text.get_position()
    row, col = int(y), int(x)
    bootstrapped_result = mode_baseline_corrected_bootstrapped_results.iloc[row, col]
    try:
        text.set_text(f"{text.get_text()}\n({bootstrapped_result[0]:.2f}–{bootstrapped_result[1]:.2f})")
    except TypeError:
        # not a (low, high) pair, e.g. a plain number: show it as is
        text.set_text(f"{text.get_text()}\n({bootstrapped_result:.2f})")

ax.set_xlabel("Object-level model")
ax.set_ylabel("Meta-level model")

# Add text explaining the numbers in parentheses
ax.text(
    -0.15,
    -0.0,
    "(95% bootstrapped CI\nin parentheses)",
    ha="center",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)


plt.title("Aggregated Accuracy over all tasks and response properties (baseline corrected)")

plt.show()

### Bar Plots

In [None]:
def aggregate_agg_results_per_model_group(results=None, baselines=None, bootstrapped=None):
    """Collect accuracy, baseline and bootstrapped CI per meta-level model.

    Args:
        results: table of accuracies indexed by [meta model, object model].
            Defaults to the notebook-level ``agg_results``.
        baselines: table of mode baselines with the same layout.
            Defaults to the notebook-level ``mode_baseline_agg_results``.
        bootstrapped: table of bootstrapped results with the same layout.
            Defaults to the notebook-level ``agg_results_bootstrapped``.

    Returns:
        dict mapping each model name found in ``results.columns`` to a group:
        - base models (no "fted on" in the name): ``{"A": ...}`` — the model
          scored against itself;
        - models named "A fted on B": ``{"A": ..., "B": ..., "A_fton_B": ...}``
          — the finetuned model scored against A, against B and against itself.
        Every entry is a dict with keys "acc", "baseline" and "bootstrapped".
        Models for which a required row/column is missing are reported with
        ``print`` and left out of the result.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    results = agg_results if results is None else results
    baselines = mode_baseline_agg_results if baselines is None else baselines
    bootstrapped = agg_results_bootstrapped if bootstrapped is None else bootstrapped

    def _cell(meta, obj):
        # one (meta-level row, object-level column) lookup across all three tables
        return {
            "acc": results.loc[meta, obj],
            "baseline": baselines.loc[meta, obj],
            "bootstrapped": bootstrapped.loc[meta, obj],
        }

    model_groups = {}

    for meta_model in results.columns:
        # we want the following structure: acc(A_fton_B, A), acc(A_fton_B, A_fton_B)
        # for each, we have acc, baseline, bootstrapped
        try:
            if "fted on" not in meta_model:
                # we have a base model
                model_groups[meta_model] = {"A": _cell(meta_model, meta_model)}
            else:
                # we have a finetuned model of the form "A fted on B"
                name_parts = meta_model.split(" fted on ")
                source_model = name_parts[0].strip()
                target_model = name_parts[1].strip()
                model_groups[meta_model] = {
                    "A": _cell(meta_model, source_model),  # how well does the meta_model predict the source_model
                    "B": _cell(meta_model, target_model),  # how well does the meta_model predict the target_model
                    "A_fton_B": _cell(meta_model, meta_model),  # how does the meta_model predict itself
                }
        except KeyError as e:
            # a model can appear as a column without a matching row (or vice versa)
            print(f"Failed to aggregate meta model {meta_model}: {e}")
    return model_groups

In [None]:
# Per-model accuracy / baseline / bootstrapped-CI groups consumed by the bar plots below
model_groups = aggregate_agg_results_per_model_group()

#### `acc(A_fton_A, A_fton_A) vs acc(A_fton_A, A)`
![alt text](http://raw.felixbinder.net/IMG_2910.jpg)

In [None]:
# keep only the base models (A) and the self-finetuned models (A fted on A)
plot_1_model_groups = {}

for model, group in model_groups.items():
    if "fted" in model:
        model_A, model_B = model.split(" fted on ")[0], model.split(" fted on ")[1]
        if model_A != model_B:
            # finetuned on a different model: not part of this plot
            continue
    plot_1_model_groups[model] = group

# plot_1_model_groups

In [None]:
# Bar plot over plot_1_model_groups: one bar per base model, and a pair of bars
# per finetuned model (scored against its training target and against itself).
fig, ax = plt.subplots(figsize=(6, 4))

# Define some colors for each category
colors = {'A': 'indianred', 'B': 'plum', 'A_fton_B': 'indianred'}

# Width of each bar
bar_width = 0.2

# Separate entries into two categories
a_only_entries = {k: v for k, v in plot_1_model_groups.items() if "fted on" not in k}
other_entries = {k: v for k, v in plot_1_model_groups.items() if "fted on" in k}

# Plot A-only entries first (no legend label is attached to these bars)
a_only_positions = np.arange(len(a_only_entries)) * (bar_width + 0.1)  # Narrower bars with some space
for pos_base, r in zip(a_only_positions, a_only_entries.values()):
    entry = r["A"]
    # distances from the bar top to the lower/upper CI bound, shaped (2, 1) for yerr
    yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
    ax.bar(pos_base, entry["acc"], yerr=yerr, capsize=5, width=bar_width, color=colors['A'])
    ax.hlines(entry["baseline"], pos_base - bar_width / 2, pos_base + bar_width / 2, linestyles='dotted', colors='black')

# Plot other entries
# Start right of the last base-model bar; with no base models start at 0
# (max() of an empty array would raise).
first_other_position = a_only_positions.max() + bar_width + 0.1 if len(a_only_positions) else 0.0
other_positions = first_other_position + np.arange(len(other_entries)) * (2 * bar_width + 0.1)
for i, r in enumerate(other_entries.values()):
    pos_base = other_positions[i]
    # (x position, entry, color, legend label); the groups here are "A fted on A",
    # so r["A"] and r["B"] refer to the same table cell
    bar_specs = [
        (pos_base, r["A"], colors['B'], "Training Target"),
        (pos_base + bar_width, r["A_fton_B"], colors['A_fton_B'], "Itself"),
    ]
    for x, entry, color, label in bar_specs:
        yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
        # only the first group carries the legend labels, so each appears once
        ax.bar(x, entry["acc"], yerr=yerr, capsize=5, width=bar_width, color=color, label=label if i == 0 else "")
        ax.hlines(entry["baseline"], x - bar_width / 2, x + bar_width / 2, linestyles='dotted', colors='black')

# Set title and labels
ax.set_xlabel("Meta-level model")
ax.set_ylabel("Accuracy")

# Set x-ticks and labels
tick_positions = np.concatenate((a_only_positions, other_positions + bar_width / 2))  # Center the ticks between the groups
ax.set_xticks(tick_positions)
ax.set_xticklabels(list(a_only_entries.keys()) + list(other_entries.keys()), rotation=45, ha="right")

# Set y-axis limits
ax.set_ylim(0)

# Add legend
ax.legend(title="Meta-level model predicts")

# add light grey baseline explainer
ax.text(
    -.05,
    -0.4,
    " ······Baseline of predicting the mode\n(bootstrapped 95% CI)",
    ha="left",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)

# Show plot
plt.tight_layout()
plt.show()


In [None]:
# version without the bars: same layout as the bar plot, but bars and error
# bars are drawn fully transparent so only the dotted baselines are visible
# Set up the figure and axis
fig, ax = plt.subplots(figsize=(6, 4))

# Define some colors for each category
colors = {'A': 'indianred', 'B': 'plum', 'A_fton_B': 'indianred'}

# Width of each bar
bar_width = 0.2

# Separate entries into two categories
a_only_entries = {k: v for k, v in plot_1_model_groups.items() if "fted on" not in k}
other_entries = {k: v for k, v in plot_1_model_groups.items() if "fted on" in k}

# keyword arguments that hide both the bar and its error bar
hidden = dict(alpha=0, error_kw=dict(alpha=0))

# Plot A-only entries first (no legend label is attached to these bars)
a_only_positions = np.arange(len(a_only_entries)) * (bar_width + 0.1)  # Narrower bars with some space
for pos_base, r in zip(a_only_positions, a_only_entries.values()):
    entry = r["A"]
    # distances from the bar top to the lower/upper CI bound, shaped (2, 1) for yerr
    yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
    ax.bar(pos_base, entry["acc"], yerr=yerr, capsize=5, width=bar_width, color=colors['A'], **hidden)
    ax.hlines(entry["baseline"], pos_base - bar_width / 2, pos_base + bar_width / 2, linestyles='dotted', colors='black')

# Plot other entries
# Start right of the last base-model bar; with no base models start at 0
# (max() of an empty array would raise).
first_other_position = a_only_positions.max() + bar_width + 0.1 if len(a_only_positions) else 0.0
other_positions = first_other_position + np.arange(len(other_entries)) * (2 * bar_width + 0.1)
for i, r in enumerate(other_entries.values()):
    pos_base = other_positions[i]
    # (x position, entry, color, legend label)
    bar_specs = [
        (pos_base, r["B"], colors['B'], "Training Target"),
        (pos_base + bar_width, r["A_fton_B"], colors['A_fton_B'], "Itself"),
    ]
    for x, entry, color, label in bar_specs:
        yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
        # only the first group carries the legend labels, so each appears once
        ax.bar(x, entry["acc"], yerr=yerr, capsize=5, width=bar_width, color=color, label=label if i == 0 else "", **hidden)
        ax.hlines(entry["baseline"], x - bar_width / 2, x + bar_width / 2, linestyles='dotted', colors='black')

# Set title and labels
ax.set_xlabel("Meta-level model")
ax.set_ylabel("Accuracy")

# Set x-ticks and labels
tick_positions = np.concatenate((a_only_positions, other_positions + bar_width / 2))  # Center the ticks between the groups
ax.set_xticks(tick_positions)
ax.set_xticklabels(list(a_only_entries.keys()) + list(other_entries.keys()), rotation=45, ha="right")

# Set y-axis limits
ax.set_ylim(0)

# Add legend
ax.legend(title="Meta-level model predicts")

# add light grey baseline explainer
ax.text(
    -.05,
    -0.4,
    " ······Baseline of predicting the mode\n(bootstrapped 95% CI)",
    ha="left",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)

# Show plot
plt.tight_layout()
plt.show()


#### `acc(A_fton_A, A) vs acc(B_fton_A, A)`
![alt text](http://raw.felixbinder.net/IMG_2910.jpg)

In [None]:
# we need to make groups according to target model
target_models = set([k.split(" fted on ")[1] for k in model_groups.keys() if "fted on" in k])
ft_model_groups = {k:v for k, v in model_groups.items() if " fted on " in k}

# Iterate in sorted order: a set of strings has no stable iteration order across
# kernel restarts, which would shuffle the order of the groups in the plot.
plot_2_model_groups = {}
for target_model in sorted(target_models):
    plot_2_model_groups[target_model] = {k: v for k, v in ft_model_groups.items() if k.split(" fted on ")[1] == target_model}

In [None]:
# sort each subgroup so that the A_fton_A model always comes first
for target_model, group in plot_2_model_groups.items():
    self_finetuned_name = f"{target_model} fted on {target_model}"
    # nothing to move to the front if the group has no A_fton_A model
    if self_finetuned_name not in plot_2_model_groups[target_model]:
        print(f"Could not find A_fton_A model for {target_model}")
        continue
    # False sorts before True and sorted() is stable: the A_fton_A entry moves
    # to the front while the remaining entries keep their relative order
    reordered = sorted(group.items(), key=lambda item: item[0] != self_finetuned_name)
    plot_2_model_groups[target_model] = dict(reordered)

In [None]:
# Grouped bar plot: one group per training target; within a group, one bar per
# finetuned model showing how well it predicts that target (the "B" entry).
colors = {"A_fton_A": "plum", "B_fton_A": "plum"}
fill = {"A_fton_A": True, "B_fton_A": True}
hatching = {"A_fton_A": None, "B_fton_A": None}
alphas = {"A_fton_A": 1, "B_fton_A": 0.5}
bar_width = 0.2
added_labels = set()  # legend labels already attached to a bar
num_models_per = max(len(g) for g in plot_2_model_groups.values())
# leave one bar width of space between neighbouring groups
positions = np.arange(len(plot_2_model_groups)) * (num_models_per + 1) * bar_width

fig, ax = plt.subplots(figsize=(10, 4)) # used to be 7,4 
max_r = max([r["B"]["acc"] for group in plot_2_model_groups.values() for r in group.values()])

for i, (target_model, group) in enumerate(plot_2_model_groups.items()):
    for j, (ft_model, r) in enumerate(group.items()):
        pos_base = positions[i] + j * bar_width
        r = r["B"]
        # the self-finetuned model gets its own style and legend entry
        if ft_model == f"{target_model} fted on {target_model}":
            style_key, label = "A_fton_A", "Predicting itself"
        else:
            style_key, label = "B_fton_A", "Predicting other model"
        ax.bar(
            pos_base,
            r["acc"],
            # distances from the bar top to the lower/upper CI bound, shaped (2, 1)
            yerr=np.abs(np.array(r["bootstrapped"]).reshape(1, 2).T - r["acc"]),
            capsize=5,
            width=bar_width,
            color=colors[style_key],
            # an empty label keeps repeated bars out of the legend
            label=label if label not in added_labels else "",
            fill=fill[style_key],
            hatch=hatching[style_key],
            alpha=alphas[style_key],
        )
        added_labels.add(label)
        
        # Add model name label to each bar
        ax.text(
            pos_base,
            .05,
            ft_model.split(" fted on ")[0],
            ha="center",
            va="bottom",
            color="black",
            fontsize=12,
            rotation=90,
            alpha=0.33
        )
        
        ax.hlines(
            r["baseline"], pos_base - bar_width / 2, pos_base + bar_width / 2, linestyles="dotted", colors="black"
        )

    # Adjust the size and position of the rounded rectangles
    rect_height = max_r * 1.05
    rect_y = 0
    label_y = rect_y + rect_height + 0.01
    ax.text(
        positions[i] + (num_models_per - 1) * bar_width / 2,
        label_y,
        f"Trained on\n{target_model}",
        ha="center",
        va="bottom",
        color="black",
        fontsize=10,
        bbox=dict(facecolor="white", edgecolor="lightgrey", boxstyle="round,pad=0.2"),
    )
    # rounded frame around the group
    rect_patch = patches.FancyBboxPatch(
        (positions[i] - bar_width / 2, rect_y),
        num_models_per * bar_width,
        rect_height,
        boxstyle="round,pad=0.033",
        linewidth=1,
        edgecolor="lightgrey",
        facecolor="none",
    )
    ax.add_patch(rect_patch)

ax.set_xlabel("Meta-level model")
ax.set_ylabel("Accuracy")
ax.set_xticks([])
ax.set_ylim(0, max_r * 1.3)

# legend off to the side
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_xlim([-0.2, positions[-1] + num_models_per * bar_width])

# add light grey baseline explainer
ax.text(
    0,
    -0.15,
    " ······Baseline of predicting the mode\n(bootstrapped 95% CI)",
    ha="left",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)

plt.tight_layout()
plt.show()

#### first attempts

In [None]:
# Bar plot over all model groups: one bar per base model, and a pair of bars
# per finetuned model (scored against its training target and against itself).
fig, ax = plt.subplots(figsize=(12, 5))

# Define some colors for each category
colors = {'A': 'gold', 'B': 'plum', 'A_fton_B': 'indianred'}

# Width of each bar
bar_width = 0.2

# Separate entries into two categories
a_only_entries = {k: v for k, v in model_groups.items() if "fted on" not in k}
other_entries = {k: v for k, v in model_groups.items() if "fted on" in k}

# Plot A-only entries first
a_only_positions = np.arange(len(a_only_entries)) * (bar_width + 0.1)  # Narrower bars with some space
for i, r in enumerate(a_only_entries.values()):
    pos_base = a_only_positions[i]
    entry = r["A"]
    # distances from the bar top to the lower/upper CI bound, shaped (2, 1) for yerr
    yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
    # only the first bar carries the legend label, so it appears once
    ax.bar(pos_base, entry["acc"], yerr=yerr, capsize=5, width=bar_width, label="Untrained Model" if i == 0 else "", color=colors['A'])
    ax.hlines(entry["baseline"], pos_base - bar_width / 2, pos_base + bar_width / 2, linestyles='dotted', colors='black')

# Plot other entries
# Leave a gap after the last base-model bar; with no base models start at 0
# (max() of an empty array would raise).
first_other_position = a_only_positions.max() + 0.75 if len(a_only_positions) else 0.0
other_positions = first_other_position + np.arange(len(other_entries)) * (2 * bar_width + 0.1)
for i, r in enumerate(other_entries.values()):
    pos_base = other_positions[i]
    # (x position, entry, color, legend label)
    bar_specs = [
        (pos_base, r["B"], colors['B'], "Training Target"),
        (pos_base + bar_width, r["A_fton_B"], colors['A_fton_B'], "Finetuned Model"),
    ]
    for x, entry, color, label in bar_specs:
        yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
        ax.bar(x, entry["acc"], yerr=yerr, capsize=5, width=bar_width, color=color, label=label if i == 0 else "")
        ax.hlines(entry["baseline"], x - bar_width / 2, x + bar_width / 2, linestyles='dotted', colors='black')

# Set title and labels
ax.set_xlabel("Meta-level model")
ax.set_ylabel("Accuracy")

# Set x-ticks and labels
tick_positions = np.concatenate((a_only_positions, other_positions + bar_width / 2))  # Center the ticks between the groups
ax.set_xticks(tick_positions)
ax.set_xticklabels(list(a_only_entries.keys()) + list(other_entries.keys()), rotation=45, ha="right")

# Set y-axis limits
ax.set_ylim(0)

# Add legend
ax.legend(title="Comparing against")

# add light grey baseline explainer
ax.text(
    0,
    -0.4,
    " ······Baseline of predicting the mode\n(bootstrapped 95% CI)",
    ha="left",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)

# Show plot
plt.tight_layout()
plt.show()


In [None]:
# Same layout as the previous plot, but every bar shows accuracy minus the
# mode baseline, so the baseline becomes the horizontal line at y=0.
fig, ax = plt.subplots(figsize=(12, 5))

# Define some colors for each category
colors = {'A': 'gold', 'B': 'plum', 'A_fton_B': 'indianred'}

# Width of each bar
bar_width = 0.2

# Separate entries into two categories
a_only_entries = {k: v for k, v in model_groups.items() if "fted on" not in k}
other_entries = {k: v for k, v in model_groups.items() if "fted on" in k}

# heights of all plotted bars, collected for the y-axis limits
relative_accs = []

# Plot A-only entries first
a_only_positions = np.arange(len(a_only_entries)) * (bar_width + 0.1)  # Narrower bars with some space
for i, r in enumerate(a_only_entries.values()):
    pos_base = a_only_positions[i]
    entry = r["A"]
    height = entry["acc"] - entry["baseline"]
    # CI distances are measured from the accuracy itself; shifting the bar by
    # the baseline leaves them unchanged
    yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
    # only the first bar carries the legend label, so it appears once
    ax.bar(pos_base, height, yerr=yerr, capsize=5, width=bar_width, label="Untrained Model" if i == 0 else "", color=colors['A'])
    relative_accs.append(height)

# Plot other entries
# Leave a gap after the last base-model bar; with no base models start at 0
# (max() of an empty array would raise).
first_other_position = a_only_positions.max() + 0.75 if len(a_only_positions) else 0.0
other_positions = first_other_position + np.arange(len(other_entries)) * (2 * bar_width + 0.1)
for i, r in enumerate(other_entries.values()):
    pos_base = other_positions[i]
    # (x position, entry, color, legend label)
    bar_specs = [
        (pos_base, r["B"], colors['B'], "Training Target"),
        (pos_base + bar_width, r["A_fton_B"], colors['A_fton_B'], "Finetuned Model"),
    ]
    for x, entry, color, label in bar_specs:
        height = entry["acc"] - entry["baseline"]
        yerr = np.abs(np.array(entry['bootstrapped']).reshape(1, 2).T - entry["acc"])
        ax.bar(x, height, yerr=yerr, capsize=5, width=bar_width, color=color, label=label if i == 0 else "")
        relative_accs.append(height)

# Set title and labels
ax.set_xlabel("Meta-level model")
ax.set_ylabel("Accuracy Relative to Baseline")

# Set x-ticks and labels
tick_positions = np.concatenate((a_only_positions, other_positions + bar_width / 2))  # Center the ticks between the groups
ax.set_xticks(tick_positions)
ax.set_xticklabels(list(a_only_entries.keys()) + list(other_entries.keys()), rotation=45, ha="right")

# Add a horizontal line at y=0 to represent the baseline
ax.axhline(0, color='black', linewidth=1, linestyle='dotted')

# Set y-axis limits: 0.1 of padding around the smallest and largest bar
if relative_accs:
    ax.set_ylim(min(relative_accs) - 0.1, max(relative_accs) + 0.1)

# Add legend
ax.legend(title="Comparing against")

# add light grey baseline explainer
ax.text(
    0,
    -0.4,
    " ······Baseline of predicting the mode\n(bootstrapped 95% CI)",
    ha="left",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)

# Show plot
plt.tight_layout()
plt.show()


### Accuracy heatmap

In [None]:
# One accuracy heatmap (meta-level rows vs. object-level columns) per
# dataset / response-property combination that has data.
for dataset in datasets:
    for response_property in response_properties:
        # Create a buffer to capture output
        buffer = io.StringIO()
        
        # Redirect stdout to the buffer
        with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
            results, baseline_results, bootstrap_results = make_pairwise_tables(calc_accuracy_with_excluded, filter_by_dataset(object_dfs, dataset), filter_by_dataset_and_response_property(meta_dfs, dataset, response_property))
        
        if len(results) == 0 or results.shape[0] == 0:# or results.max().max() == 0.0:
            if not suppress_output: print(f"No data for {dataset} / {response_property}")
            continue
        
        fig, ax = plt.subplots(figsize=(results.shape[1] * 1, results.shape[0] * 1))
        sns.heatmap(results.astype(float), cmap="YlGnBu", cbar=False, vmin=0, vmax=1, annot=True, fmt=".2f", ax=ax)
        
        # Add bootstrapped 95% CI
        # seaborn draws no annotation for masked (NaN) cells, so the i-th text is
        # not necessarily the i-th cell. Each annotation sits at
        # (col + 0.5, row + 0.5), so the cell is recovered from the text position.
        for text in list(ax.texts):
            x, y = text.get_position()
            row, col = int(y), int(x)
            bootstrapped_result = bootstrap_results.iloc[row, col]
            text.set_text(f"{text.get_text()}\n({bootstrapped_result[0]:.1f}–{bootstrapped_result[1]:.1f})")
        
        # Check if all baseline results in each column are the same
        for col in range(baseline_results.shape[1]):
            if not (baseline_results.iloc[:, col] == baseline_results.iloc[0, col]).all():
                raise ValueError(f"Baseline results in column {col} are not consistent.")
        
        # Add baseline values at the top of each column in light grey font
        for col, baseline_value in enumerate(baseline_results.iloc[0]):
            ax.text(col + 0.5, -0.1, f"Baseline:\n{baseline_value:.2f}", ha='center', va='bottom', color='grey', fontsize=8)
        
        # Move the title up to make room for the baseline values
        ax.set_title(f"Accuracy of meta-level predicting object-level models\non {dataset} eliciting {response_property}", y=1.1)
        
        # Add text explaining the numbers in parentheses
        ax.text(-0.2, -0.0, "(95% bootstrapped CI\nin parentheses)", ha='center', va='center', transform=ax.transAxes, color="grey", fontsize=8)
        # ax.text(-0.2, -0.4, "<Mode–baseline\nin chevrons>", ha='center', va='center', transform=ax.transAxes, color="grey", fontsize=8)
        
        ax.set_xlabel("Scored against object-level")
        ax.set_ylabel("Meta-level")
        ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        
        # Display the plot
        plt.show()

### Accuracy heatmap with baseline adjustment
Same as above, but this time we measure performance above the baseline: how surprising is the result? 

This can be done in absolute terms (subtracting the baseline from the accuracy) or in relative terms (dividing the accuracy by the baseline).

In [None]:
# One baseline-adjusted heatmap per dataset / response-property combination:
# cells show accuracy minus the mode baseline, scaled by 100.
for dataset in datasets:
    for response_property in response_properties:
        # Create a buffer to capture output
        buffer = io.StringIO()
        
        # Redirect stdout to the buffer
        with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
            results, baseline_results, bootstrap_results = make_pairwise_tables(calc_accuracy_with_excluded, filter_by_dataset(object_dfs, dataset), filter_by_dataset_and_response_property(meta_dfs, dataset, response_property))
        
        if len(results) == 0 or results.shape[0] == 0:# or results.max().max() == 0.0:
            if not suppress_output: print(f"No data for {dataset} / {response_property}")
            continue

        # subtract the baseline from the results
        results = results - baseline_results
        # the same for the bootstrapped results
        bootstrap_results = bootstrap_results - baseline_results

        # turn into percentages
        results = results * 100
        bootstrap_results = bootstrap_results * 100

        # get range for color scale (symmetric, so that 0 sits in the middle)
        min_result = results.min().min()
        max_result = results.max().max()
        max_range = max(abs(min_result), abs(max_result))
        
        fig, ax = plt.subplots(figsize=(results.shape[1] * 1, results.shape[0] * 1))
        sns.heatmap(results.astype(float), cmap="YlGnBu", cbar=False, vmin=-max_range, vmax=max_range, annot=True, fmt=".1f", ax=ax)
        
        # Add bootstrapped 95% CI
        # seaborn draws no annotation for masked (NaN) cells, so the i-th text is
        # not necessarily the i-th cell. Each annotation sits at
        # (col + 0.5, row + 0.5), so the cell is recovered from the text position.
        for text in list(ax.texts):
            x, y = text.get_position()
            row, col = int(y), int(x)
            bootstrapped_result = bootstrap_results.iloc[row, col]
            text.set_text(f"{text.get_text()}\n({bootstrapped_result[0]:.0f}–{bootstrapped_result[1]:.0f})")
        
        # Check if all baseline results in each column are the same
        for col in range(baseline_results.shape[1]):
            if not (baseline_results.iloc[:, col] == baseline_results.iloc[0, col]).all():
                raise ValueError(f"Baseline results in column {col} are not consistent.")
        
        # Add baseline values at the top of each column in light grey font
        # NOTE(review): baselines are printed unscaled while the cells are scaled by 100
        for col, baseline_value in enumerate(baseline_results.iloc[0]):
            ax.text(col + 0.5, -0.1, f"Baseline:\n{baseline_value:.2f}", ha='center', va='bottom', color='grey', fontsize=8)
        
        # Move the title up to make room for the baseline values
        ax.set_title(f"Accuracy of meta-level predicting object-level models\non {dataset} eliciting {response_property}", y=1.1)
        
        # Add text explaining the numbers in parentheses
        ax.text(-0.2, -0.0, "(95% bootstrapped CI\nin parentheses)", ha='center', va='center', transform=ax.transAxes, color="grey", fontsize=8)
        # ax.text(-0.2, -0.4, "<Mode–baseline\nin chevrons>", ha='center', va='center', transform=ax.transAxes, color="grey", fontsize=8)
        
        ax.set_xlabel("Scored against object-level")
        ax.set_ylabel("Meta-level")
        ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        
        # Display the plot
        plt.show()

### Logprob heatmap
What is the logprob of the _first token_ of the correct answer under the meta–level model?

In [None]:
# One heatmap per dataset / response-property combination, scored with
# likelihood_of_correct_first_token instead of accuracy.
for dataset in datasets:
    for response_property in response_properties:
        # Create the stdout sink here rather than relying on a `buffer` left
        # over from an earlier cell
        buffer = io.StringIO()

        with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
            results, baseline_results, bootstrapped_results = make_pairwise_tables(likelihood_of_correct_first_token, filter_by_dataset(object_dfs, dataset), filter_by_dataset_and_response_property(meta_dfs, dataset, response_property))
                
        if len(results) == 0 or results.shape[0] == 0:
            if not suppress_output: print(f"No data for {dataset} / {response_property}")
            continue
        
        fig, ax = plt.subplots()
        sns.heatmap(results.astype(float), annot=True, cmap="YlGnBu", cbar=False, ax=ax, fmt=".3f")
        
        # Add bootstrapped 95% CI
        # seaborn draws no annotation for masked (NaN) cells, so the i-th text is
        # not necessarily the i-th cell. Each annotation sits at
        # (col + 0.5, row + 0.5), so the cell is recovered from the text position.
        for text in list(ax.texts):
            x, y = text.get_position()
            row, col = int(y), int(x)
            bootstrapped_result = bootstrapped_results.iloc[row, col]
            text.set_text(f"{text.get_text()}\n({bootstrapped_result[0]:.2f}–{bootstrapped_result[1]:.2f})")
        
        # # Check if all baseline results in each column are the same
        # for col in range(baseline_results.shape[1]):
        #     if not (baseline_results.iloc[:, col] == baseline_results.iloc[0, col]).all():
        #         raise ValueError(f"Baseline results in column {col} are not consistent.")
        
        # Add baseline values at the top of each column in light grey font
        # (only the first row is shown; the per-column consistency check above is disabled)
        for col, baseline_value in enumerate(baseline_results.iloc[0]):
            ax.text(col + 0.5, -0.1, f"Baseline:\n{baseline_value:.2f}", ha='center', va='bottom', color='lightgrey', fontsize=8)
        
        # Move the title up to make room for the baseline values
        ax.set_title(f"Mean log-prob of initial object-level response under meta-level model\non {dataset} eliciting {response_property}", y=1.1)
        
        # Add text explaining the numbers in parentheses
        ax.text(-0.2, -0.0, "(95% bootstrapped CI\nin parentheses)", ha='center', va='center', transform=ax.transAxes, color="grey", fontsize=8)
        
        ax.set_xlabel("Scored against object-level")
        ax.set_ylabel("Meta-level")
        ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        
        # Display the plot
        plt.show()

### Object vs object change heatmap

In [None]:
# which response property do we want to use for the analysis?
# (the next cell writes this name into the object-level configs so that they
# can be passed to make_pairwise_tables in place of meta-level configs)
response_property = "identity"

In [None]:
# Create the stdout sink here rather than relying on a `buffer` left over from
# an earlier cell
buffer = io.StringIO()

# Pairwise overlap between the object-level completions of each dataset
for dataset in datasets:
    with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
        # fake having a meta level for this
        faux_meta_level = filter_by_dataset(object_dfs, dataset)
        # NOTE(review): this writes into the config objects used as keys; if
        # filter_by_dataset hands back the same objects as in object_dfs, those
        # are modified too — confirm this is intended
        for config in faux_meta_level.keys():
            config['response_property'] = {'name': response_property}
        results, _, _ = make_pairwise_tables(calc_accuracy, filter_by_dataset(object_dfs, dataset), faux_meta_level)
        print(f"Overlap between object-level completions for {dataset}")
        
        # hide the cells above the diagonal
        mask = np.triu(np.ones_like(results, dtype=bool), k=1)
        plt.figure(figsize=(results.shape[1] * 0.66, results.shape[0] * 0.66))
        sns.heatmap(results.astype(float), annot=True, cmap="YlGnBu", cbar=False, vmin=0, vmax=1, fmt=".0%", mask=mask)
        # plt.xlabel("Scored against object-level")
        # plt.ylabel("Meta-level")
        plt.title(f"Overlap between object-level completions for {dataset}")
        plt.gca().set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        plt.show()

## Entropy barplots

In [None]:
# entropy of the distribution of distinct responses in a dataframe
measure = lambda df: stats.entropy(df['response'].value_counts(normalize=True))

# Create the stdout sink here rather than relying on a `buffer` left over from
# an earlier cell
buffer = io.StringIO()

# object-level: one bar plot per dataset
for dataset in datasets:
    with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
        results = {get_label(config): measure(df) for config, df in filter_by_dataset(object_dfs, dataset).items()}
        print(f"Entropy of object-level completions for {dataset}")
        plt.figure(figsize=(6, 5))
        sns.barplot(x=list(results.keys()), y=list(results.values()), color = "green")

        plt.title(f"Entropy of object-level completions for {dataset}")
        # plt.gca().set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        plt.xticks(rotation=90)
        plt.show()

# meta-level: one bar plot per dataset. This loop used to be nested inside the
# loop above, which redrew every meta-level plot once per dataset.
for dataset in datasets:
    results = {get_label(config): measure(df) for config, df in filter_by_dataset(meta_dfs, dataset).items()}
    print(f"Entropy of meta-level completions for {dataset}")
    plt.figure(figsize=(6*3, 5))
    sns.barplot(x=list(results.keys()), y=list(results.values()), color = "purple")

    plt.title(f"Entropy of meta-level completions for {dataset}")
    # plt.gca().set_aspect("equal")  # Set aspect ratio to "equal" for square cells
    plt.xticks(rotation=90)
    plt.show()

## Compliance

In [None]:
# share of rows whose 'compliance' column equals True
measure = lambda df: (df['compliance'] == True).mean()

# Create the stdout sink here rather than relying on a `buffer` left over from
# an earlier cell
buffer = io.StringIO()

# object-level: one bar plot per dataset
for dataset in datasets:
    with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
        results = {get_label(config): measure(df) for config, df in filter_by_dataset(object_dfs, dataset).items()}
        print(f"Compliance of object-level completions for {dataset}")
        sns.barplot(x=list(results.keys()), y=list(results.values()), color = "green")

        plt.title(f"Compliance of object-level completions for {dataset}")
        # plt.gca().set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        plt.xticks(rotation=90)
        # scale to percent
        plt.gca().set_yticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_yticks()])
        plt.show()

# meta-level: one bar plot per dataset. This loop used to be nested inside the
# loop above, which redrew every meta-level plot once per dataset.
for dataset in datasets:
    results = {get_label(config): measure(df) for config, df in filter_by_dataset(meta_dfs, dataset).items()}
    print(f"Compliance of meta-level completions for {dataset}")
    sns.barplot(x=list(results.keys()), y=list(results.values()), color = "purple")

    plt.title(f"Compliance of meta-level completions for {dataset}")
    # plt.gca().set_aspect("equal")  # Set aspect ratio to "equal" for square cells
    plt.xticks(rotation=90)
    # scale to percent
    plt.gca().set_yticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_yticks()])
    plt.show()

## View other evals
The other evals write their results to `EXP_DIR / <study folder> / other_evals`.
One CSV file is created per eval.

In [None]:
from evals.locations import EXP_DIR
from other_evals.counterfactuals.plotting.plot_heatmap import load_csv_and_plot_heatmap
# NOTE(review): this rebinds the notebook-wide STUDY_FOLDERS set in the first
# cell, so the HTML export at the end of the notebook also writes into this
# folder — confirm that is intended
STUDY_FOLDERS = ["full_sweep_demo"]
other_evals_path = EXP_DIR / STUDY_FOLDERS[0] / "other_evals"
# sorted so the heatmaps come out in the same order on every run
csv_files = sorted(other_evals_path.glob("*.csv"))
print(f"Found {len(csv_files)} csv files in {other_evals_path}")

# one heatmap per eval CSV
for csv_file_path in csv_files:
    load_csv_and_plot_heatmap(csv_file_path)

## For posterity
Save the notebook as HTML

In [None]:
# Deliberate stop: execution halts here so the notebook can be saved to disk
# by hand before the next cell converts the saved file to HTML.
raise Exception("Manually save the notebook before proceeding!")

In [None]:
# Convert the saved notebook to HTML, once per study folder
PATH_THIS_NB = REPO_DIR / "analysis" / "object_vs_meta_comparisions.ipynb"
for study_folder in STUDY_FOLDERS:
    OUT_PATH = EXP_DIR / study_folder / "object_vs_meta_comparisions.html"
    # check=True raises CalledProcessError when nbconvert exits with an error,
    # instead of silently leaving no (or a stale) HTML file behind
    subprocess.run(
        ["jupyter", "nbconvert", "--to", "html", str(PATH_THIS_NB), "--output", str(OUT_PATH)],
        check=True,
    )