# Comparing object–level completions against meta–level predictions
This notebook compares how well different models perform when their meta-level predictions are scored against the object-level (base) completions of the same model or of other models. This is most useful for checking finetuned models.

In [None]:
# Study folders whose results are loaded; each entry is joined onto EXP_DIR further down.
STUDY_FOLDERS = [ # 🔵 within exp/
    "training_on_many_tasks"
]

# Filter handed to `load_dfs_with_filter` for every study folder.
# NOTE(review): presumably each key is a path into the nested run config and each value the list of accepted settings — confirm in `analysis/loading_data.py`
CONDITIONS = { 
    # see `analysis/loading_data.py` for details
    ("task", "set"): ["val"],
    ("task", "name"): ["wikipedia"],
}

In [None]:
from pathlib import Path
import subprocess
import sys
import random
import logging
import io
import contextlib
from IPython.display import clear_output

In [None]:
# set log level
logging.basicConfig(level=logging.WARNING)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

In [None]:
from evals.analysis.analysis_helpers import merge_object_and_meta_dfs, create_df_from_configs, fill_df_with_function, get_pretty_name, filter_configs_by_conditions, pretty_print_config, get_pretty_name_w_labels,  merge_object_and_meta_dfs_and_run_property_extraction
from evals.analysis.loading_data import load_dfs_with_filter, load_base_df_from_config, get_hydra_config, load_single_df, get_data_path
from evals.load.lazy_object_level_llm_extraction import lazy_add_response_property_to_object_level
from evals.utils import get_maybe_nested_from_dict
from evals.analysis.analysis_functions import *
from evals.analysis.analysis_helpers import bootstrap_ci

In [None]:
# Show up to 200 characters per cell so long prompts/responses are not cut off early
pd.set_option('display.max_colwidth', 200)
# show all columns
pd.set_option('display.max_columns', None)

In [None]:
# set color palette
palette = sns.color_palette("Set1", 64)
sns.set_palette(palette)

In [None]:
# do we have a nice font installed? You might need to clear the matplotlib font cache
plt.rcParams["font.family"] = fm.get_font(fm.findfont("Univers Next Pro")).family_name # falls back to default automatically

# retina plots
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# get seaborn to shut up
import warnings
# Ignore the specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [None]:
from evals.locations import REPO_DIR, EXP_DIR

Load dataframes in

In [None]:
# load the dataframes with configs as keys
# (a config that appears in more than one study folder keeps the dataframe of the last folder loaded)
dfs = {}
for STUDY_FOLDER in STUDY_FOLDERS:
    _dfs = load_dfs_with_filter(EXP_DIR / STUDY_FOLDER, CONDITIONS, exclude_noncompliant=False)
    dfs.update(_dfs)
    print(f"Loaded {len(_dfs)} dataframes from {STUDY_FOLDER}")
# wipe everything printed by this cell so far (including the per-folder counts) and only keep the total
clear_output()
print(f"Loaded {len(dfs)} dataframes in total")

In [None]:
def is_base_config(config):
    """Tell whether a run config belongs to an object-level (base) run.

    A config counts as base when `config["prompt"]["method"]` starts with
    "object" or "base"; every other config is treated as meta-level.
    """
    method = config["prompt"]["method"]
    return method.startswith(("object", "base"))

In [None]:
# split the loaded runs into object-level (base) and meta-level (self-prediction) dataframes
object_dfs = dict(filter(lambda item: is_base_config(item[0]), dfs.items()))
meta_dfs = dict(filter(lambda item: not is_base_config(item[0]), dfs.items()))
print(f"Loaded {len(object_dfs)} base and {len(meta_dfs)} self-prediction dataframes")

In [None]:
print("We have the following datasets:")
# task names that occur among the object-level runs
datasets = {get_maybe_nested_from_dict(config, ('task', 'name')) for config in object_dfs}
print(datasets)

In [None]:
print("We have the following response properties:")
# response properties that the meta-level runs were asked to predict
response_properties = {get_maybe_nested_from_dict(config, ('response_property', 'name')) for config in meta_dfs}
print(response_properties)

## Plots

### Making labels

In [None]:
# every model id that occurs in the object-level or the meta-level runs
{get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in object_dfs.keys()} | {get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in meta_dfs.keys()}

In [None]:
def get_label(config):
    """Return a human-readable label for a model id or a run config.

    `config` may be
      * a model id that is a key of `MODEL_LABELS` (its label is returned directly),
      * a run config, or
      * the string form of a run config (the result tables use `str(config)` as row/column names).

    For configs the label is the model's entry in `MODEL_LABELS` (or the raw model id if it
    has none), followed by the predicted response property and the config's note when set.
    If building the label fails, the failure is printed and `str(config)` is returned.
    """
    try: # if we just pass the model name in, we can skip the rest
        return MODEL_LABELS[config]
    except (KeyError, TypeError):
        # KeyError: not a known model id. TypeError: `config` is unhashable (e.g. a plain dict),
        # which is still a valid config for the lookup below.
        pass
    try:
        label = ""
        if isinstance(config, str):
            # SECURITY: eval() executes arbitrary code. It is only acceptable here because the strings
            # are `str(config)` of configs this notebook loaded itself; never pass untrusted text.
            config = eval(config)
        model = get_maybe_nested_from_dict(config, ('language_model', 'model'))
        if model in MODEL_LABELS:
            model = MODEL_LABELS[model]
        label += model
        response_property = get_maybe_nested_from_dict(config, ('response_property', 'name'))
        if response_property not in ["None", None]:
            label += f"\n predicting {response_property}"
        note = get_maybe_nested_from_dict(config, 'note')
        if note not in ["None", None]:
            label += f"\n{note}"
    except Exception as e:
        # best effort: a missing label must not break the plots, so fall back to the raw value
        print(f"Failed to get label for {config}: {e}")
        label = str(config)
    return label

In [None]:
MODEL_LABELS = {
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnum:8x4lehAb": "GPT3.5 fted on GPT3.5" ,
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnumscram:8x6QzXiQ": "GPT3.5 fted on GPT3.5\n(scrambled)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on4onnum:8xMcmGZM": "GPT3.5 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:4on4onnum:8x8dNwL1": "GPT4 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:4on35onnum:8xq9fNVt": "GPT4 fted on GPT3.5",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnums:8zFjiOFt": "GPT3.5 fted on GPT3.5 (small dataset)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on4onnums:8zHmk4o8": "GPT3.5 fted on GPT4 (small dataset)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35nwvrp:8zJsJdOE": "GPT3.5 fted on GPT3.5\n(various response properties)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:97WTZlBs": "GPT3.5 fted on GPT3.5",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9EXL6W9A": "GPT3.5 fted on GPT3.5", # from training_on_everything_apr_15
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9ErgUPF1": "GPT3.5 fted on Claude 3 Sonnet", # from training_on_everything_apr_15
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9GYUm36T": "GPT3.5 fted on GPT3.5", # from training_on_everything_apr_15_reproduction
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9GYUIKU9": "GPT3.5 fted on Claude 3 Sonnet", # from training_on_everything_apr_15_reproduction
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9OwJgWbn": "GPT3.5 fted on GPT3.5",  # from everything_response_properties_only
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9OwInlu2": "GPT3.5 fted on Claude 3 Sonnet", # from everything_response_properties_only
    "gpt-3.5-turbo-1106": "GPT3.5",
    "gpt-4-0613": "GPT4",
    "claude-3-sonnet-20240229": "Claude 3 Sonnet",
    "claude-3-opus-20240229": "Claude 3 Opus",
    # learning rate/ batch size sweeps
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs1:9OxIyl5Y": "lr 02 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs1:9Oy5MhWO": "lr 05 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs32:9Owy4q3J": "lr 01 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs1:9OwZGDEY": "lr 01 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs5:9OwpdrcW": "lr 01 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs10:9OyScIxy": "lr 05 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs3:9OyFzfs4": "lr 05 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs10:9OwuNmul": "lr 01 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs32:9OyWNifb": "lr 05 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs3:9OxSxaCD": "lr 02 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs1:9Oys0czF": "lr 10 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr5bs5:9OyNWs1I": "lr 05 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs10:9OzCDMh6": "lr 10 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs5:9Oztue51": "lr 15 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr1bs3:9OwjMTpq": "lr 01 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs32:9P051Qau": "lr 15 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs32:9OxjoVCT": "lr 02 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs3:9OznpdVn": "lr 15 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs5:9OxbS5Di": "lr 02 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs5:9Oz5osDG": "lr 10 bs 05",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs32:9OzGe8O8": "lr 10 bs 32",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs1:9OzdOc1K": "lr 15 bs 01",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr10bs3:9OyzbPKI": "lr 10 bs 03",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr15bs10:9P00MlIM": "lr 15 bs 10",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2bs10:9OxgItcg": "lr 02 bs 10",
    # how much data to train on
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size25:9PeI9672": "0025",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size1000:9PeeYDxO": "1000",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size100:9PeMX537": "0100",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size10:9PeFNUoB": "0010",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size500:9PeSivnS": "0500",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size400:9PhFYzvA": "0400",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size250:9PhA5O0g": "0250",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size750:9PhVaFNR": "0750",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size600:9PhM4j2D": "0600",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:size900:9PhdncQc": "0900",
    # training_on_many_tasks
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:no-rp-1:9PgraN40": "no-rp-1",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:no-rp-3:9PgU7cqI": "no-rp-3",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:rp-2:9PgIbrdZ": "rp-2",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:no-rp-2:9Pgck86p": "no-rp-2",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:rp-1:9Pgiyv8C": "rp-1",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:all:9QQHO0EH": "all",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:rp-3:9QQKfdt3": "rp-3",
}

In [242]:
# Print one `"<endpoint>": "<label>",` line per Gemini endpoint, built from its tuning hyperparameters.
# NOTE(review): the printed lines look like they are meant to be pasted into MODEL_LABELS — confirm.
# NOTE(review): `get_hparams_for_endpoints` is not defined in the visible cells; presumably it comes from the star import above.
print('Get names for Gemini models')
endpoints = [
    "projects/351298396653/locations/us-central1/endpoints/3197931768409751552",
    "projects/351298396653/locations/us-central1/endpoints/5520944751202795520"
]
hparams_list = get_hparams_for_endpoints(endpoints)
for hp, endpoint in zip(hparams_list, endpoints):
    # missing hyperparameters are printed as 'none'; only the part of adapterSize after the last '_' is kept
    print(f"\"{endpoint}\": \"G_ft_G e {hp.get('epochCount','none')} lr {str(hp.get('learningRateMultiplier','none'))} a {hp.get('adapterSize','none').split('_')[-1]}\",")

Get names for Gemini models, make sure 
"projects/351298396653/locations/us-central1/endpoints/3197931768409751552": "G_ft_G e none lr none a none",
"projects/351298396653/locations/us-central1/endpoints/5520944751202795520": "G_ft_G e none lr none a none",


In [None]:
# which of the loaded models have no entry in MODEL_LABELS yet?
object_level_models = {get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in object_dfs.keys()}
meta_level_models = {get_maybe_nested_from_dict(c, ('language_model', 'model')) for c in meta_dfs.keys()}
models_wo_labels = [model for model in object_level_models.union(meta_level_models) if model not in MODEL_LABELS]
if len(models_wo_labels) > 0:
    print("Models without labels:")
else:
    print("All models have labels")
for m in models_wo_labels:
    print(m)

In [None]:
# get the genealogy of models: parent model id -> list of the model ids finetuned from it
MODEL_GENEALOGY = {}
# the id of a finetuned model contains the id of the model it was finetuned from, so any other
# known id that occurs inside a model's id is taken to be its parent
model_ids = MODEL_LABELS.keys()
for model_id in model_ids:
    candidate_parents = [other_id for other_id in model_ids if other_id != model_id and other_id in model_id]
    if not candidate_parents:
        # no other id is contained in this one: this is not a finetuned model
        MODEL_GENEALOGY.setdefault(model_id, [])
        continue
    # this model has a parent; when several ids match, the first one in MODEL_LABELS is used
    parent_id = candidate_parents[0]
    MODEL_GENEALOGY.setdefault(parent_id, []).append(model_id)

# the same genealogy, expressed in human-readable labels
MODEL_GENEALOGY_LABELS = {
    get_label(parent): {get_label(child) for child in children}
    for parent, children in MODEL_GENEALOGY.items()
}

MODEL_GENEALOGY

In [None]:
MODEL_GENEALOGY_LABELS

### Helper functions

In [None]:
def construct_mode_object_df(df: pd.DataFrame, response_property: str):
    """Return a copy of an object-level df in which every row carries the mode response.

    The mode is the most common cleaned (`clean_string`) value of `response_property` among the
    compliant rows. One row that has this value is then copied over every row, except for the
    `string` (input) and `compliance` columns, which keep their per-row values.

    Scoring against the result tells us how well a model would do if it always predicted the
    most common response, i.e. if finetuning only taught it the output distribution and no
    connection to the inputs.

    Args:
        df: object-level dataframe; must contain `string`, `compliance` and `response_property`.
        response_property: name of the column holding the response property.

    Returns:
        A new dataframe of the same length; the input is left unchanged.

    Raises:
        ValueError: if no compliant row has a usable value, so that there is no mode.
    """
    # ensure that we're not changing the input df in-place
    df = df.copy()
    # clean once; the cleaned values are needed for finding the mode and for locating a row that has it
    cleaned = df[response_property].apply(clean_string)
    # get most common response property among the compliant rows
    compliant_modes = cleaned[df['compliance'] == True].mode()
    if compliant_modes.empty:
        raise ValueError(f"Cannot compute the mode of '{response_property}': no compliant rows with a value")
    mode = compliant_modes.iloc[0] # if multiple most common answers, chooses one
    # explicit copy, so that writing into the row below is not a chained assignment on a slice of df
    mode_row = df[cleaned == mode].head(1).copy()
    # ensure that the mode row has the cleaned string
    mode_row[response_property] = mode
    # drop the input string and the compliance flag: those stay as they are in every row
    mode_row = mode_row.drop(columns=["string", "compliance"])
    # replace the rest of every row with mode_row
    for column in mode_row.columns:
        # a list (rather than a scalar) so that cells which themselves hold sequences are assigned as-is
        df[column] = [mode_row[column].item()] * len(df)
    return df
    

In [None]:
BOOTSTRAP_ITERATIONS = 10

def make_pairwise_tables(measure, object_dfs, meta_dfs):
    """Score every meta-level run against every object-level run.

    Args:
        measure: callable that takes a joint (object + meta) dataframe and returns a score.
        object_dfs: dict mapping object-level configs to their dataframes.
        meta_dfs: dict mapping meta-level configs to their dataframes.

    Returns:
        A tuple `(results, baseline_results, bootstrapped_results)` of dataframes that share the
        same index (meta-level labels) and columns (object-level labels), both sorted:
          * results: `measure` of the joint dataframe,
          * baseline_results: `measure` when the meta-level side is replaced by the object-level
            df with every response set to the mode (see `construct_mode_object_df`),
          * bootstrapped_results: `bootstrap_ci` of `measure` on the joint dataframe.
        Rows and columns for which `results` holds no value at all are dropped from all three.
    """
    column_names = [str(config) for config in object_dfs.keys()]
    row_names = [str(config) for config in meta_dfs.keys()]
    results = pd.DataFrame(columns=column_names, index=row_names)
    baseline_results = pd.DataFrame(columns=column_names, index=row_names) # the model is compared against always predicting the mode
    bootstrapped_results = pd.DataFrame(columns=column_names, index=row_names)
    for object_config, object_df in object_dfs.items():
        for meta_config, meta_df in meta_dfs.items():
            # compute joint df
            joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                object_df,
                meta_df,
                object_config,
                meta_config,
            )
            if len(joint_df) == 0:
                print(f"Empty dataframe for {object_config} and {meta_config}")
                continue
            row, col = str(meta_config), str(object_config)
            results.loc[row, col] = measure(joint_df)

            # what would we see under the baseline of always picking the object-level mode?
            # in some cases, we might not have the response property in the object_df. In this case, we need to add it.
            # (object_df is rebound on purpose: the added column is reused for the remaining meta configs)
            response_property_name = meta_config['response_property']['name']
            if response_property_name not in object_df.columns:
                object_df = lazy_add_response_property_to_object_level(object_df, object_config, response_property_name)

            # modify the object-level df to always contain the mode
            mode_object_df = construct_mode_object_df(object_df, response_property_name)
            # compute joint df, with the mode df standing in for the meta-level predictions
            mode_joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                object_df,
                mode_object_df,
                object_config,
                meta_config,
            )
            if len(mode_joint_df) > 0:
                baseline_results.loc[row, col] = measure(mode_joint_df)
            else:
                # leave the baseline empty, but keep the confidence interval of the real comparison
                print(f"Empty baseline dataframe for {object_config} and {meta_config}")

            # we want to compute the 95%CI of the measure. We do this by bootstrapping over resampling the joint_df
            bootstrapped_results.loc[row, col] = bootstrap_ci(joint_df, measure, BOOTSTRAP_ITERATIONS)

    # replace the config strings by human readable labels
    tables = [results, baseline_results, bootstrapped_results]
    for table in tables:
        table.index = table.index.map(get_label)
        table.columns = table.columns.map(get_label)
    # do we have columns that are all NaN? This happens eg. when we are reading in task.set==train dataframes, and only compare against val
    # NOTE: dropping works by label, so configs that share a label are dropped together
    drop_cols = results.columns[results.isna().all(axis=0)]
    # and rows too
    drop_rows = results.index[results.isna().all(axis=1)]
    # drop them from all three tables, then sort the columns and the rows
    results, baseline_results, bootstrapped_results = [
        table.drop(columns=drop_cols).drop(index=drop_rows).sort_index(axis=0).sort_index(axis=1)
        for table in tables
    ]
    assert results.shape == baseline_results.shape == bootstrapped_results.shape
    assert results.columns.equals(baseline_results.columns) and results.index.equals(baseline_results.index)
    assert results.columns.equals(bootstrapped_results.columns) and results.index.equals(bootstrapped_results.index)
    return results, baseline_results, bootstrapped_results

In [None]:
def filter_by_dataset(dfs, dataset):
    """Keep the entries of `dfs` (config -> dataframe) whose config has task name `dataset`."""
    selected = {}
    for config, df in dfs.items():
        if get_maybe_nested_from_dict(config, ('task', 'name')) == dataset:
            selected[config] = df
    return selected

def filter_by_dataset_and_response_property(dfs, dataset, response_property):
    """Keep the entries of `dfs` (config -> dataframe) whose config matches both the task name
    `dataset` and the response property name `response_property`."""
    selected = {}
    for config, df in dfs.items():
        task_matches = get_maybe_nested_from_dict(config, ('task', 'name')) == dataset
        if task_matches and get_maybe_nested_from_dict(config, ('response_property', 'name')) == response_property:
            selected[config] = df
    return selected

Do we want to see debugging output in the plots?

In [None]:
suppress_output = True

### Aggregate measure
Across all tasks, how do the models compare?

In [None]:
# we need to make groups of all models that belong together
def _group_by_model(dfs_by_config):
    """Map each model id to a list of single-entry {config: df} dicts, one per run of that model."""
    groups = {}
    for cfg in set(dfs_by_config.keys()):
        model = cfg['language_model']['model']
        groups[model] = [
            {k: v} for k, v in dfs_by_config.items() if k['language_model']['model'] == model
        ]
    return groups

object_dfs_groups = _group_by_model(object_dfs)
meta_dfs_groups = _group_by_model(meta_dfs)

In [None]:
def make_pairwise_table_across_everything(measure, object_dfs, meta_dfs):
    """Score every meta-level model against every object-level model, pooled over all their runs.

    The runs are grouped by `config['language_model']['model']`. For each (object model, meta
    model) pair, every object-level run is merged with the meta-level runs on the same task, and
    the merged dataframes are pooled before `measure` is applied.

    Args:
        measure: callable that takes a joint (object + meta) dataframe and returns a score.
        object_dfs: dict mapping object-level configs to their dataframes.
        meta_dfs: dict mapping meta-level configs to their dataframes.

    Returns:
        A tuple `(results, bootstrapped_results, count_results, all_joint_df)`. The first three
        are dataframes with meta-level labels as index and object-level labels as columns:
          * results: `measure` of the pooled joint dataframe,
          * bootstrapped_results: `bootstrap_ci` of `measure` on the pooled joint dataframe,
          * count_results: number of rows that have both extracted properties.
        `all_joint_df` is the concatenation of the pooled joint dataframes of all model pairs
        (empty if there was no data at all).
    """
    def _group_by_model(dfs_by_config):
        # model id -> {config: df} of all runs of that model
        groups = {}
        for config, df in dfs_by_config.items():
            groups.setdefault(config['language_model']['model'], {})[config] = df
        return groups

    # group the runs that were passed in, instead of relying on notebook-level globals
    object_groups = _group_by_model(object_dfs)
    meta_groups = _group_by_model(meta_dfs)

    def _empty_table():
        return pd.DataFrame(columns=list(object_groups), index=list(meta_groups))

    results = _empty_table()
    bootstrapped_results = _empty_table()
    count_results = _empty_table() # how many datapoints do we have for each comparison
    pair_joint_dfs = [] # the pooled joint df of every model pair that had data
    for object_group, group_object_dfs in object_groups.items():
        for meta_group, group_meta_dfs in meta_groups.items():
            joint_dfs = []
            for object_config, object_df in group_object_dfs.items():
                # only compare runs on the same task
                for meta_config, meta_df in filter_by_dataset(group_meta_dfs, object_config["task"]["name"]).items():
                    # compute joint df
                    joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                        object_df,
                        meta_df,
                        object_config,
                        meta_config,
                    )
                    if len(joint_df) == 0:
                        print(f"Empty dataframe for {object_config} and {meta_config}")
                        continue
                    joint_dfs.append(joint_df)
            if not joint_dfs:
                print(f"Empty dataframe for {object_group} and {meta_group}")
                continue
            # concatenate once per pair rather than growing a dataframe inside the loop
            pair_joint_df = pd.concat(joint_dfs)
            results.loc[meta_group, object_group] = measure(pair_joint_df)
            bootstrapped_results.loc[meta_group, object_group] = bootstrap_ci(pair_joint_df, measure, BOOTSTRAP_ITERATIONS)
            count_results.loc[meta_group, object_group] = len(pair_joint_df[['extracted_property_meta','extracted_property_object']].dropna())
            pair_joint_dfs.append(pair_joint_df)

    # add human readable labels
    tables = [results, bootstrapped_results, count_results]
    for table in tables:
        table.index = table.index.map(get_label)
        table.columns = table.columns.map(get_label)
    # do we have columns that are all NaN? This happens eg. when we are reading in task.set==train dataframes, and only compare against val
    drop_cols = results.columns[results.isna().all(axis=0)]
    # and rows too
    drop_rows = results.index[results.isna().all(axis=1)]
    # drop them from all three tables, then sort the columns and the rows
    results, bootstrapped_results, count_results = [
        table.drop(columns=drop_cols).drop(index=drop_rows).sort_index(axis=0).sort_index(axis=1)
        for table in tables
    ]
    assert results.shape == bootstrapped_results.shape
    assert results.columns.equals(bootstrapped_results.columns) and results.index.equals(bootstrapped_results.index)
    # return the joint data of ALL pairs, not just of whichever pair happened to be processed last
    all_joint_df = pd.concat(pair_joint_dfs) if pair_joint_dfs else pd.DataFrame()
    return results, bootstrapped_results, count_results, all_joint_df

In [None]:
agg_results, agg_results_bootstrapped, agg_count_results, joint_df = make_pairwise_table_across_everything(calc_accuracy_with_excluded, object_dfs, meta_dfs)

In [None]:
# save the joint_df
joint_df.to_csv(EXP_DIR / STUDY_FOLDERS[0] / "all_joint_df.csv")

In [None]:
agg_results

In [None]:
agg_results_bootstrapped

In [None]:
agg_count_results

In [None]:
# Heatmap of the aggregated accuracy: rows are meta-level models, columns are object-level models
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(agg_results.astype(float), annot=True, fmt=".2f", cmap="viridis", vmin=0, vmax=1, ax=ax, cbar=False)

# Add bootstrapped 95% CI
# seaborn leaves NaN cells without annotation, so the i-th text is not necessarily the i-th cell.
# Every annotation sits at the centre of its cell, (col + 0.5, row + 0.5), so we recover the cell from there.
for text in ax.texts:
    x, y = text.get_position()
    row, col = int(y), int(x)
    bootstrapped_result = agg_results_bootstrapped.iloc[row, col]
    try:
        text.set_text(f"{text.get_text()}\n({bootstrapped_result[0]:.2f}–{bootstrapped_result[1]:.2f})")
    except TypeError:
        # not an interval (e.g. a single number or NaN): show it as it is
        text.set_text(f"{text.get_text()}\n({bootstrapped_result:.2f})")

ax.set_xlabel("Object-level model")
ax.set_ylabel("Meta-level model")

# Add text explaining what the numbers in parentheses are
ax.text(
    -0.15,
    -0.0,
    "(95% bootstrapped CI\nin parentheses)",
    ha="center",
    va="center",
    transform=ax.transAxes,
    color="grey",
    fontsize=8,
)


plt.title("Aggregated Accuracy over all tasks and response properties")

plt.show()

In [None]:
# TEMP to look at the effect of batch size and learning rate
# the sweep models are labelled like "lr 02 bs 01": the learning rate sits between "lr" and "bs", the batch size after "bs"
indices = [n for n in agg_results.index if "lr" in n and "bs" in n]
batch_sizes = {c.split("bs")[1].split(":")[0] for c in indices}
learning_rates = {c.split("lr")[1].split("bs")[0] for c in indices}
# the training-size models are labelled with digits only, e.g. "0250"
sizes = sorted({c for c in agg_results.index if c.isdigit()})

In [None]:
sorted(sizes)

In [None]:
# Accuracy over batch size: mean over all sweep models that share a batch size, with the standard
# deviation across those models as error bars. One curve per object-level target:
# the model itself (object_label None) and the "GPT3.5" column.
plt.figure(figsize=(10, 6))
for object_label, curve_label in [(None, "acc(A_fton_A, A_fton_A)"), ("GPT3.5", "acc(A_fton_A, A)")]:
    results = {}
    for bs in batch_sizes:
        selected_indices = [c for c in indices if f"bs{bs}" in c]
        results[bs] = [agg_results.loc[i, i if object_label is None else object_label] for i in selected_indices]

    # order the groups by their numeric batch size
    results = dict(sorted(results.items(), key=lambda item: int(item[0])))

    # mean and standard deviation per batch size
    batch_sizes_sorted = list(results)
    means = [np.mean(results[bs]) for bs in batch_sizes_sorted]
    std_devs = [np.std(results[bs]) for bs in batch_sizes_sorted]

    # numeric x positions for plotting
    batch_sizes_sorted = [int(bs) for bs in batch_sizes_sorted]

    plt.errorbar(batch_sizes_sorted, means, yerr=std_devs, fmt='-o', capsize=5, capthick=2, label=curve_label)

plt.xlabel('Batch Size')
plt.ylabel('Accuracy')
plt.title('Batch Size')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# general self-prediction (might not make sense)
# every meta-level model scored against itself ...
results = {md: agg_results.loc[md, md] for md in agg_results.index}
plt.plot(list(results.keys()), list(results.values()), '-o', label = "acc(A_fton_A, A_fton_A)")

# ... and against gpt-3.5
results = {md: agg_results.loc[md, "GPT3.5"] for md in agg_results.index}
plt.plot(list(results.keys()), list(results.values()), '-o', label = "acc(A_fton_A, A)")

plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
# Accuracy over learning rate: mean over all sweep models that share a learning rate, with the
# standard deviation across those models as error bars. One curve per object-level target:
# the model itself (object_label None) and the "GPT3.5" column.
plt.figure(figsize=(10, 6))
for object_label, curve_label in [(None, "acc(A_fton_A, A_fton_A)"), ("GPT3.5", "acc(A_fton_A, A)")]:
    results = {}
    for lr in learning_rates:
        selected_indices = [c for c in indices if f"lr{lr}" in c]
        results[lr] = [agg_results.loc[i, i if object_label is None else object_label] for i in selected_indices]

    # order the groups by their numeric learning rate
    results = dict(sorted(results.items(), key=lambda item: int(item[0])))

    # mean and standard deviation per learning rate
    lr_sorted = list(results)
    means = [np.mean(results[lr]) for lr in lr_sorted]
    std_devs = [np.std(results[lr]) for lr in lr_sorted]

    # numeric x positions for plotting
    lr_sorted = [int(lr) for lr in lr_sorted]

    plt.errorbar(lr_sorted, means, yerr=std_devs, fmt='-o', capsize=5, capthick=2, label=curve_label)

plt.xlabel('Learning Rate')
plt.ylabel('Accuracy')
plt.title('Learning Rate')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
agg_results

In [None]:
# Accuracy over training set size, with the standard deviation across the models of one size as
# error bars. One curve per object-level target: the model itself (object_label None) and the
# "GPT3.5" column.
plt.figure(figsize=(10, 6))
for object_label, curve_label in [(None, "acc(A_fton_A, A_fton_A)"), ("GPT3.5", "acc(A_fton_A, A)")]:
    results = {}
    for size in sizes:
        selected_indices = [c for c in agg_results.index if f"{size}" in c]
        results[size] = [agg_results.loc[i, i if object_label is None else object_label] for i in selected_indices]

    # order the groups by their numeric training size
    results = dict(sorted(results.items(), key=lambda item: int(item[0])))

    # mean and standard deviation per training size
    size_sorted = list(results)
    means = [np.mean(results[size]) for size in size_sorted]
    std_devs = [np.std(results[size]) for size in size_sorted]

    # numeric x positions for plotting
    size_sorted = [int(size) for size in size_sorted]

    plt.errorbar(size_sorted, means, yerr=std_devs, fmt='-o', capsize=5, capthick=2, label=curve_label)

plt.xlabel('Training size')
plt.ylabel('Accuracy')
plt.title('How much data to train on')
plt.legend()
plt.grid(True)
plt.show()

**Hypotheses**:
- H1 Introspection: Introspection training causes model to answer questions using introspection (internal access to what answer would be on object task). 
    - Prediction: if you do introspection training and it's successful, then you could finetune again to shift the object-level distribution, and afterwards test the model on predicting itself. 
Not introspection:
- H2 (Extrospection) Finetuned model learns distribution it was trained on (object outputs and response properties of outputs). 
    - Question of what to expect OOD. Assume usual transfer story, mimicking similar distributions. 
- H3 (ignoring meta-level info). Finetuned model learns to ignore all but the object-level prompt. 
    - Predicts failure on response properties and manipulating the input string.
    - What about if H3 holds for self-prediction but we have something else explaining response properties?
- H4 (simplification of distribution). The finetuning causes the model’s object distribution to become very simple, such that the model’s meta-level predictions can match it without a systematic causal link between the two (as would be predicted by H1). 
    - Predicts that output distributions will be simple. 
        - Mode collapse: most of mass is on a single output
        - Entropy / variance
        - Few-shot prompt or train another model on the distribution and see how well it does
    - Could test this by seeing how well GPT4 does with 50-shot at predicting the object-level distribution before/after finetuning
- H5 Training doesn’t improve performance beyond increasing adherence. 


**Evidence involving A_fton_A**

Intuitive: A_fton_A does better at predicting A_fton_A than it does at predicting A. 
Need a notion of predicting better. 
If one distribution has predict_mode baseline of 90% and model gets 91%, this is intuitively not as good as if baseline is 0% and model gets 90%. So raw difference in accuracy is not sufficient. 

`Difference(accuracy(A_fton_A, A_fton_A), accuracy(A_fton_A, A) )`

The accuracy can either be adjusted for inherent complexity of the distributions or not. We should show both. 

E.g. 
difference_raw = raw_accuracy(A_fton_A, A_fton_A) -  raw_accuracy(A_fton_A, A)

difference_adjusted would in both places replace the raw_accuracy with an adjusted accuracy, that gives lower scores if the underlying distribution is simple:
adjusted_accuracy(A_fton_A, A_fton_A) = raw_accuracy(A_fton_A, A_fton_A) - mode_baseline(A_fton_A). 



**Evidence involving A_fton_B**

`Difference(accuracy(A_fton_B, A_fton_B), accuracy(A_fton_B, B) )`

Simple story. On H2, model is better at predicting B than predicting itself, because it is trained on B. 

There is distribution shift in object level for A_fton_B. But H2 would still predict the difference above, even with some distribution shift towards B, assuming that the shift doesn’t take you all the way to B. (Note that H2 assumes zero introspection). 

Another possibility is a version of H4. A_fton_B’s own distribution becomes very simple and it happens to predict this distribution. We can rule this out by testing complexity and by doing some adjustment in computing accuracy. 


**A_fton_A**: how much better does an introspection-trained model predict itself compared to predicting the fixed training target, ie. the non-trained version of itself?

`Difference(accuracy(A_fton_A, A_fton_A), accuracy(A_fton_A, A) )`


In [None]:
MODEL_GENEALOGY_LABELS

In [None]:
# was a model trained on itself rather than on another model?
# we code this as "does the model contain the name of the one it was trained on twice?"
def is_self_trained(model, parent_model):
    """Return True if `parent_model` occurs exactly twice in `model`,
    as in "GPT3.5 fted on GPT3.5" with the parent "GPT3.5"."""
    occurrences = model.count(parent_model)
    return occurrences == 2

# For every child label in the genealogy, record whether its label marks it as
# trained on its own parent (see `is_self_trained`).
SELF_TRAINED_MODEL_GENEAOLGY_LABELS = dict(
    (child_label, is_self_trained(child_label, base_label))
    for base_label, child_labels in MODEL_GENEALOGY_LABELS.items()
    for child_label in child_labels
)
SELF_TRAINED_MODEL_GENEAOLGY_LABELS

In [None]:
# A_fton_A: for each self-trained model, report how much more accurately it
# predicts itself than it predicts the parent model it was trained on.
for meta_label in agg_results.index:
    # labels missing from the genealogy, or not flagged as self-trained, are skipped
    if not SELF_TRAINED_MODEL_GENEAOLGY_LABELS.get(meta_label, False):
        continue

    acc_on_self = agg_results.loc[meta_label, meta_label]
    ci_on_self = agg_results_bootstrapped.loc[meta_label, meta_label]

    # the parent is the first genealogy entry that lists this model as a child
    parent_model = next(
        (candidate for candidate, offspring in MODEL_GENEALOGY_LABELS.items() if meta_label in offspring),
        None,
    )
    if parent_model is None:
        continue  # no parent on record, nothing to compare against

    acc_on_parent = agg_results.loc[meta_label, parent_model]
    ci_on_parent = agg_results_bootstrapped.loc[meta_label, parent_model]

    acc_improvement = acc_on_self - acc_on_parent
    # interval for the difference from the two CIs' opposite ends:
    # (lowest self - highest parent, highest self - lowest parent)
    lower_bound = ci_on_self[0] - ci_on_parent[1]
    upper_bound = ci_on_self[1] - ci_on_parent[0]
    ci_improvement = (lower_bound, upper_bound)

    print(f"'{meta_label}':\t The introspection–trained models accuracy is **{acc_improvement:.3f}** higher when predicting itself ('{meta_label}') than when predicting what it has been trained on ('{parent_model}') (95% CI: {ci_improvement[0]:.2f}–{ci_improvement[1]:.2f})")

**A_fton_B**: how much better does a model A that was trained on another model B predict itself compared to predicting B, the model it was trained on?

`Difference(accuracy(A_fton_B, A_fton_B), accuracy(A_fton_B, B) )`

In [None]:
def _split_cross_trained_label(meta_label, known_labels):
    """Recover (A, B) from the label of a model A that was trained on B.

    A is the known label that `meta_label` starts with; B is a different known
    label that occurs in the rest of `meta_label` after that prefix. Longer
    labels are tried first so that a label which is a prefix/substring of
    another label cannot win over the more specific one.

    Returns a `(parent_label, target_label)` tuple, or `None` if no such pair
    can be found.
    """
    most_specific_first = sorted(known_labels, key=len, reverse=True)
    for parent_label in most_specific_first:
        if not meta_label.startswith(parent_label):
            continue
        # only look for B after the parent prefix, so the parent's own name
        # (or a fragment of it) is never mistaken for the training target
        remainder = meta_label[len(parent_label):]
        for target_label in most_specific_first:
            if target_label != parent_label and target_label in remainder:
                return parent_label, target_label
    return None


# A_fton_B: for each model trained on a *different* model B, report how much
# more accurately it predicts itself than it predicts B.
for meta_label in agg_results.index:
    # skip labels missing from the genealogy and the self-trained models
    # (those are covered by the A_fton_A cell)
    if meta_label not in SELF_TRAINED_MODEL_GENEAOLGY_LABELS or SELF_TRAINED_MODEL_GENEAOLGY_LABELS[meta_label]:
        continue

    acc_on_self = agg_results.loc[meta_label, meta_label]
    ci_on_self = agg_results_bootstrapped.loc[meta_label, meta_label]

    # work out which model this one has been trained on
    split = _split_cross_trained_label(meta_label, MODEL_GENEALOGY_LABELS)
    if split is None:
        continue  # cannot tell what this model was trained on
    _parent_label, target_label = split

    acc_on_target = agg_results.loc[meta_label, target_label]
    ci_on_target = agg_results_bootstrapped.loc[meta_label, target_label]

    acc_improvement = acc_on_self - acc_on_target
    # interval for the difference from the two CIs' opposite ends:
    # (lowest self - highest target, highest self - lowest target)
    ci_improvement = (
        ci_on_self[0] - ci_on_target[1],
        ci_on_self[1] - ci_on_target[0],
    )

    # exactly one line per model: the search above stops at the first match
    print(
        f"'{meta_label}':\t The introspection–trained models accuracy is **{acc_improvement:.3f}** higher when predicting itself ('{meta_label}') than when predicting what it has been trained on ('{target_label}') (95% CI: {ci_improvement[0]:.2f}–{ci_improvement[1]:.2f})"
    )

### Accuracy heatmap

In [None]:
# One accuracy heatmap per (dataset, response property): rows are meta-level
# models, columns are the object-level models they are scored against.
for dataset in datasets:
    for response_property in response_properties:
        # sink that swallows anything printed while the tables are computed
        buffer = io.StringIO()
        stdout_target = contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext()

        with stdout_target:
            results, baseline_results, bootstrap_results = make_pairwise_tables(
                calc_accuracy_with_excluded,
                filter_by_dataset(object_dfs, dataset),
                filter_by_dataset_and_response_property(meta_dfs, dataset, response_property),
            )

        nothing_to_plot = len(results) == 0 or results.shape[0] == 0
        if nothing_to_plot:
            if not suppress_output:
                print(f"No data for {dataset} / {response_property}")
            continue

        fig, ax = plt.subplots()
        sns.heatmap(
            results.astype(float),
            ax=ax,
            annot=True,
            fmt=".2f",
            cmap="YlGnBu",
            vmin=0,
            vmax=1,
            cbar=False,
        )

        # Append the bootstrapped 95% CI to each cell annotation. This relies on
        # ax.texts holding one annotation per cell, in row-major order.
        for flat_index, annotation in enumerate(ax.texts):
            row, col = np.unravel_index(flat_index, results.shape)
            bootstrapped_result = bootstrap_results.iloc[row, col]
            annotation.set_text(f"{annotation.get_text()}\n({bootstrapped_result[0]:.2f}–{bootstrapped_result[1]:.2f})")

        # Every row of a column must carry the same baseline, since only the
        # first row's value is displayed below.
        n_columns = baseline_results.shape[1]
        for col in range(n_columns):
            agrees_with_first = baseline_results.iloc[:, col] == baseline_results.iloc[0, col]
            if not agrees_with_first.all():
                raise ValueError(f"Baseline results in column {col} are not consistent.")

        # Baseline values go above each column in grey
        first_row_baselines = baseline_results.iloc[0]
        for col, baseline_value in enumerate(first_row_baselines):
            ax.text(col + 0.5, -0.1, f"Baseline:\n{baseline_value:.2f}", ha='center', va='bottom', color='grey', fontsize=8)

        # Title is shifted upwards to leave room for the baseline values
        ax.set_title(f"Accuracy of meta-level predicting object-level models\non {dataset} eliciting {response_property}", y=1.1)

        # Legend for the numbers in parentheses
        ax.text(-0.2, -0.0, "(95% bootstrapped CI\nin parentheses)", ha='center', va='center', transform=ax.transAxes, color="grey", fontsize=8)

        ax.set_xlabel("Scored against object-level")
        ax.set_ylabel("Meta-level")
        ax.set_aspect("equal")  # square cells

        plt.show()

### Logprob heatmap
What is the logprob of the _first token_ of the correct answer under the meta–level model?

In [None]:
# One log-prob heatmap per (dataset, response property): rows are meta-level
# models, columns are the object-level models they are scored against.
for dataset in datasets:
    for response_property in response_properties:
        # Own sink for suppressed output, so this cell does not depend on a
        # `buffer` variable left behind by an earlier cell.
        buffer = io.StringIO()

        with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
            results, baseline_results, bootstrapped_results = make_pairwise_tables(
                likelihood_of_correct_first_token,
                filter_by_dataset(object_dfs, dataset),
                filter_by_dataset_and_response_property(meta_dfs, dataset, response_property),
            )

        if len(results) == 0 or results.shape[0] == 0:
            if not suppress_output:
                print(f"No data for {dataset} / {response_property}")
            continue

        fig, ax = plt.subplots()
        sns.heatmap(results.astype(float), annot=True, cmap="YlGnBu", cbar=False, ax=ax, fmt=".3f")

        # Append the bootstrapped 95% CI to each cell annotation. This relies on
        # ax.texts holding one annotation per cell, in row-major order.
        for i, text in enumerate(ax.texts):
            row, col = np.unravel_index(i, results.shape)
            bootstrapped_result = bootstrapped_results.iloc[row, col]
            text.set_text(f"{text.get_text()}\n({bootstrapped_result[0]:.2f}–{bootstrapped_result[1]:.2f})")

        # Add baseline values at the top of each column in light grey font.
        # Unlike the accuracy heatmap, no per-column consistency check is run
        # here, so the value shown is simply the first row's baseline.
        for col, baseline_value in enumerate(baseline_results.iloc[0]):
            ax.text(col + 0.5, -0.1, f"Baseline:\n{baseline_value:.2f}", ha='center', va='bottom', color='lightgrey', fontsize=8)

        # Move the title up to make room for the baseline values
        ax.set_title(f"Mean log-prob of initial object-level response under meta-level model\non {dataset} eliciting {response_property}", y=1.1)

        # Add text explaining the numbers in parentheses
        ax.text(-0.2, -0.0, "(95% bootstrapped CI\nin parentheses)", ha='center', va='center', transform=ax.transAxes, color="grey", fontsize=8)

        ax.set_xlabel("Scored against object-level")
        ax.set_ylabel("Meta-level")
        ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells

        # Display the plot
        plt.show()

### Object vs object change heatmap

In [None]:
# Which response property do we want to use for the object-vs-object analysis?
# NOTE: this rebinds `response_property`, the name the heatmap cells above use
# as their loop variable.
response_property = "identity"

In [None]:
# Pairwise overlap between object-level completions: the object-level data is
# scored against itself by passing it in as a stand-in "meta level".
for dataset in datasets:
    # Own sink for suppressed output, so this cell does not depend on a
    # `buffer` variable left behind by an earlier cell.
    buffer = io.StringIO()

    with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
        # fake having a meta level for this
        faux_meta_level = filter_by_dataset(object_dfs, dataset)
        for config in faux_meta_level.keys():
            # NOTE(review): this edits the config objects in place; if they are
            # the same objects that key `object_dfs`, the change outlives this
            # cell — confirm that is acceptable.
            config['response_property'] = {'name': response_property}
        results, _, _ = make_pairwise_tables(calc_accuracy, filter_by_dataset(object_dfs, dataset), faux_meta_level)
        print(f"Overlap between object-level completions for {dataset}")

    # same guard as the other heatmap cells: nothing to draw without rows
    if len(results) == 0 or results.shape[0] == 0:
        if not suppress_output:
            print(f"No data for {dataset} / {response_property}")
        continue

    # hide the cells above the diagonal (k=1 keeps the diagonal visible)
    mask = np.triu(np.ones_like(results, dtype=bool), k=1)

    fig, ax = plt.subplots()
    sns.heatmap(results.astype(float), annot=True, cmap="YlGnBu", cbar=False, vmin=0, vmax=1, fmt=".0%", mask=mask, ax=ax)
    ax.set_title(f"Overlap between object-level completions for {dataset}")
    ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells
    plt.show()

## Entropy barplots

In [None]:
def measure(df):
    """Entropy of the empirical distribution of the values in `df['response']`."""
    return stats.entropy(df['response'].value_counts(normalize=True))


# For every dataset draw two bar plots, one bar per model: first the
# object-level completions, then the meta-level completions.
for dataset in datasets:
    for level, level_dfs, bar_color in (("object", object_dfs, "green"), ("meta", meta_dfs, "purple")):
        # Own sink for suppressed output, so this cell does not depend on a
        # `buffer` variable left behind by an earlier cell.
        buffer = io.StringIO()

        with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
            results = {get_label(config): measure(df) for config, df in filter_by_dataset(level_dfs, dataset).items()}
            print(f"Entropy of {level}-level completions for {dataset}")

        fig, ax = plt.subplots()
        sns.barplot(x=list(results.keys()), y=list(results.values()), color=bar_color, ax=ax)

        # the title names the level that is actually plotted
        ax.set_title(f"Entropy of {level}-level completions for {dataset}")
        plt.xticks(rotation=90)
        plt.show()

## Compliance

In [None]:
def measure(df):
    """Share of rows in `df` whose 'compliance' value equals True."""
    return (df['compliance'] == True).mean()


# For every dataset draw two bar plots, one bar per model: first the
# object-level completions, then the meta-level completions.
for dataset in datasets:
    for level, level_dfs, bar_color in (("object", object_dfs, "green"), ("meta", meta_dfs, "purple")):
        # Own sink for suppressed output, so this cell does not depend on a
        # `buffer` variable left behind by an earlier cell.
        buffer = io.StringIO()

        with contextlib.redirect_stdout(buffer) if suppress_output else contextlib.nullcontext():
            results = {get_label(config): measure(df) for config, df in filter_by_dataset(level_dfs, dataset).items()}
            print(f"Compliance of {level}-level completions for {dataset}")

        fig, ax = plt.subplots()
        sns.barplot(x=list(results.keys()), y=list(results.values()), color=bar_color, ax=ax)

        # the title names the level that is actually plotted
        ax.set_title(f"Compliance of {level}-level completions for {dataset}")
        plt.xticks(rotation=90)
        # scale to percent; a formatter follows the ticks, whereas fixed tick
        # labels can end up on the wrong positions if the ticks are recomputed
        ax.yaxis.set_major_formatter(lambda value, _pos: '{:.0f}%'.format(value * 100))
        plt.show()

## For posterity
Save the notebook as HTML

In [None]:
# Deliberate stop: execution halts here so that the notebook can be saved by
# hand before the cell below is run (presumably so the HTML export reflects
# the saved, fully executed notebook).
raise Exception("Manually save the notebook before proceeding!")

In [None]:
# Export this notebook as HTML into every study folder.
PATH_THIS_NB = REPO_DIR / "analysis" / "object_vs_meta_comparisions.ipynb"
for study_folder in STUDY_FOLDERS:
    OUT_PATH = EXP_DIR / study_folder / "object_vs_meta_comparisions.html"
    # check=True: a failed conversion raises CalledProcessError instead of
    # passing unnoticed and leaving no (or a stale) export behind
    subprocess.run(
        ["jupyter", "nbconvert", "--to", "html", str(PATH_THIS_NB), "--output", str(OUT_PATH)],
        check=True,
    )