# Cross-prediction analysis
2024-09-06: this notebook reproduces the analysis of cross-prediction evidence over `paper_figures.ipynb`.

In [None]:
# Name of the study folder; experiment results are looked up under EXP_DIR / STUDY.
STUDY = "23_jul_fixed_tasks_medium_cross"

In [None]:
# Each entry is an (object_model, meta_model) tuple: the first model is the
# prediction target, the second is the model doing the predicting.
model_pairs = [
    (
        "ft:gpt-4-0613:dcevals-kokotajlo::A2F4MybP",
        "ft:gpt-4-0613:dcevals-kokotajlo::A2F4MybP",
    ),  # A_fton_A predicting A_fton_A
    (
        "ft:gpt-4-0613:dcevals-kokotajlo::A2F4MybP",
        "ft:gpt-4o-2024-05-13:dcevals-kokotajlo::A3ZXwt6P",
    ),  # B_fton_(A_fton_A) predicting A_fton_A
    # and vice versa
    (
        "ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU",
        "ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU",
    ),  # B_fton_B predicting B_fton_B
    (
        "ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU",
        "ft:gpt-4-0613:dcevals-kokotajlo::A2BJlcNF",
    ),  # A_fton_(B_fton_B) predicting B_fton_B
    # (  # DEBUG
    #     "ft:gpt-4-0613:dcevals-kokotajlo::A2BJlcNF",
    #     "ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU",
    # ),
    # (  # DEBUG
    #     "ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU",
    #     "ft:gpt-4-0613:dcevals-kokotajlo::A2BJlcNF",
    # ),
]

# Maps task name -> list of response properties to evaluate for that task.
# Only the "val set" entries are active; the "test set" entries are kept
# commented out (the loader in this notebook filters on task set "val").
tasks_and_response_properties = {  # 23_jul_fixed_tasks_medium_cross
    # test set
    # "writing_stories_pick_name": ["writing_stories/main_character_name"],
    # "wikipedia_long": [
    #     "first_character",
    #     "second_character",
    #     "third_character",
    #     "first_and_second_character",
    #     "first_word",
    #     "second_word",
    #     "starts_with_vowel",
    #     "third_word",
    # ],
    # "wealth_seeking": ["matches_wealth_seeking"],
    # "power_seeking": ["matches_power_seeking"],
    # "arc_challenge_non_cot": ["identity", "is_either_a_or_c", "is_either_b_or_d"],
    # "countries_long": [
    #     "first_character",
    #     "second_character",
    #     "third_character",
    #     "first_and_second_character",
    #     "first_word",
    #     "second_word",
    #     "starts_with_vowel",
    #     "third_word",
    # ],
    # "colors_long": [
    #     "first_character",
    #     "second_character",
    #     "third_character",
    #     "first_and_second_character",
    #     "first_word",
    #     "second_word",
    #     "starts_with_vowel",
    #     "third_word",
    # ],
    # "numbers": [
    #     "is_even_direct",
    #     # "is_even" # broken, but we only need is_even_direct
    # ],
    # val set
    "survival_instinct": ["matches_survival_instinct"],
    "myopic_reward": ["matches_myopic_reward"],
    "animals_long": [
        "first_character",
        "second_character",
        "third_character",
        "first_and_second_character",
        "first_word",
        "second_word",
        "starts_with_vowel",
        "third_word",
    ],
    "mmlu_non_cot": ["is_either_a_or_c", "is_either_b_or_d"],
    "english_words_long": [
        "first_character",
        "second_character",
        "third_character",
        "first_and_second_character",
        "first_word",
        "second_word",
        "starts_with_vowel",
        "third_word",
    ],
    "stories_sentences": [
        "first_character",
        "second_character",
        "third_character",
        "first_and_second_character",
        "first_word",
        "second_word",
        "starts_with_vowel",
        "third_word",
    ],
}

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import seaborn as sns
import tqdm
from p_tqdm import p_umap

In [None]:
from evals.analysis.analysis_helpers import (
    merge_object_and_meta_dfs,
    create_df_from_configs,
    fill_df_with_function,
    get_pretty_name,
    filter_configs_by_conditions,
    pretty_print_config,
    get_pretty_name_w_labels,
    merge_object_and_meta_dfs_and_run_property_extraction,
)
from evals.analysis.loading_data import (
    load_dfs_with_filter,
    load_base_df_from_config,
    get_hydra_config,
    load_single_df,
    load_single_df_from_exp_path,
    get_data_path,
    get_folders_matching_config_key,
)
from evals.load.lazy_object_level_llm_extraction import lazy_add_response_property_to_object_level
from evals.utils import get_maybe_nested_from_dict
from evals.analysis.analysis_functions import *
from evals.analysis.analysis_helpers import bootstrap_ci, compute_standard_error_ci, compute_binary_ci
from evals.locations import EXP_DIR

# Show which experiment directory this notebook reads from.
print(f"EXP_DIR: {EXP_DIR}")

Strategy:
- for each model pair, load each response property individually
    - based on the --tasks property
- compute exclusions & merge
- compute accuracy
- compute mode baseline
- save to a big table with model pair, property, and accuracy
- plot the table

In [None]:
def load_dfs_for_model_pair_and_property(object_model, meta_model, task, response_property): # slow—takes ~30s
    """Load the object-level and meta-level dataframes for one model pair, task and property.

    Experiment folders are searched under ``EXP_DIR/STUDY``. Both lookups are
    restricted to task set ``"val"`` and the given ``task``. The object-level
    lookup uses ``object_model`` with response property ``None``; the meta-level
    lookup uses ``meta_model`` with ``response_property``.

    Args:
        object_model (str): model whose object-level responses are loaded.
        meta_model (str): model whose meta-level predictions are loaded.
        task (str): task name.
        response_property (str): response property of the meta-level run.

    Returns:
        tuple: ``(object_df, meta_df)``, both loaded with ``exclude_noncompliant=False``.

    Raises:
        AssertionError: if either lookup does not match exactly one experiment folder.
    """
    study_dir = EXP_DIR/STUDY

    # Object level
    object_conditions = {
        ('language_model', 'model'): [object_model],
        ("task", "set"): ["val"],
        ("task", "name"): [task],
        ("response_property", "name"): [None],
    }
    object_paths = get_folders_matching_config_key(study_dir, object_conditions)
    assert len(object_paths) == 1, f"Expected 1 experiment path for object level, got {len(object_paths)}"
    object_df = load_single_df_from_exp_path(object_paths[0], exclude_noncompliant=False)

    # Meta level
    meta_conditions = {
        ('language_model', 'model'): [meta_model],
        ("task", "set"): ["val"],
        ("task", "name"): [task],
        ("response_property", "name"): [response_property],
    }
    meta_paths = get_folders_matching_config_key(study_dir, meta_conditions)
    assert len(meta_paths) == 1, f"Expected 1 experiment path for meta level, got {len(meta_paths)}"
    meta_df = load_single_df_from_exp_path(meta_paths[0], exclude_noncompliant=False)

    return object_df, meta_df

In [None]:
def merge_dfs(object_df, meta_df):
    """Inner-join object-level and meta-level rows on their ``string`` column.

    Every column of ``object_df`` is prefixed with ``obj_`` and every column of
    ``meta_df`` with ``meta_``, so the join keys end up as ``obj_string`` and
    ``meta_string`` in the result. The inputs themselves are not modified.

    Raises:
        AssertionError: if ``string`` is not unique in either input, or if the
            join produces no rows.
    """
    # Each prompt string may appear only once per side, otherwise the join would fan out.
    n_unique_object = object_df['string'].nunique()
    assert n_unique_object == len(object_df), "Strings in object_df are not unique"
    n_unique_meta = meta_df['string'].nunique()
    assert n_unique_meta == len(meta_df), "Strings in meta_df are not unique"

    merged_df = object_df.add_prefix('obj_').merge(
        meta_df.add_prefix('meta_'),
        left_on='obj_string',
        right_on='meta_string',
        how='inner',
    )

    # Guard against the two sides sharing no strings at all.
    assert len(merged_df) > 0, "No rows left after merging"
    return merged_df

In [None]:
def get_accuracy_and_baseline_for_model_pair_and_property(object_model, meta_model, task, response_property):
    """
    Load one (model pair, task, response property) combination and score it.

    The prediction accuracy is computed on the merged object/meta rows; the mode
    baseline is computed on the full object-level dataframe (before merging).

    Args:
    object_model (str): The name of the object-level model (prediction target).
    meta_model (str): The name of the meta-level model (predictor).
    task (str): The name of the task.
    response_property (str): The response property to analyze.

    Returns:
    tuple: ``(accuracy, sem, mode_acc, mode_sem)``
        - accuracy: accuracy of the meta model's predictions.
        - sem: standard error of the mean of that accuracy.
        - mode_acc: accuracy of always predicting the most frequent value.
        - mode_sem: standard error of the mean of the mode baseline.
    """
    object_df, meta_df = load_dfs_for_model_pair_and_property(object_model, meta_model, task, response_property)

    paired_df = merge_dfs(object_df, meta_df)
    prediction_acc, prediction_sem = compute_accuracy(paired_df, response_property)

    baseline_acc, baseline_sem = mode_baseline_accuracy(object_df, response_property)

    return prediction_acc, prediction_sem, baseline_acc, baseline_sem

In [None]:
def compute_accuracy(merged_df, response_property):
    """Accuracy of the meta-level predictions for one response property.

    Row-level correctness comes from ``get_correctnesses``, which applies the
    exclusion rules:
    - rows with a non-compliant object-level response are dropped
    - rows with a non-compliant meta-level response count as incorrect

    Returns:
        tuple: ``(accuracy, sem)`` where ``sem`` is the standard error of the mean
        of the per-row correctness values.
    """
    per_row_correct = get_correctnesses(merged_df, response_property)
    return per_row_correct.mean(), stats.sem(per_row_correct)

In [None]:
def get_correctnesses(merged_df, response_property):
    """Return, per row, whether the meta-level prediction matched the object-level value.

    Both ``obj_<response_property>`` and ``meta_<response_property>`` are cast to
    lower-case strings before comparison.

    Exclusion rules:
    - rows whose ``obj_compliance`` is not ``True`` are dropped from the result
    - rows whose ``meta_compliance`` is not ``True`` are kept but counted as incorrect

    ``merged_df`` is not modified: no columns are added or overwritten.

    Args:
        merged_df (pd.DataFrame): output of ``merge_dfs``; must contain the two
            response-property columns plus ``obj_compliance`` and ``meta_compliance``.
        response_property (str): un-prefixed name of the response property.

    Returns:
        pd.Series: boolean Series named ``correct``, indexed like the kept rows.

    Raises:
        AssertionError: if either prefixed response-property column is missing.
    """
    obj_col = 'obj_' + response_property
    meta_col = 'meta_' + response_property
    assert meta_col in merged_df.columns, f"Response property {response_property} not found in meta_df"
    assert obj_col in merged_df.columns, f"Response property {response_property} not found in object_df"

    # Compare as lower-case strings so that type and case differences do not matter.
    # NOTE(review): astype(str) also turns missing values into a literal string, so
    # two missing values compare equal — confirm this is intended.
    obj_values = merged_df[obj_col].astype(str).str.lower()
    meta_values = merged_df[meta_col].astype(str).str.lower()

    # `!= True` is deliberate: anything that is not exactly True counts as non-compliant.
    obj_noncompliant = merged_df['obj_compliance'] != True
    meta_noncompliant = merged_df['meta_compliance'] != True

    # A non-compliant meta response can never be a correct prediction.
    correct = (obj_values == meta_values) & ~meta_noncompliant

    # Non-compliant object responses have no ground truth to predict: drop them.
    return correct[~obj_noncompliant].rename('correct')

In [None]:
def mode_baseline_accuracy(object_df, response_property):
    """How well would you do if you always predicted the mode of the distribution?

    Only rows whose ``compliance`` equals ``True`` are considered. The mode is the
    first value returned by ``Series.mode()`` on the ``response_property`` column.
    ``object_df`` is not modified.

    Args:
        object_df (pd.DataFrame): object-level dataframe with a ``compliance``
            column and a ``response_property`` column.
        response_property (str): column to compute the baseline for.

    Returns:
        tuple: ``(accuracy, sem)`` of the mode baseline. Both are NaN when there
        is no compliant, non-missing value to take a mode of.
    """
    # exclude non-compliant responses
    compliant = object_df[object_df['compliance'] == True]
    values = compliant[response_property]

    modes = values.mode()
    if modes.empty:
        # Nothing to predict from: report NaN instead of failing on modes[0].
        nan = float("nan")
        return nan, nan

    # Compute correctness as a standalone Series rather than assigning into the
    # filtered frame, which would write into a slice of object_df.
    correct = values == modes.iloc[0]
    return correct.mean(), stats.sem(correct)

In [None]:
def calculate_accuracies_across_models_pairs(model_pairs, tasks_and_response_properties):
    """
    Score every (model pair, task, response property) combination in parallel.

    Args:
    model_pairs (list of tuples): ``(object_model, meta_model)`` tuples. Use the
        language_model.name field, not the name of the config!
    tasks_and_response_properties (dict): task name -> list of response properties,
        following the structure of the sweep script, eg:
        {"wealth_seeking": ["matches_wealth_seeking"], "numbers": ["is_even_direct", "is_even"]}

    Returns:
        Multi-index dataframe with index levels:
            1. object_model (prediction target)
            2. meta_model (predictor)
            3. task
            4. response_property
        and columns:
            1. accuracy
            2. sem (standard error of the mean)
            3. mode_baseline_accuracy
            4. mode_baseline_sem
        Combinations that raised while loading or scoring hold NaN in every column.
    """
    # One job per combination; the same list also defines the rows of the result table.
    args_list = [
        (pair[0], pair[1], task, prop)
        for pair in model_pairs
        for task, props in tasks_and_response_properties.items()
        for prop in props
    ]

    df = pd.DataFrame(
        index=pd.MultiIndex.from_tuples(
            args_list,
            names=["object_model", "meta_model", "task", "response_property"],
        ),
        columns=["accuracy", "sem", "mode_baseline_accuracy", "mode_baseline_sem"],
    )

    def process_model_pair(args):
        object_model, meta_model, task, prop = args
        key = (object_model, meta_model, task, prop)
        try:
            accuracy, sem, mode_acc, mode_sem = get_accuracy_and_baseline_for_model_pair_and_property(
                object_model, meta_model, task, prop
            )
        except Exception as e:
            # Best effort: report the failure and leave this row as NaN.
            print(f"Error for {object_model}, {meta_model}, {task}, {prop}: {e}")
            return key, (np.nan, np.nan, np.nan, np.nan)
        return key, (accuracy, sem, mode_acc, mode_sem)

    # Results arrive unordered from p_umap; each one carries its own row key.
    results = p_umap(process_model_pair, args_list)
    # non parallel for debugging
    # results = [process_model_pair(args) for args in args_list]

    for idx, metrics in results:
        acc, sem, mode_acc, mode_sem = metrics
        df.loc[idx] = [acc, sem, mode_acc, mode_sem]

    return df

In [None]:
# Per-(model pair, task, response property) accuracies and mode baselines.
result_df = calculate_accuracies_across_models_pairs(
    model_pairs,
    tasks_and_response_properties,
)
result_df

In [None]:
from p_tqdm import p_map

def calculate_overall_accuracies(model_pairs, tasks_and_response_properties):
    """
    Calculate overall accuracies for a list of model pairs across all tasks and response properties combined.

    For each model pair, the per-row correctness values of every (task, response
    property) combination are pooled into one list, and the accuracy and SEM are
    computed over that pooled list. Combinations that raise while loading, merging
    or scoring are reported and skipped.

    Args:
    model_pairs (list of tuples): Each tuple is ``(object_model, meta_model)``.
    tasks_and_response_properties (dict): Dictionary of tasks and their response properties.

    Returns:
    pd.DataFrame: indexed by ``(object_model, meta_model)`` and sorted by index, with
        columns ``accuracy``, ``sem`` and ``total_samples``. ``accuracy`` and ``sem``
        are NaN for a pair for which no combination could be loaded.
    """
    def process_model_pair(model_pair):
        object_model, meta_model = model_pair
        correctnesses = []

        for task, props in tasks_and_response_properties.items():
            for prop in props:
                try:
                    object_df, meta_df = load_dfs_for_model_pair_and_property(object_model, meta_model, task, prop)
                    merged_df = merge_dfs(object_df, meta_df)
                    correctnesses.extend(get_correctnesses(merged_df, prop))
                except Exception as e:
                    # Best effort: a failing combination must not abort the whole pair.
                    print(f"Error for {object_model}, {meta_model}, {task}, {prop}: {e}")

        # With nothing loaded, report NaN explicitly instead of averaging an empty list.
        has_samples = len(correctnesses) > 0
        return {
            'object_model': object_model,
            'meta_model': meta_model,
            'accuracy': np.mean(correctnesses) if has_samples else np.nan,
            'sem': stats.sem(correctnesses) if has_samples else np.nan,
            'total_samples': len(correctnesses)
        }

    results = p_map(process_model_pair, model_pairs)
    # Convert the results to a DataFrame with a MultiIndex
    all_results_df = (
        pd.DataFrame(results)
        .set_index(['object_model', 'meta_model'])
        .sort_index()
    )

    return all_results_df
    
# Calculate overall accuracies: one row per model pair, pooled over all tasks and response properties
overall_result_df = calculate_overall_accuracies(model_pairs, tasks_and_response_properties)
overall_result_df


In [None]:
def how_much_do_we_loose(model_pairs, tasks_and_response_properties):
    """Report how many rows survive merging for every combination.

    For each (object model, meta model, task, response property) combination this
    loads both dataframes, merges them with ``merge_dfs`` and records the row counts.

    Returns:
        Multi-index dataframe (object_model, meta_model, task, response_property)
        with columns ``object_len``, ``meta_len`` and ``merged_len``. If loading
        fails all three are NaN; if only merging fails, ``merged_len`` is NaN.
    """
    # One job per combination; the same list also defines the rows of the result table.
    args_list = [
        (pair[0], pair[1], task, prop)
        for pair in model_pairs
        for task, props in tasks_and_response_properties.items()
        for prop in props
    ]

    df = pd.DataFrame(
        index=pd.MultiIndex.from_tuples(
            args_list,
            names=["object_model", "meta_model", "task", "response_property"],
        ),
        columns=["object_len", "meta_len", "merged_len"],
    )

    def process_model_pair(args):
        object_model, meta_model, task, prop = args
        key = (object_model, meta_model, task, prop)

        # Stage 1: loading. Without data there is nothing to count.
        try:
            object_df, meta_df = load_dfs_for_model_pair_and_property(object_model, meta_model, task, prop)
        except Exception as e:
            print(f"Error for {object_model}, {meta_model}, {task}, {prop}: {e}")
            return key, (np.nan, np.nan, np.nan)

        # Stage 2: merging. The input sizes are still reported if this fails.
        try:
            len_merged = len(merge_dfs(object_df, meta_df))
        except Exception as e:
            print(f"Error for {object_model}, {meta_model}, {task}, {prop}: {e}")
            len_merged = np.nan

        return key, (len(object_df), len(meta_df), len_merged)

    # Results arrive unordered from p_umap; each one carries its own row key.
    results = p_umap(process_model_pair, args_list)
    # non parallel for debugging
    # results = [process_model_pair(args) for args in args_list]

    for idx, lengths in results:
        object_len, meta_len, merged_len = lengths
        df.loc[idx] = [object_len, meta_len, merged_len]

    return df

In [None]:
# Row counts of the object, meta and merged dataframes for every combination.
loose_result_df = how_much_do_we_loose(
    model_pairs,
    tasks_and_response_properties,
)
loose_result_df

## Plotting

In [None]:
# Show the per-property results table again before plotting.
result_df

In [None]:
# Extract the model combinations as "object_model -> meta_model" strings.
# This adds a 'model_pair' column to result_df in place; the unique strings are
# the keys expected by MODEL_PAIR_NAMES.
result_df['model_pair'] = result_df.index.get_level_values('object_model') + ' -> ' + result_df.index.get_level_values('meta_model')
list(result_df['model_pair'].unique())

In [None]:
# Display label for each "object_model -> meta_model" string.
# The key lists the prediction target first; the label names the predictor first.
# The insertion order of this dict is the order in which the plots draw the model pairs.
MODEL_PAIR_NAMES ={
    'ft:gpt-4-0613:dcevals-kokotajlo::A2F4MybP -> ft:gpt-4-0613:dcevals-kokotajlo::A2F4MybP': "4 self-predicting 4",
    'ft:gpt-4-0613:dcevals-kokotajlo::A2F4MybP -> ft:gpt-4o-2024-05-13:dcevals-kokotajlo::A3ZXwt6P': "4o cross-predicting 4",
    'ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU -> ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU': "4o self-predicting 4o",
    'ft:gpt-4o-2024-05-13:dcevals-kokotajlo::9oUVKrCU -> ft:gpt-4-0613:dcevals-kokotajlo::A2BJlcNF': "4 cross-predicting 4o",
}

In [None]:
# Bar colour for each display label defined in MODEL_PAIR_NAMES.
MODEL_PAIR_COLORS = {
    '4 self-predicting 4': 'red',
    '4o cross-predicting 4': 'blue',
    '4o self-predicting 4o': 'lightcoral',
    '4 cross-predicting 4o': 'lightblue',
}

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_model_performance(df, overall_df):
    """Grouped bar chart of accuracy per task/response property, plus an overall group.

    One group of bars is drawn per "task: response_property", with one bar per model
    pair (in the order of ``MODEL_PAIR_NAMES``), SEM error bars, and a black star
    marking the mode baseline. A final group shows the pooled accuracy from
    ``overall_df``.

    NOTE: a later cell of this notebook defines another function with the same name
    (taking only ``overall_df``); after that cell has run, this two-argument version
    is shadowed.

    Args:
        df (pd.DataFrame): indexed by (object_model, meta_model, task,
            response_property) with columns ``accuracy``, ``sem`` and
            ``mode_baseline_accuracy``.
        overall_df (pd.DataFrame): indexed by (object_model, meta_model) with
            columns ``accuracy`` and ``sem``.

    Raises:
        KeyError: if a model pair in ``df`` has no entry in ``MODEL_PAIR_NAMES``.
    """
    # Reset index to make model pairs, tasks, and response properties columns
    df_reset = df.reset_index()

    # Create a new column for model pairs and apply MODEL_PAIR_NAMES
    df_reset['model_pair'] = df_reset.apply(lambda row: MODEL_PAIR_NAMES[f"{row['object_model']} -> {row['meta_model']}"], axis=1)

    # Create a new column for task and response property
    df_reset['task_property'] = df_reset['task'] + ': ' + df_reset['response_property']

    # Order rows by model pair (MODEL_PAIR_NAMES order). The sort is stable, so
    # within each model pair the task/property order of the input is kept.
    model_pair_order = list(MODEL_PAIR_NAMES.values())
    df_reset['model_pair'] = pd.Categorical(df_reset['model_pair'], categories=model_pair_order, ordered=True)
    df_sorted = df_reset.sort_values('model_pair', kind='stable')

    # Set up the plot
    fig, ax = plt.subplots(figsize=(32, 10))

    # Get unique task_properties and model_pairs
    task_properties = df_sorted['task_property'].unique()
    model_pairs = model_pair_order
    n_pairs = len(model_pairs)

    # Each group spans 0.8 x-units however many model pairs there are
    # (0.2 per bar for the usual four pairs).
    bar_width = 0.8 / max(n_pairs, 1)
    r = np.arange(len(task_properties) + 1)  # +1 for overall results

    # Plot bars for each model pair
    for i, model_pair in enumerate(model_pairs):
        data = df_sorted[df_sorted['model_pair'] == model_pair]
        if data.empty:
            # This pair is named in MODEL_PAIR_NAMES but has no rows in df.
            continue

        x_positions = r[:-1] + i*bar_width
        accuracies = data['accuracy'].values
        errors = data['sem'].values
        baselines = data['mode_baseline_accuracy'].values

        # Plot model performance bars
        ax.bar(x_positions, accuracies, width=bar_width, color=MODEL_PAIR_COLORS[model_pair],
               yerr=errors, capsize=5, label=model_pair, align='center')

        # Plot corresponding baselines as stars
        ax.scatter(x_positions, baselines, marker='*', color='black', s=100, zorder=3)

        # Add overall results
        overall_key = (data['object_model'].iloc[0], data['meta_model'].iloc[0])
        overall_acc = overall_df.loc[overall_key, 'accuracy']
        overall_sem = overall_df.loc[overall_key, 'sem']
        ax.bar(r[-1] + i*bar_width, overall_acc, width=bar_width, color=MODEL_PAIR_COLORS[model_pair],
               yerr=overall_sem, capsize=5, align='center')

    # Customize the plot
    ax.set_title('Self/cross-prediction accuracy across tasks and response properties', fontsize=16)
    ax.set_xlabel('Task: Response Property', fontsize=12)
    ax.set_ylabel('Accuracy', fontsize=12)
    # Centre each tick under its group of bars.
    ax.set_xticks(r + (n_pairs - 1) / 2 * bar_width)
    task_property_labels = list(task_properties) + ['Average across tasks and response properties']
    ax.set_xticklabels(task_property_labels, rotation=45, ha='right')

    # The labelled (empty) baseline artist must exist before legend() is called,
    # otherwise the 'Baseline' entry is missing from the legend.
    ax.scatter([], [], marker='*', color='black', s=100, label='Baseline')
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

# Plot per-property accuracies next to the pooled accuracies (uses the two-argument
# plot_model_performance defined in this cell).
plot_model_performance(result_df, overall_result_df)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_model_performance(overall_df):
    """Bar chart of the overall accuracy of each model pair, with SEM error bars.

    NOTE: this reuses the name of the two-argument plotting function defined in an
    earlier cell and replaces it once this cell has run.

    Args:
        overall_df (pd.DataFrame): indexed by (object_model, meta_model) with
            columns ``accuracy`` and ``sem``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Display labels in MODEL_PAIR_NAMES order, and for each label the first
    # "object -> meta" key that maps to it.
    labels = list(MODEL_PAIR_NAMES.values())
    key_for_label = {}
    for pair_key, label in MODEL_PAIR_NAMES.items():
        key_for_label.setdefault(label, pair_key)

    bar_width = 0.6
    positions = np.arange(len(labels))

    # One bar per model pair
    for position, label in zip(positions, labels):
        object_model, meta_model = key_for_label[label].split(' -> ')
        pair_index = (object_model, meta_model)
        ax.bar(
            position,
            overall_df.loc[pair_index, 'accuracy'],
            width=bar_width,
            color=MODEL_PAIR_COLORS[label],
            yerr=overall_df.loc[pair_index, 'sem'],
            capsize=5,
            label=label,
            align='center',
        )

    # Titles, axes and legend
    ax.set_title('Overall Self/cross-prediction accuracy', fontsize=16)
    ax.set_xlabel('Model Pair', fontsize=12)
    ax.set_ylabel('Accuracy', fontsize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

# Plot only the pooled accuracies (uses the one-argument plot_model_performance
# defined in this cell).
plot_model_performance(overall_result_df)
