# Computing the ceiling performance for a model on the sweep

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from p_tqdm import p_tqdm
import json

In [None]:
from evals.locations import REPO_DIR, EXP_DIR
from evals.utils import run_command
from evals.analysis.loading_data import get_hydra_config
from evals.analysis.loading_data import load_single_df_from_exp_path

## Which models, and which tasks?
Using the format from `scripts/sweep_full_study.py`.

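`TASKS` is a string containing a Python dict literal that maps each task name to its list of response properties; it is parsed into a real dict further down.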

In [None]:
# Name of the study. Used below as a sub-directory of EXP_DIR (divergent strings,
# results) and as part of the study_name passed to the object-level runs.
STUDY_NAME = "may20_thrifty_sweep"
# Models to evaluate. Each entry is substituted verbatim into `language_model=`
# on the command line of the object-level run.
MODELS = [
    "claude-3-sonnet",
    "gpt-3.5-turbo",
    "gpt-4",
    "gemini-1.0-pro-002",
    "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9R9Lqsm2", #ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9R9Lqsm2",
    "finetuned/may20_thrifty_sweep/claude-3-sonnet/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9R9L0Ddt",
    "finetuned/may20_thrifty_sweep/gpt-4/ft_gpt-4-0613_dcevals-kokotajlo_sweep_9RSQ9BDP",
    "finetuned/may20_thrifty_sweep/claude-3-sonnet/ft_gpt-4-0613_dcevals-kokotajlo_sweep_9RSQHCmp",
    "finetuned/may20_thrifty_sweep/gpt-4/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9RSPteWA",
    "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-4-0613_dcevals-kokotajlo_sweep_9RSPjTJF",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_lr2_9RW1QKsf",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo-0125/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9Th6cCBF",
    # "finetuned/may20_thrifty_sweep/gpt-4-turbo/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9ThUFr7R",
    # "finetuned/may20_thrifty_sweep/gpt-4-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9ThBY0oK",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo-0125/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9Th7D4TK",
    "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9ThVmSp2",
    "finetuned/may20_thrifty_sweep/claude-3-sonnet/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9Th9i5Mf",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_scramble_9TfFZ0nD",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9XB7rmTP",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:baseline:9YnjQGD9",
    "gpt-3.5-turbo-1106",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Z9pc6zQ",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Z5lKJnt",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9Th6cCBF",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9ZA0H8nG",
    "ft:gpt-4-0613:dcevals-kokotajlo:4onft35:9a9f4Ufy",
    "projects/351298396653/locations/us-central1/endpoints/1715297512958459904",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Z5lYymx",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:scramble:9TfFZ0nD",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9XAu5Qg5",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Z5gE1Cw",
    "ft:gpt-4-0613:dcevals-kokotajlo:sweep:9XAtoNkl",
    "projects/351298396653/locations/us-central1/endpoints/8174022328561172480",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Z5lNnpS",
    "projects/351298396653/locations/us-central1/endpoints/8583876282930954240",
    "ft:gpt-4-0613:dcevals-kokotajlo:baliemay20:9WBwUkGa",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:baliemay20:9WAurjLN",
    # "gpt-4-0125-preview",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9ZA5QL2c",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9ThBY0oK",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Th7D4TK",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9ThUFr7R",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:baliemay20:9WBLv2YM",
    "ft:gpt-3.5-turbo-0125:dcevals-kokotajlo:sweep:9Z9kh7Vt",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:lr2:9RW1QKsf",
    "projects/351298396653/locations/us-central1/endpoints/1531239266468757504",    
]
# Maps each task name to the response properties that are extracted for it.
# Kept as a string here; a later cell parses it into a dict before it is indexed.
TASKS = '{"number_triplets": ["identity", "is_even", "last_character", "first_character"], "wikipedia": ["identity", "syllable_count", "first_character", "last_character"], "writing_stories": ["identity", "first_word", "writing_stories/main_character_name"], "personal_preferences": ["identity", "syllable_count", "first_character", "last_character"], }'

In [None]:
# other hyperparameters
N_PER_TASK = 5  # passed to the object-level run as `task.num`
SEED = 42  # NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed
# SAMPLES_PER_INPUT = 100
SAMPLES_PER_INPUT = 25  # passed to the object-level run as `n_samples`

In [None]:
# Parse the dict literal into a real dict.
# `ast.literal_eval` only accepts Python literals, so unlike `eval` it cannot
# execute arbitrary code. The isinstance guard makes this cell safe to re-run
# after TASKS has already been converted to a dict.
import ast

if isinstance(TASKS, str):
    TASKS = ast.literal_eval(TASKS)

## Run the ceiling calculation

In [None]:
# Launch one object-level run per (model, task) pair. When a divergent-strings
# CSV exists for the task it is passed along via `strings_path`; if the run
# fails, it is retried once with the model given as `language_model.model=`.
for model in tqdm(MODELS):
    for task in TASKS:
        # can we get the model divergent strings?
        model_divergent_string_path = EXP_DIR / STUDY_NAME / f"divergent_strings_{task}.csv"
        # Part of the command that is shared by both variants (note the trailing space).
        command = f"cd {REPO_DIR} && python3 {REPO_DIR}/evals/run_object_level.py study_name={'nondeterminism_ceiling/'+STUDY_NAME} task={task} language_model={model} task.set=val n_samples={SAMPLES_PER_INPUT} task.num={N_PER_TASK} "
        if not os.path.exists(model_divergent_string_path):
            print(f"🔍⚠️ Could not find divergent strings for {model} on {task}—Running without")
        else:
            print(f"🔍 Found divergent strings for {model} on {task}")
            command += f"strings_path={model_divergent_string_path} "
        print(f"🏃‍➡️ Running {model} on {task}: {command}")
        try:
            run_command(command)
        except Exception as err:
            print(f"🚨 Error running {model} on {task}: {err}")
            print("Trying the model as .model")
            # Second attempt: address the model through the `.model` field instead.
            command = command.replace("language_model=", "language_model.model=")
            print(f"🏃‍➡️ Running {model} on {task}: {command}")
            run_command(command)

## Extract the response properties

In [None]:
# Every immediate sub-directory of the study's results folder is treated as one run.
results_folder = EXP_DIR / "nondeterminism_ceiling" / STUDY_NAME
run_dir_names = next(os.walk(results_folder))[1]
subfolders = [results_folder / run_dir_name for run_dir_name in run_dir_names]
print(f"Got {len(subfolders)} subfolders")

In [None]:
def extract_response_properties_from_folder(folder):
    """Run property extraction for every response property of one run folder.

    The folder's hydra config determines the task, and ``TASKS[task]`` lists
    the response properties to extract. A folder whose config cannot be loaded
    (``get_hydra_config`` raises ``ValueError``) is skipped. A failing
    extraction command is reported and does not stop the remaining properties.

    Args:
        folder: Path of a single run folder.

    Returns:
        None.
    """
    # load config
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        print(f"Skipping {folder}")
        return
    task = cfg.task.name
    # Take the model from this folder's own config. The log line below used to
    # read the global `model` left over from the run loop, which has nothing to
    # do with the folder being processed (and may be undefined in a worker).
    model = cfg.language_model.model
    response_properties = TASKS[task]
    for response_property in response_properties:
        command = f"cd {REPO_DIR} && python3 {REPO_DIR}/evals/run_property_extraction.py dir={folder} response_property={response_property}"
        print(f"🛸 Extracting {response_property} on {model} on {task}: {command}")
        try:
            run_command(command)
        except Exception as e:
            # Best effort: report and continue with the next property.
            print(f"Error: {e}\nwhile running {command}")

In [None]:
# Apply the extraction to every run folder via p_tqdm's p_umap (parallel,
# unordered map with a progress bar); the function returns nothing useful.
p_tqdm.p_umap(extract_response_properties_from_folder, subfolders)

## Compute the Ceiling

In [None]:
# Number of random repetitions used by the functions below: permutations per
# string, random draws per pair of response sets, and bootstrap resamples for
# the 95% confidence interval.
BOOTSTRAP_N = 100

In [None]:
def compute_pairwise_match(df_subset, response_property='identity'):
    """Return the agreement rate between the responses and a shuffle of themselves.

    The values of ``response_property`` are compared position by position with
    one random permutation of the same values; the fraction of equal positions
    is returned.

    Args:
        df_subset: Rows that all share a single value in the ``string`` column.
        response_property: Name of the column holding the responses to compare.

    Returns:
        The fraction of positions at which original and permuted responses agree.

    Raises:
        AssertionError: If ``df_subset`` does not contain exactly one distinct string.
    """
    # (A check on the expected number of samples per string is intentionally not enforced.)
    distinct_strings = df_subset['string'].nunique()
    assert distinct_strings == 1, "Expected all samples to be from the same string"
    observed = df_subset[response_property].values
    permuted = np.random.permutation(observed)
    agreement = observed == permuted
    return np.mean(agreement)

To compare two different sets of responses (e.g. from two different models), we'd have to make them be the same distribution, but with different levels of noise. Seems harder.

The way to do this would be:
- for both sets of responses
    - find the most common response, rename 'A'
    - find the second most common response, rename 'B'
    - ...
- see how often two arbitrary renamed responses, one drawn from each set, match

In [None]:
def compute_pairwise_match_across_sets(df_subsetA, df_subsetB, response_property='identity'):
    """Estimate how often frequency-ranked responses from two sets coincide.

    Within each set, every response is replaced by its frequency rank (the
    most common response becomes 0, the next 1, and so on). Then
    ``BOOTSTRAP_N`` times one relabelled response is drawn at random from each
    set, and the fraction of draws with equal labels is returned.

    Args:
        df_subsetA: Rows of the first set; all must share one ``string`` value.
        df_subsetB: Rows of the second set; all must share one ``string`` value.
        response_property: Name of the column holding the responses.

    Returns:
        The fraction of random draws in which both labels are equal.

    Raises:
        AssertionError: If either subset does not contain exactly one distinct string.
    """
    assert df_subsetA['string'].nunique() == 1, "Expected all samples to be from the same string"
    assert df_subsetB['string'].nunique() == 1, "Expected all samples to be from the same string"

    def rank_labels(responses):
        # Tally every distinct response ...
        tally = {}
        for response in responses:
            tally[response] = tally.get(response, 0) + 1
        # ... then emit one dummy label per sample: 0 for the most common
        # response, 1 for the second most common, etc.
        labels = []
        for rank, count in enumerate(sorted(tally.values(), reverse=True)):
            labels += [rank] * count
        return labels

    aligned_responsesA = rank_labels(df_subsetA[response_property].values)
    aligned_responsesB = rank_labels(df_subsetB[response_property].values)

    matches = []
    for _ in range(BOOTSTRAP_N):
        # Draw from A first, then from B, once per repetition.
        drawA = np.random.choice(aligned_responsesA)
        drawB = np.random.choice(aligned_responsesB)
        matches.append(drawA == drawB)
    return np.mean(matches)

In [None]:
def bootstrap_95_CI(samples):
    """Return a bootstrapped 95% interval for the mean of ``samples``.

    ``BOOTSTRAP_N`` resamples of the same size as ``samples`` are drawn with
    replacement; the 2.5th and 97.5th percentiles of their means are returned.

    Args:
        samples: One-dimensional sequence of numeric values.

    Returns:
        Array ``[lower, upper]`` with the two percentile values.
    """
    n_samples = len(samples)
    resampled_means = [
        np.mean(np.random.choice(samples, n_samples, replace=True))
        for _ in range(BOOTSTRAP_N)
    ]
    return np.percentile(resampled_means, [2.5, 97.5])

In [None]:
def compute_ceiling(folder, response_property):
    """Compute the self-agreement ceiling of one run for one response property.

    For every distinct input string, ``compute_pairwise_match`` is evaluated
    ``BOOTSTRAP_N`` times on that string's rows.

    Args:
        folder: Run folder to load the results dataframe from.
        response_property: Name of the column holding the responses.

    Returns:
        Tuple ``(mean, ci)``: the mean of the per-string mean match rates, and
        the ``bootstrap_95_CI`` of all individual match rates pooled over strings.
    """
    # load df
    df = load_single_df_from_exp_path(folder, exclude_noncompliant=False) # TODO Should this be true? That might increase the ceiling.
    per_string_samples = []
    per_string_means = []

    for string in df.string.unique():
        rows_for_string = df[df.string == string]
        match_rates = []
        for _ in range(BOOTSTRAP_N):
            match_rates.append(compute_pairwise_match(rows_for_string, response_property))
        per_string_samples.append(match_rates)
        per_string_means.append(np.mean(match_rates))

    pooled_samples = np.concatenate(per_string_samples)
    # return mean and 95%CI of mean
    return np.mean(per_string_means), bootstrap_95_CI(pooled_samples)

In [None]:
# Ceiling per (model, task, response_property); failures are recorded as NaN.
ceiling_results = {}

for folder in tqdm(subfolders):
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        print(f"Skipping {folder}")
        continue
    task, model = cfg.task.name, cfg.language_model.model
    for response_property in TASKS[task]:
        try:
            mean, ci = compute_ceiling(folder, response_property)
        except Exception as err:
            print(f"Error: {err}\nwhile computing ceiling for {folder} on {response_property}")
            mean, ci = np.nan, (np.nan, np.nan)
        ceiling_results[(model, task, response_property)] = (mean, ci)

# One row per key tuple, with the two stored values as columns.
ceiling_results_df = pd.DataFrame(ceiling_results).T
ceiling_results_df.columns = ['mean', 'ci']


In [None]:
# Show the ceiling table (one row per model / task / response property).
ceiling_results_df

In [None]:
# aggregated mean by model
# Index level 0 is the model, so this averages the ceiling over all
# (task, response property) rows of each model.
display(ceiling_results_df['mean'].groupby(level=0).mean())

We also want to calculate how well each model predicts every other model.

We'd have to make them be the same distribution, but with different levels of noise. Seems harder. 

The way to do this would be:
- for both sets of responses
    - find the most common response, rename 'A'
    - find the second most common response, rename 'B'
    - ...
- see how often two arbitrary renamed responses, one drawn from each set, match

In [None]:
# load in folders in the structure {model}/{task}
# Index the run folders as {model: {task: folder}}.
# Despite the name, the values are folder paths, not dataframes.
dfs_per_model_task = {}

for folder in tqdm(subfolders):
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        print(f"Skipping {folder}")
        continue
    task = cfg.task.name
    model = cfg.language_model.model
    # Create the per-model dict on first sight of the model, then record the folder.
    dfs_per_model_task.setdefault(model, {})[task] = folder

In [None]:
def compute_ceiling_for_model_pair(folderA, folderB, response_property):
    """Compute the cross-agreement between two runs for one response property.

    For every input string that has rows in both runs,
    ``compute_pairwise_match_across_sets`` is evaluated on the two row subsets.
    Strings that appear in only one of the runs are ignored.

    Args:
        folderA: Run folder of the first model.
        folderB: Run folder of the second model.
        response_property: Name of the column holding the responses.

    Returns:
        Tuple ``(mean, ci)``: the mean of the per-string match rates and their
        ``bootstrap_95_CI``.
    """
    # load df
    dfA = load_single_df_from_exp_path(folderA, exclude_noncompliant=False) # TODO Should this be true? That might increase the ceiling.
    dfB = load_single_df_from_exp_path(folderB, exclude_noncompliant=False) # TODO Should this be true? That might increase the ceiling.

    candidate_strings = set(list(dfA.string.unique()) + list(dfB.string.unique()))
    per_string_match = []
    for current_string in candidate_strings:
        rowsA = dfA[dfA.string == current_string]
        rowsB = dfB[dfB.string == current_string]
        # Only strings present in both runs can be compared.
        if len(rowsA) != 0 and len(rowsB) != 0:
            per_string_match.append(
                compute_pairwise_match_across_sets(rowsA, rowsB, response_property)
            )

    # return mean and 95%CI of mean
    return np.mean(per_string_match), bootstrap_95_CI(per_string_match)

In [None]:
# ceiling_pair_results = {}

# for modelA in tqdm(dfs_per_model_task.keys()):
#     for task in dfs_per_model_task[modelA].keys():
#         if task not in dfs_per_model_task[modelA]:
#             print(f"Skipping {modelA} and submodels on {task}")
#             continue
#         folderA = dfs_per_model_task[modelA][task]
#         for modelB in dfs_per_model_task.keys():
#             if task not in dfs_per_model_task[modelB]:
#                 print(f"Skipping {modelB} on {task}")
#                 continue
#             folderB = dfs_per_model_task[modelB][task]
#             for response_property in TASKS[task]:
#                 try:
#                     mean, ci = compute_ceiling_for_model_pair(folderA, folderB, response_property)
#                 except Exception as e:
#                     print(f"Error: {e}\nwhile computing ceiling for {folderA} and {folderB} on {response_property}")
#                     mean = np.nan
#                     ci = (np.nan, np.nan)
#                 ceiling_pair_results[(modelA, modelB, task, response_property)] = {'mean': mean, 'ci': ci}

# ceiling_paired_results_df = pd.DataFrame(ceiling_pair_results).T

In [None]:
def process_model_pair(args):
    """Compute the pairwise ceilings of one (modelA, modelB, task) work item.

    Args:
        args: Tuple ``(modelA, modelB, task, dfs_per_model_task, TASKS)``, where
            ``dfs_per_model_task`` maps model -> task -> run folder and
            ``TASKS`` maps task -> list of response properties.

    Returns:
        A list with one ``((modelA, modelB, task, response_property),
        {'mean': ..., 'ci': ...})`` entry per response property of the task,
        or an empty list when either model has no run folder for the task.
        A property whose computation fails is reported and recorded as NaN.
    """
    modelA, modelB, task, dfs_per_model_task, TASKS = args
    if task not in dfs_per_model_task[modelA]:
        print(f"Skipping {modelA} and submodels on {task}")
        return []
    folderA = dfs_per_model_task[modelA][task]
    if task not in dfs_per_model_task[modelB]:
        print(f"Skipping {modelB} on {task}")
        return []
    folderB = dfs_per_model_task[modelB][task]
    results = []
    for response_property in TASKS[task]:
        try:
            mean, ci = compute_ceiling_for_model_pair(folderA, folderB, response_property)
        except Exception as e:
            # Keep going (best effort), but do not hide the failure: otherwise a
            # NaN in the results cannot be told apart from a systematic bug.
            print(f"Error: {e}\nwhile computing ceiling for {folderA} and {folderB} on {response_property}")
            mean = np.nan
            ci = (np.nan, np.nan)
        results.append(((modelA, modelB, task, response_property), {'mean': mean, 'ci': ci}))
    return results

# One work item per (modelA, task of modelA, modelB). The lookup table and
# TASKS travel with each item so the worker does not depend on globals.
model_names = list(dfs_per_model_task)
model_pairs = [
    (modelA, modelB, task, dfs_per_model_task, TASKS)
    for modelA in model_names
    for task in dfs_per_model_task[modelA]
    for modelB in model_names
]

ceiling_pair_results = p_tqdm.p_map(process_model_pair, model_pairs)

In [None]:
# Raw preview only. NOTE(review): ceiling_pair_results is a list of per-work-item
# result lists, so this frame is presumably not tidy — the flat table is built
# in the following cell.
pd.DataFrame(ceiling_pair_results).T

In [None]:
# this is a horrid nested mess but this is how we get the data into a flat format
flat_data = []
for sublist in ceiling_pair_results:
    for identifiers, stats in sublist:
        row = {
            'modelA': identifiers[0],
            'modelB': identifiers[1],
            'task': identifiers[2],
            'response_property': identifiers[3],
            'ceiling': stats['mean'],
            'CI': stats['ci']
        }
        flat_data.append(row)

ceiling_paired_results_df = pd.DataFrame(flat_data)

In [None]:
# Show the flat table (one row per modelA / modelB / task / response property).
ceiling_paired_results_df

In [None]:
# grouped by model pair
# grouped by model pair: average ceiling over all tasks and response properties
ceiling_paired_results_df.groupby(['modelA', 'modelB'])['ceiling'].mean()

In [None]:
# Alias for the same dataframe object (no copy); this is the name used when saving below.
ceiling_pair_results_df = ceiling_paired_results_df

In [None]:
# Show the table under its alias.
ceiling_pair_results_df

## Save results
as .csv

In [None]:
# Write the per-model ceilings to <EXP_DIR>/nondeterminism_ceiling/<STUDY_NAME>_ceiling_results.csv
ceiling_results_df.to_csv(EXP_DIR / "nondeterminism_ceiling" / f"{STUDY_NAME}_ceiling_results.csv")

In [None]:
# Write the model-pair ceilings to <EXP_DIR>/nondeterminism_ceiling/<STUDY_NAME>_ceiling_pair_results.csv
ceiling_pair_results_df.to_csv(EXP_DIR / "nondeterminism_ceiling" / f"{STUDY_NAME}_ceiling_pair_results.csv")