# Computing the ceiling performance for a model on the sweep

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

In [None]:
from evals.locations import REPO_DIR, EXP_DIR
from evals.utils import run_command
from evals.analysis.loading_data import get_hydra_config
from evals.analysis.loading_data import load_single_df_from_exp_path

## Which models, and which tasks?
Using the format from `scripts/sweep_full_study.py`.

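`TASKS` is a string containing a Python dict literal that maps each task name to the list of response properties to evaluate for that task. It is parsed into a dict further below.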

In [None]:
# Name of the study; used below to build the results folder and the output CSV name.
STUDY_NAME = "may20_thrifty_sweep"

# Models to run the object-level evaluation on.
MODELS = [
    "claude-3-sonnet",
    "gpt-3.5-turbo",
    "gpt-4",
    "gemini-1.0-pro-002",
]

# Dict literal (kept as a string) mapping task name -> response properties.
# The pieces below are adjacent string literals and concatenate into one string.
TASKS = (
    '{"number_triplets": ["identity", "is_even", "last_character", "first_character"], '
    '"wikipedia": ["identity", "syllable_count", "first_character", "last_character"], '
    '"writing_stories": ["identity", "first_word", "writing_stories/main_character_name"], '
    '"personal_preferences": ["identity", "syllable_count", "first_character", "last_character"], }'
)

In [None]:
# Remaining hyperparameters for the sweep.

# Passed to the object-level run as `n_samples`.
SAMPLES_PER_INPUT = 100
# Passed to the object-level run as `task.num`.
N_PER_TASK = 10
# NOTE(review): SEED is not referenced by any other cell visible in this
# notebook -- confirm whether it should be forwarded to the runs or to NumPy.
SEED = 42

In [None]:
# Parse the TASKS string into a dict of task name -> list of response properties.
# `ast.literal_eval` only accepts Python literals, so unlike `eval` it cannot
# execute arbitrary code embedded in the string.
import ast

# Guard on the type so re-running this cell after TASKS has already been
# parsed is a no-op instead of an error.
if isinstance(TASKS, str):
    TASKS = ast.literal_eval(TASKS)

## Run the ceiling calculation

In [None]:
# Launch one object-level run per (model, task) pair. Every run is filed under
# the "nondeterminism_ceiling/<STUDY_NAME>" study.
study_path = 'nondeterminism_ceiling/' + STUDY_NAME
for model in tqdm(MODELS):
    for task in TASKS:
        command = (
            f"cd {REPO_DIR} && python3 {REPO_DIR}/evals/run_object_level.py"
            f" study_name={study_path}"
            f" task={task} language_model={model} task.set=val"
            f" n_samples={SAMPLES_PER_INPUT} task.num={N_PER_TASK}"
        )
        print(f"🏃‍➡️ Running {model} on {task}: {command}")
        run_command(command)

## Extract the response properties

In [None]:
# Collect the immediate subdirectories of this study's results folder.
results_folder = EXP_DIR / "nondeterminism_ceiling" / STUDY_NAME
# The first item yielded by os.walk is (dirpath, dirnames, filenames) for the
# top-level folder itself; index 1 holds its direct subdirectory names.
dirnames = next(os.walk(results_folder))[1]
subfolders = [results_folder / dirname for dirname in dirnames]
print(f"Got {len(subfolders)} subfolders")

In [None]:
# Run property extraction for every response property of every result folder.
for folder in tqdm(subfolders):
    # load config; folders for which get_hydra_config raises ValueError are skipped
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        print(f"Skipping {folder}")
        continue
    task = cfg.task.name
    # Read the model from this folder's own config so the log line below
    # describes the folder being processed, rather than relying on a `model`
    # variable left over from another cell (or undefined in a fresh kernel).
    model = cfg.language_model.model
    response_properties = TASKS[task]
    for response_property in response_properties:
        command = f"cd {REPO_DIR} && python3 {REPO_DIR}/evals/run_property_extraction.py dir={folder} response_property={response_property}"
        print(f"🛸 Extracting {response_property} on {model} on {task}: {command}")
        # Best effort: a failed extraction is reported and the loop continues
        # with the next property.
        try:
            run_command(command)
        except Exception as e:
            print(f"Error: {e}\nwhile running {command}")

## Compute the Ceiling

In [None]:
# Number of repetitions used both for the random pairwise-match shuffles per
# input string and for the bootstrap resamples of the confidence interval.
BOOTSTRAP_N = 1_000

In [None]:
def compute_pairwise_match(df_subset, response_property='identity'):
    """Return the fraction of samples that equal a randomly paired sample.

    The values in the `response_property` column are compared element-wise
    against a random permutation of themselves, and the mean of the resulting
    boolean matches is returned.

    Args:
        df_subset: DataFrame whose rows all share one value in the `string`
            column (asserted).
        response_property: Name of the column holding the values to compare.

    Returns:
        The mean of the element-wise equality between the values and their
        shuffled copy.

    Raises:
        AssertionError: If `df_subset` has more or fewer than one unique
            `string` value.
    """
    # Note: the number of rows is not checked; only the single-string
    # invariant is enforced.
    n_distinct_strings = df_subset['string'].nunique()
    assert n_distinct_strings == 1, "Expected all samples to be from the same string"
    observed = df_subset[response_property].values
    # Pair every response with the response at a random position.
    reshuffled = np.random.permutation(observed)
    is_match = observed == reshuffled
    return np.mean(is_match)

In [None]:
def bootstrap_95_CI(samples, n_bootstrap=None):
    """Return a bootstrap 95% confidence interval for the mean of `samples`.

    `samples` is resampled with replacement (same size as the input)
    `n_bootstrap` times; the 2.5th and 97.5th percentiles of the resampled
    means are returned.

    Args:
        samples: 1-D array-like of values to resample.
        n_bootstrap: Number of bootstrap resamples. Defaults to the
            module-level `BOOTSTRAP_N` when omitted.

    Returns:
        A NumPy array `[lower, upper]` holding the 2.5th and 97.5th
        percentiles of the bootstrap means.
    """
    if n_bootstrap is None:
        # Looked up at call time so later changes to BOOTSTRAP_N are honoured.
        n_bootstrap = BOOTSTRAP_N
    n_samples = len(samples)
    means = [
        np.mean(np.random.choice(samples, n_samples, replace=True))
        for _ in range(n_bootstrap)
    ]
    return np.percentile(means, [2.5, 97.5])

In [None]:
def compute_ceiling(folder, response_property):
    """Estimate the ceiling for one experiment folder and response property.

    For every unique input string in the loaded data, the pairwise match rate
    is computed `BOOTSTRAP_N` times (each with a fresh random shuffle). The
    ceiling is the mean, over strings, of each string's mean match rate.

    Args:
        folder: Experiment folder passed to `load_single_df_from_exp_path`.
        response_property: Column of the loaded DataFrame to compare.

    Returns:
        A tuple `(mean, ci)` where `mean` is the average of the per-string
        mean match rates and `ci` is the result of `bootstrap_95_CI` applied
        to all per-shuffle match rates pooled across strings.

    Raises:
        ValueError: If the loaded data contains no input strings.
    """
    # load df
    df = load_single_df_from_exp_path(folder, exclude_noncompliant=False) # TODO Should this be true? That might increase the ceiling.
    samples_across_strings = []
    means_across_strings = []

    for string in tqdm(df.string.unique()):
        # Select this string's rows once, instead of re-filtering the whole
        # DataFrame on every one of the BOOTSTRAP_N iterations.
        df_subset = df[df.string == string]
        samples_across_iters = [
            compute_pairwise_match(df_subset, response_property)
            for _ in range(BOOTSTRAP_N)
        ]
        samples_across_strings.append(samples_across_iters)
        means_across_strings.append(np.mean(samples_across_iters))

    if not samples_across_strings:
        # np.concatenate would raise a ValueError here anyway; raise the same
        # exception type with a message that says which folder was empty.
        raise ValueError(f"No input strings found in {folder}")

    all_samples = np.concatenate(samples_across_strings)
    # return mean and 95%CI of mean
    # NOTE(review): the CI is bootstrapped over the pooled per-shuffle match
    # rates, not over the per-string means -- confirm this is intended.
    return np.mean(means_across_strings), bootstrap_95_CI(all_samples)

In [None]:
# Compute the ceiling for every (model, task, response property) combination
# found in the result folders.
ceiling_results = {}

for folder in tqdm(subfolders):
    # Folders for which get_hydra_config raises ValueError are skipped.
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        print(f"Skipping {folder}")
        continue
    task = cfg.task.name
    model = cfg.language_model.model
    response_properties = TASKS[task]
    for response_property in response_properties:
        mean, ci = compute_ceiling(folder, response_property)
        ceiling_results[(model, task, response_property)] = (mean, ci)

# One row per (model, task, response_property) key, with the tuple's two
# entries as columns.
ceiling_results_df = pd.DataFrame(ceiling_results).T
ceiling_results_df.columns = ['mean', 'ci']
# The frame mixes floats with CI arrays, so the transpose leaves every column
# as object dtype. Cast `mean` back to float so numeric aggregations on it
# (e.g. a groupby mean) operate on a numeric column.
ceiling_results_df['mean'] = ceiling_results_df['mean'].astype(float)


In [None]:
# Show the full table: one row per (model, task, response property), with
# the `mean` ceiling and its `ci`.
ceiling_results_df

In [None]:
# Average the ceiling over all tasks and response properties of each model
# (level 0 of the index is the model).
mean_by_model = ceiling_results_df['mean'].groupby(level=0).mean()
display(mean_by_model)

## Save results
Write the ceiling results to a `.csv` file.

In [None]:
# Write the ceiling table to "<STUDY_NAME>_ceiling_results.csv" inside the
# "nondeterminism_ceiling" experiment folder.
ceiling_csv_path = EXP_DIR / "nondeterminism_ceiling" / f"{STUDY_NAME}_ceiling_results.csv"
ceiling_results_df.to_csv(ceiling_csv_path)