# How deterministic are models?
It looks like models at temperature = 0 still have a lot of randomness in them. This notebook will explore how deterministic models are at temperature = 0.

In [None]:
import subprocess

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
from evals.locations import REPO_DIR, EXP_DIR
from evals.utils import run_command

In [None]:
# Model identifier; passed to the run script as `language_model=`.
MODEL = "gpt-3.5-turbo-0125"
# MODEL = "claude-3-sonnet"
# Passed to the run script as `study_name=`.
STUDY_NAME = "how_deterministic_are_models"
# Task passed to the run script as `task=`; the commented-out lines are alternative tasks.
# TASK = "daily_dialog"
# TASK = "number_triplets"
TASK = "writing_stories"
# Passed as `n_samples=`; the histogram title reports it as the number of samples per input.
N_SAMPLES = 100
# Passed as `task.num=` -- presumably the number of distinct input strings; confirm against the run script.
N_STRINGS = 50

For each of `N_STRINGS` input strings, generate `N_SAMPLES` responses from the same model at temperature = 0.

In [None]:
# Shell command: change into REPO_DIR, then launch evals/run_object_level.py with
# key=value overrides taken from the configuration constants (plus task.set=val).
command = f"cd {REPO_DIR} && python3 {REPO_DIR}/evals/run_object_level.py study_name={STUDY_NAME} task={TASK} language_model={MODEL} task.set=val n_samples={N_SAMPLES} task.num={N_STRINGS}"
# run_command is a project helper; its return value is later passed to the results
# loader as the experiment path -- presumably the run's output folder (confirm in evals.utils).
folder_name = run_command(command)

In [None]:
# Show what run_command returned; this value is used as the experiment path in the Analysis section.
print(folder_name)

## Analysis

In [None]:
from evals.analysis.loading_data import load_single_df_from_exp_path
from evals.locations import EXP_DIR

In [None]:
# Load the run's results into a single DataFrame; the analysis reads its `string` and `response` columns.
# NOTE(review): exclude_noncompliant=False presumably keeps responses flagged as non-compliant -- confirm against the loader.
df = load_single_df_from_exp_path(folder_name, exclude_noncompliant=False)

In [None]:
# How many unique answers per input? One count per input string, shown as a histogram.
unique_per_string = df.groupby("string")["response"].nunique()

# Explicit figure/axes so the plot carries its own title and axis labels.
fig, ax = plt.subplots()
unique_per_string.hist(ax=ax)
ax.set(
    title=f"Number of unique responses on the same input (out of {N_SAMPLES} samples) on {TASK}",
    xlabel="Unique responses per input string",
    ylabel="Number of input strings",
);

> Compute upper bound: 100 samples from object-level—what is the chance that two samples match? -> [ ] try on non-determinism notebook

We want to know the chance that two samples for the same input match, given the level of non-determinism. We estimate this by taking the 100 samples for a single input, randomly permuting them, and measuring the fraction of positions at which the permuted response equals the original one.

In [None]:
# Number of random permutations drawn per input string when estimating the match rate.
BOOTSTRAP_N = 1000
# Seed NumPy's global RNG so the permutation-based estimates are reproducible
# after Restart Kernel -> Run All.
BOOTSTRAP_SEED = 0
np.random.seed(BOOTSTRAP_SEED)

In [None]:
def compute_pairwise_match(df_subset, rng=None):
    """Estimate how often two distinct samples for the same input give the same response.

    The responses are placed in a random order and each one is compared with
    its cyclic neighbour in that order, so every comparison involves two
    different samples. Comparing the responses against an independent
    permutation of themselves would let a sample land on its own position,
    which counts as a guaranteed match and biases the estimate upwards.

    Args:
        df_subset: DataFrame with a 'string' column holding exactly one unique
            value and a 'response' column with the sampled responses.
        rng: Optional ``np.random.Generator`` used for the shuffle. When None,
            NumPy's global random state is used (so ``np.random.seed`` applies).

    Returns:
        The fraction of compared pairs whose responses are equal, or NaN when
        there are fewer than two samples (no pair can be formed).

    Raises:
        AssertionError: If ``df_subset`` does not contain exactly one unique string.
    """
    assert df_subset['string'].nunique() == 1, "Expected all samples to be from the same string"
    responses = df_subset['response'].values
    if len(responses) < 2:
        # A single sample (or none) has no partner to be compared with.
        return np.nan
    permute = np.random.permutation if rng is None else rng.permutation
    shuffled_responses = permute(responses)
    # Rolling by one pairs each sample with the previous one in the random
    # order; with at least two samples these are always two different samples.
    return np.mean(shuffled_responses == np.roll(shuffled_responses, 1))

In [None]:
# Per input string: the list of BOOTSTRAP_N match rates, and their mean.
samples_across_strings = []
means_across_strings = []

for string in tqdm(df.string.unique()):
    # Select this string's rows once; the selection does not depend on the
    # bootstrap iteration, so it need not be recomputed BOOTSTRAP_N times.
    string_df = df[df.string == string]
    samples_across_iters = [compute_pairwise_match(string_df) for _ in range(BOOTSTRAP_N)]
    samples_across_strings.append(samples_across_iters)
    means_across_strings.append(np.mean(samples_across_iters))

In [None]:
# Overall estimate: average of the per-string mean match rates across all input strings.
np.mean(means_across_strings)