In [None]:
import numpy as np
from scipy import stats

In [None]:
## This eval script calculates CIs for all results we run:

def calculate_confidence_interval(
    data: np.ndarray,
    percentage: bool = True,
    round_to: int = 3,
    method='t',
    confidence_level=0.95,
    n_bootstrap: int = 1000,
    seed=None):
    """
    Calculate the mean of `data` and a confidence interval around that mean.

    The interval can be computed with a t-score, a z-score, or a percentile
    bootstrap. Results can be expressed as proportions or percentages and
    rounded to a given number of decimal places.

    Args:
        data (array-like): 1-D numeric observations, e.g. binary values (0 or 1)
            indicating correctness. Lists and tuples are accepted.
        percentage (bool, optional): If True, the mean and interval bounds are
            multiplied by 100. Defaults to True.
        round_to (int or None, optional): Number of decimal places to round the
            results to; None disables rounding. Defaults to 3.
        method (str, optional): 't', 'z', or 'bootstrap'. Defaults to 't'.
        confidence_level (float, optional): Confidence level as a decimal
            (e.g. 0.95 for 95% confidence). Defaults to 0.95.
        n_bootstrap (int, optional): Number of bootstrap resamples; only used
            when method == 'bootstrap'. Defaults to 1000.
        seed (optional): Seed passed to `np.random.default_rng` for a
            reproducible bootstrap. Defaults to None (non-deterministic).

    Returns:
        dict: {"mean": <float>, "ci": <two-element sequence (lower, upper)>}.
            "ci" is a list whenever `percentage` is True or `round_to` is not
            None. The bounds of the 't' and 'z' intervals are not clipped, so
            they may fall outside [0, 1] (or [0, 100]).

    Raises:
        ValueError: If `method` is not one of 't', 'z', or 'bootstrap'.
    """
    # Accept any array-like input, not only ndarrays.
    data = np.asarray(data)
    n_samples = len(data)

    # Calculate mean and standard error.
    # NOTE: ndarray.std() defaults to ddof=0 (population SD). For binary data
    # this gives sqrt(p * (1 - p) / n); it is kept as-is so that previously
    # reported 't'/'z' intervals do not change.
    mean      = data.mean()
    std_error = data.std() / np.sqrt(n_samples)

    # Confidence interval calculation based on method
    if method == 't':
        if n_samples > 1 and std_error == 0:
            # Zero-variance data (e.g. every answer correct): a zero scale is
            # not a valid t-distribution parameter, so collapse the interval
            # onto the mean, which is what the 'z' branch yields as well.
            confidence_interval = (mean, mean)
        else:
            degrees_freedom = n_samples - 1
            confidence_interval = stats.t.interval(confidence_level, degrees_freedom, mean, std_error)

    elif method == 'z':
        # Two-tailed z-value for the confidence level
        z_value = stats.norm.ppf(1 - (1 - confidence_level) / 2)
        confidence_interval = (mean - z_value * std_error, mean + z_value * std_error)

    elif method == 'bootstrap':
        # Percentile bootstrap: draw all resamples (with replacement) at once
        # as an (n_bootstrap, n_samples) array and take the mean of each row.
        rng = np.random.default_rng(seed)
        resamples = rng.choice(data, size=(n_bootstrap, n_samples), replace=True)
        bootstrapped_means = resamples.mean(axis=1)
        lower = np.percentile(bootstrapped_means, (1 - confidence_level) / 2 * 100)
        upper = np.percentile(bootstrapped_means, (1 + confidence_level) / 2 * 100)
        confidence_interval = (lower, upper)

    else:
        raise ValueError("Method must be 't', 'z', or 'bootstrap'.")

    # Convert to percentage if required
    if percentage:
        mean *= 100
        confidence_interval = [ci * 100 for ci in confidence_interval]

    # Round results if specified
    if round_to is not None:
        mean = round(mean, round_to)
        confidence_interval = [round(ci, round_to) for ci in confidence_interval]

    return {"mean": mean, "ci": confidence_interval}


def format_output(question_type:str,
                  metrics:dict[str,float]) -> None:
    """
    Print one aligned result line for a question type.

    Parameters
    ----------
    question_type : str
        Label for the row; left-aligned and padded to 36 characters.
    metrics : dict[str, float]
        Metric values to report. It must include:
        - 'mean': The mean accuracy, printed with three decimals.
        - 'ci': An indexable pair whose first two items are the lower and
          upper bounds of the confidence interval.

    Returns
    -------
    None
        The line is written to stdout; nothing is returned.
    """
    accuracy = metrics['mean']
    interval = metrics['ci']
    ci_low = interval[0]
    ci_high = interval[1]
    line = f"{question_type:<36}  Acc: {accuracy:>6.3f} ({ci_low:>4.3f},{ci_high:>4.3f})"
    print(line)


In [None]:
import json, pandas as pd, string, torch, re
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

def extract_letter(text, n_opts):
    """
    Extract a multiple-choice answer letter from free-form model output.

    The text is stripped and upper-cased. If it is exactly one letter within
    the valid range it is returned directly; otherwise the first standalone
    (word-boundary delimited) valid letter found in the text is returned.
    Note that a standalone English article "a" therefore counts as option A.

    Args:
        text: Raw model output; None or an empty string is treated as "".
        n_opts: Number of answer options. Valid letters are the first
            `n_opts` letters of the alphabet, capped at 'Z'.

    Returns:
        The matched upper-case letter, or None if no valid letter is found
        or `n_opts` is less than 1.
    """
    # Without this guard the character class below would be "[A-@]", which
    # is an invalid range and makes re.search raise.
    if n_opts < 1:
        return None
    # Cap at 'Z' so the range never runs into punctuation such as ']' that
    # would corrupt the character class.
    last_letter = chr(ord("A") + min(n_opts, 26) - 1)

    text = (text or "").strip().upper()
    if len(text) == 1 and "A" <= text <= last_letter:
        return text
    m = re.search(r"\b([A-{}])\b".format(last_letter), text)
    return m.group(1) if m else None

# Single-example smoke test: build a multiple-choice prompt from the first row
# of the TSV, run one greedy generation, and compare the parsed letter with
# the ground-truth answer.
tsv = "~/LMUData/uBench/uBench_classification_10.tsv"
row = pd.read_csv(tsv, sep="\t").iloc[0]
opts = json.loads(row["options"])  # presumably a JSON list of option strings -- confirm against the TSV
letters = list(string.ascii_uppercase[:len(opts)])
options_block = "\n".join(f"{letter}. {opt}" for letter, opt in zip(letters, opts))

prompt = (
    f"{row['question'].strip()}\n"
    f"Options:\n{options_block}\n"
    f"Answer with a single letter ({letters[0]}–{letters[-1]}) only."
)

# Use a context manager so the underlying file handle is released; convert()
# returns an independent in-memory copy.
with Image.open(row["image_path"]) as img:
    image = img.convert("RGB")

model_id = "google/medgemma-4b-it"  # or your current HF model
proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16)

inputs = proc(images=image, text=prompt, return_tensors="pt")
# keep on CPU if model is sharded; else move to model device
if getattr(model, "hf_device_map", None) is None:
    dev = next(model.parameters()).device
    inputs = {k: v.to(dev) for k, v in inputs.items()}

with torch.inference_mode():
    out_ids = model.generate(**inputs, max_new_tokens=2, do_sample=False)

# generate() returns the prompt tokens followed by the new tokens for a causal
# LM. Decode only the newly generated part; otherwise extract_letter would
# pick up a letter from the prompt (e.g. the "A." option label) instead of
# the model's answer.
prompt_len = inputs["input_ids"].shape[-1]
text = proc.decode(out_ids[0][prompt_len:], skip_special_tokens=True)
pred = extract_letter(text, len(opts))
gt = row["answer"]

print("RAW OUT:", repr(text))
print("PRED:", pred, "GT:", gt, "->", "OK" if pred==gt else "WRONG")