In [2]:
# types of runs for each experiment
# 1. uncalibrated              - original model without any conformal calibration
# 2. fixed budgeting           - share_budget=False and min_sample_size=None and naive=True
# 3. adaptive budgeting        - share_budget=False and min_sample_size=None and naive=False
# 4. capped adaptive budgeting - share_budget=False and min_sample_size=0.1  and naive=False
# 5. global budgeting          - share_budget=True  and min_sample_size=0.1  and naive=False

# experiments:
# A) mean coverage as function of budget per sample - TODO: how to measure?
# B) mean samples vs budget per sample (total num of samples generated)
# C) lpb vs budget per sample (average LPB value)

In [3]:
import pytorch_lightning as pl
import torch
import numpy as np
from torch.utils.data import DataLoader  # Import DataLoader
from src.failure_model import ToxicClassifier
from src.datasets import PromptOnlyDataset, PropDataset
import torch._dynamo

# Request the "spawn" start method for torch worker processes.
# set_start_method raises RuntimeError once a start method has already been
# fixed (for instance when this cell is executed a second time); that case is
# harmless here, so it is ignored on purpose.
try:
    torch.multiprocessing.set_start_method("spawn")
except RuntimeError:
    pass

# Ask TorchDynamo to suppress compilation errors instead of raising them.
torch._dynamo.config.suppress_errors = True

In [4]:
# Miscoverage level the calibration should hit.
target_taus = torch.tensor([0.1])
# Candidate tau grid: 500 log-spaced points from 1e-8 up to 10**-0.33.
TAUS_RANGE = torch.tensor(np.logspace(start=-8, stop=-0.33, num=500))
# Position of the grid point nearest to the target level.
target_tau_idx = (TAUS_RANGE - target_taus).abs().argmin()

In [5]:
# Show the grid value that stands in for the target tau; the grid does not
# contain 0.1 exactly, so the nearest candidate is used instead.
print(TAUS_RANGE[target_tau_idx])

tensor(0.0986, dtype=torch.float64)


In [None]:
# load model and data

# Classifier restored from a saved Lightning checkpoint and switched to
# evaluation mode.
model = ToxicClassifier.load_from_checkpoint(
    "saved/Jigsaw_BERT/lightning_logs/version_1/checkpoints/epoch=4-step=970.ckpt"
)
_ = model.eval()

# Calibration and test prompts live in the same split directory.
ds_cal = PromptOnlyDataset("data/rtp_500/split_1_0.5_0.1_0.2_0.2/cal.pkl")
ds_test = PromptOnlyDataset("data/rtp_500/split_1_0.5_0.1_0.2_0.2/test.pkl")

# The test loader keeps the dataset order (no shuffling).
dl_test = DataLoader(
    ds_test,
    batch_size=1500,
    shuffle=False,
)

In [None]:
# create calibration subset (optional)
import random

# Fixed seed so that the same calibration subset is drawn after every kernel
# restart; change it to draw a different subset.
CAL_SUBSET_SEED = 0
# The subset keeps len(data) // CAL_SUBSET_DIVISOR items (one quarter).
CAL_SUBSET_DIVISOR = 4

print("Creating random subset...")
print("Original size:", len(ds_cal.data))

# A dedicated generator makes the draw reproducible without touching the
# global `random` state used elsewhere.
subset_rng = random.Random(CAL_SUBSET_SEED)

k = len(ds_cal.data) // CAL_SUBSET_DIVISOR
idx = subset_rng.sample(range(len(ds_cal.data)), k)

# NOTE: ds_cal.data is overwritten in place, so running this cell again
# shrinks the already reduced set once more. Re-create ds_cal before
# re-running this cell.
ds_cal.data = [ds_cal.data[i] for i in idx]

print("Subset size:", len(ds_cal.data))

In [None]:
from src import utils

# Settings handed to the rating backend.
RATER_PARAMS = dict(
    model_type="original",
    amp=True,
)

# The Hugging Face token is read from a local file rather than being written
# into the notebook.
HF_KEY = utils.api_key_from_file("HF_KEY.txt")

# Settings handed to the generation backend.
GENERATOR_PARAMS = dict(
    model_name="meta-llama/Llama-3.2-1B",
    hub_token=HF_KEY,
    max_input_tokens=40,
    max_output_tokens=30,
    torch_dtype=torch.bfloat16,
)

In [None]:
# EXPERIMENT CONFIGS

# Target miscoverage level and the log-spaced grid of candidate taus
# (500 points between 1e-8 and 10**-0.33).
target_taus = torch.tensor([0.1])
TAUS_RANGE = torch.tensor(np.logspace(start=-8, stop=-0.33, num=500))
# Grid position closest to the target level.
target_tau_idx = (TAUS_RANGE - target_taus).abs().argmin()

# Hand the candidate grid and the minimum probability setting to the model.
model.set_taus(TAUS_RANGE)
model.set_min_p_for_q_tau(1e-20)

# Each entry is (name, min_sample_size, share_budget, naive).
# NOTE(review): the comment at the top of the notebook lists
# min_sample_size=0.1 for the capped and global runs, while 0.5 is used
# here - confirm which value is intended.
EXPERIMENTS = [
    ("Fixed Budgeting", None, False, True),
    ("Adaptive Budgeting", None, False, False),
    ("Capped Adaptive Budgeting", 0.5, False, False),
    ("Global Budgeting", 0.5, True, False),
]

# Number of repetitions of the full experiment sweep.
NUM_RUNS = 5

# Budgets per sample: 10 log-spaced points from 10 to 1000, cast to integers
# with .int(), de-duplicated and sorted ascending.
BUDGET_RANGE = sorted(
    set(torch.logspace(start=1, end=3, steps=10, base=10).int().tolist())
)

In [None]:
import pandas as pd

# CSV file the results table is written to.
SAVE_PATH = "results2.csv"

# Empty results table; the column order defines the layout of the CSV.
results_df = pd.DataFrame(
    columns=[
        # identification of the run
        "experiment",
        "budget",
        "run_num",
        # calibration outcome
        "tau_hat",
        "max_est",
        "calib_tau_hat_miscoverage",
        "calib_tau_target_miscoverage",
        "calib_mean_generated_samples",
        "calib_mean_c_value",
        # test-set LPB values
        "test_tau_hat_lpb",
        "test_tau_target_lpb",
        # wall-clock duration of the calibration call
        "time_delta",
    ]
)


def save_results(save_path, df):
    """Write `df` to `save_path` as CSV (without the index) and report it.

    Args:
        save_path: Destination of the CSV file.
        df: DataFrame to store.
    """
    df.to_csv(path_or_buf=save_path, index=False)
    print(f"Results saved to {save_path}")


def load_results(save_path):
    """Read a results CSV from `save_path` and report it.

    Args:
        save_path: Location of the CSV file to read.

    Returns:
        The file contents as a DataFrame; no column is used as the index.
    """
    loaded = pd.read_csv(save_path, index_col=None)
    print(f"Results loaded from {save_path}")
    return loaded

In [None]:
import pandas as pd
from src.conformal import conformalize
import time

# Sweep: for every repetition, experiment type and budget, calibrate on ds_cal,
# evaluate the LPB on the test loader, and checkpoint the results table to CSV.

# Rows are collected as plain dicts and the table is rebuilt from them, instead
# of concatenating onto an (initially empty) DataFrame on every iteration.
# The list starts from whatever results_df already holds, so existing rows
# (e.g. restored with load_results) are kept.
result_records = results_df.to_dict("records")

for run_num in range(NUM_RUNS):

    for exp_type in EXPERIMENTS:

        # A fresh trainer is created for every experiment type.
        trainer = pl.Trainer(enable_progress_bar=False)
        name, min_sample_size, share_budget, naive = exp_type

        for budget in BUDGET_RANGE:

            print("-" * 50)
            print(f"Running {name} with budget {budget} (run {run_num + 1}/{NUM_RUNS})")
            print("-" * 50)

            start_time = time.time()

            # Call the conformalize function with the specified parameters.
            result_tuple = conformalize(
                trainer=trainer,
                model=model,
                target_taus=target_taus,
                canidate_taus=TAUS_RANGE,
                X=ds_cal,
                generator_params=GENERATOR_PARAMS,
                rater_params=RATER_PARAMS,
                budget_per_sample=budget,
                share_budget=share_budget,
                min_sample_size=min_sample_size,
                naive=naive,
                text_prep_func="sentence_completion",
                multi_gpu=True,
                plot=False,
                return_extra=True,
                batch_size=1500,
            )

            (
                tau_hat,  # chosen tau for the target miscoverage
                max_est,  # maximum quantile prediction
                q_hats,  # quantile predictions for the chosen tau
                T_tilde,  # sampled survival time for all samples
                C,  # censoring time indicator
                quantile_est,  # predicted quantile estimates for all taus
                prior_quantile_est,  # each output is sampled at most prior_quantile_est times
                C_probs,  # sampling probability of each sample
                weights,  # weights used for the weighted miscoverage
                miscoverage,  # miscoverage rate for each tau
            ) = result_tuple

            # Wall-clock duration of the calibration call only.
            time_delta = time.time() - start_time

            # Grid index of the chosen tau; torch.argmin keeps the lookup in
            # torch, matching how target_tau_idx is computed.
            tau_hat_idx = torch.argmin(torch.abs(TAUS_RANGE - tau_hat)).item()
            tau_hat_miscoverage = miscoverage[tau_hat_idx].item()
            tau_target_miscoverage = miscoverage[target_tau_idx].item()

            # compute total number of generated samples
            mean_generated_samples = T_tilde.mean().item()
            mean_c_value = C.mean().item()

            # compute LPB: per-tau predictions on the test set, clipped to
            # [1, max_est] and averaged over the test prompts
            test_pred_raw = trainer.predict(model, dataloaders=dl_test)
            test_quantile_est = np.vstack([p["tau"].T for p in test_pred_raw])
            tau_hat_lpb = test_quantile_est[:, tau_hat_idx].clip(min=1, max=max_est).mean().item()
            tau_target_lpb = test_quantile_est[:, target_tau_idx].clip(min=1, max=max_est).mean().item()

            # add results to the record list
            # NOTE(review): tau_hat and max_est are stored exactly as returned
            # by conformalize; if they are tensors the CSV will contain their
            # repr - confirm and convert with .item() if needed.
            result_dict = {
                "experiment": name,
                "budget": budget,
                "run_num": run_num,
                "tau_hat": tau_hat,
                "max_est": max_est,
                "calib_tau_hat_miscoverage": tau_hat_miscoverage,
                "calib_tau_target_miscoverage": tau_target_miscoverage,
                "calib_mean_generated_samples": mean_generated_samples,
                "calib_mean_c_value": mean_c_value,
                "test_tau_hat_lpb": tau_hat_lpb,
                "test_tau_target_lpb": tau_target_lpb,
                "time_delta": time_delta,
            }
            result_records.append(result_dict)

            # Rebuild the table and write it out after every configuration so
            # that an interrupted sweep keeps the rows finished so far.
            results_df = pd.DataFrame(result_records)

            save_results(SAVE_PATH, results_df)