In [None]:
# types of runs for each experiment
# 1. uncalibrated              - original model without conformal at all
# 2. fixed budgeting           - share_budget=False and min_sample_size=None and naive=True
# 3. adaptive budgeting        - share_budget=False and min_sample_size=None and naive=False
# 4. capped adaptive budgeting - share_budget=False and min_sample_size=0.1  and naive=False
# 5. global budgeting          - share_budget=True  and min_sample_size=0.1  and naive=False

# experiments:
# A) mean coverage as function of budget per sample - TODO: how to measure?
# B) mean samples vs budget per sample (total num of samples generated)
# C) lpb vs budget per sample (average LPB value)

In [2]:
import pytorch_lightning as pl
import torch
import numpy as np
from torch.utils.data import DataLoader  # Import DataLoader
from src.failure_model import ToxicClassifier
from src.datasets import PromptOnlyDataset, PropDataset
import torch._dynamo

# Ask TorchDynamo to suppress compilation errors rather than raise them.
torch._dynamo.config.suppress_errors = True

# Select the "spawn" start method, but only while no start method has been
# fixed yet: set_start_method raises RuntimeError once one is set (for example
# when this cell is executed a second time in the same kernel).
if torch.multiprocessing.get_start_method(allow_none=True) is None:
    torch.multiprocessing.set_start_method("spawn")

In [3]:
# Load the calibration/test prompt datasets and the trained classifier.

# Both splits come from the same split directory.
ds_cal = PromptOnlyDataset("data/rtp_500/split_1_0.5_0.1_0.2_0.2/cal.pkl")
ds_test = PromptOnlyDataset("data/rtp_500/split_1_0.5_0.1_0.2_0.2/test.pkl")

# The test loader keeps dataset order (shuffle=False).
dl_test = DataLoader(
    ds_test,
    batch_size=1500,
    shuffle=False,
)

# Restore the classifier from its checkpoint and switch it to eval mode.
# The result of eval() is bound to `_` so the cell does not echo it.
model = ToxicClassifier.load_from_checkpoint(
    "saved/Jigsaw_BERT/lightning_logs/version_1/checkpoints/epoch=4-step=970.ckpt"
)
_ = model.eval()

In [4]:
# Create a calibration subset (optional).
#
# NOTE: this cell replaces ds_cal.data in place, so running it again without
# first reloading ds_cal shrinks the already-reduced subset even further.
import random

SUBSET_SEED = 0  # fixed seed so every run draws the same calibration subset
SUBSET_DIVISOR = 4  # keep len(ds_cal.data) // SUBSET_DIVISOR examples

print("Creating random subset...")
print("Original size:", len(ds_cal.data))

# A private Random instance makes the draw reproducible without touching the
# state of the global `random` module.
subset_rng = random.Random(SUBSET_SEED)

k = len(ds_cal.data) // SUBSET_DIVISOR
idx = subset_rng.sample(range(len(ds_cal.data)), k)
ds_cal.data = [ds_cal.data[i] for i in idx]

print("Subset size:", len(ds_cal.data))

Creating random subset...
Original size: 19888
Subset size: 4972


In [5]:
from src import utils

# Keyword parameters handed to the rating backend.
RATER_PARAMS = dict(model_type="original", amp=True)

# Hugging Face key, obtained through utils.api_key_from_file.
HF_KEY = utils.api_key_from_file("HF_KEY.txt")

# Keyword parameters handed to the generation backend.
GENERATOR_PARAMS = dict(
    model_name="meta-llama/Llama-3.2-1B",
    hub_token=HF_KEY,
    max_input_tokens=40,
    max_output_tokens=30,
    torch_dtype=torch.bfloat16,
)

In [6]:
# EXPERIMENT CONFIGS

# Target tau and the grid of candidate taus (500 log-spaced points).
target_taus = torch.tensor([0.1])
TAUS_RANGE = torch.tensor(np.logspace(-8, -0.33, 500))

# Index of the candidate in TAUS_RANGE that lies closest to the target tau.
target_tau_idx = (TAUS_RANGE - target_taus).abs().argmin()

# Configure the model with the candidate grid.
model.set_taus(TAUS_RANGE)
model.set_min_p_for_q_tau(1e-20)

# Each entry is (name, min_sample_size, share_budget, naive).
EXPERIMENTS = [
    # ("Fixed Budgeting", None, False, True),
    # ("Adaptive Budgeting", None, False, False),
    # ("Capped Adaptive Budgeting", 0.1, False, False),
    # ("Global Budgeting", 0.1, True, False),
    # ("Global Budgeting Low", 0.01, True, False),
    # ("Global Budgeting High", 1.0, True, False),
    ("Global Budgeting Medium", 0.5, True, False),
]

# Number of repetitions of every (experiment, budget) pair.
NUM_RUNS = 1

# Budgets per sample to sweep over.
# BUDGET_RANGE = torch.logspace(start=1, end=3, steps=10, base=10).int().unique().tolist()
BUDGET_RANGE = [599]

In [7]:
import pandas as pd

# CSV file the results table is written to.
SAVE_PATH = "results2.csv"

# Column layout of the results table.
RESULT_COLUMNS = [
    "experiment",
    "budget",
    "run_num",
    "tau_hat",
    "max_est",
    "calib_tau_hat_miscoverage",
    "calib_tau_target_miscoverage",
    "calib_mean_generated_samples",
    "calib_mean_c_value",
    "test_tau_hat_lpb",
    "test_tau_target_lpb",
    "time_delta",
]

# Start from an empty table that already carries the full column layout.
results_df = pd.DataFrame(columns=RESULT_COLUMNS)


def save_results(save_path, df: pd.DataFrame) -> None:
    """Write ``df`` to ``save_path`` as CSV and print a confirmation.

    Args:
        save_path: Destination of the CSV file.
        df: Table to write; its index is not included in the file.
    """
    df.to_csv(save_path, index=False)
    confirmation = f"Results saved to {save_path}"
    print(confirmation)


def load_results(save_path) -> pd.DataFrame:
    """Read the CSV at ``save_path`` and print a confirmation.

    Args:
        save_path: Location of the CSV file to read.

    Returns:
        The loaded table; no column is used as the index.
    """
    loaded = pd.read_csv(save_path, index_col=None)
    confirmation = f"Results loaded from {save_path}"
    print(confirmation)
    return loaded

In [8]:
import pandas as pd
from src.conformal import conformalize
import time

# Run every configured experiment for every budget, NUM_RUNS times, and write
# the results table to SAVE_PATH after each individual run so that a crash in
# a later (long) run does not lose the finished ones.

# Rows are collected as plain dicts, seeded with whatever results_df already
# holds, and the table is rebuilt from them. This avoids concatenating onto an
# initially empty DataFrame, which is the line the notebook's captured output
# echoes as a warning.
result_columns = list(results_df.columns)
result_records = results_df.to_dict("records")

# target_tau_idx is a 0-d tensor; use a plain int when indexing NumPy arrays.
target_idx = int(target_tau_idx)

for run_num in range(NUM_RUNS):

    for exp_type in EXPERIMENTS:

        trainer = pl.Trainer(enable_progress_bar=False)
        name, min_sample_size, share_budget, naive = exp_type

        for budget in BUDGET_RANGE:

            print("-" * 50)
            print(f"Running {name} with budget {budget} (run {run_num + 1}/{NUM_RUNS})")
            print("-" * 50)

            start_time = time.time()

            # Call the conformalize function with the specified parameters.
            # NOTE(review): the keyword `canidate_taus` is kept exactly as the
            # original call spells it — confirm against conformalize's signature.
            result_tuple = conformalize(
                trainer=trainer,
                model=model,
                target_taus=target_taus,
                canidate_taus=TAUS_RANGE,
                X=ds_cal,
                generator_params=GENERATOR_PARAMS,
                rater_params=RATER_PARAMS,
                budget_per_sample=budget,
                share_budget=share_budget,
                min_sample_size=min_sample_size,
                naive=naive,
                text_prep_func="sentence_completion",
                multi_gpu=True,
                plot=False,
                return_extra=True,
                batch_size=1500,
            )

            (
                tau_hat,  # chosen tau for the target miscoverage
                max_est,  # maximum quantile prediction
                q_hats,  # quantile predictions for the chosen tau
                T_tilde,  # sampled survival time for all samples
                C,  # censoring time indicator
                quantile_est,  # predicted quantile estimates for all taus
                prior_quantile_est,  # each output is sampled at most prior_quantile_est times
                C_probs,  # sampling probability of each sample
                weights,  # weights used for the weighted miscoverage
                miscoverage,  # miscoverage rate for each tau
            ) = result_tuple

            # Wall-clock time of the conformalize call only.
            time_delta = time.time() - start_time

            # Index of the candidate tau closest to the chosen tau_hat. The
            # operand is a torch tensor, so use torch.argmin (as is done for
            # target_tau_idx) instead of passing the tensor through NumPy.
            tau_hat_idx = torch.argmin(torch.abs(TAUS_RANGE - tau_hat)).item()
            tau_hat_miscoverage = miscoverage[tau_hat_idx].item()
            tau_target_miscoverage = miscoverage[target_idx].item()

            # Mean of T_tilde and mean of C over the calibration samples.
            mean_generated_samples = T_tilde.mean().item()
            mean_c_value = C.mean().item()

            # compute LPB: mean test-set prediction in the columns selected by
            # tau_hat and by the target tau (rows: test samples, cols: taus).
            test_pred_raw = trainer.predict(model, dataloaders=dl_test)
            test_quantile_est = np.vstack([p["tau"].T for p in test_pred_raw])
            tau_hat_lpb = test_quantile_est[:, tau_hat_idx].mean().item()
            tau_target_lpb = test_quantile_est[:, target_idx].mean().item()

            # add results to dataframe
            result_dict = {
                "experiment": name,
                "budget": budget,
                "run_num": run_num,
                "tau_hat": tau_hat,
                "max_est": max_est,
                "calib_tau_hat_miscoverage": tau_hat_miscoverage,
                "calib_tau_target_miscoverage": tau_target_miscoverage,
                "calib_mean_generated_samples": mean_generated_samples,
                "calib_mean_c_value": mean_c_value,
                "test_tau_hat_lpb": tau_hat_lpb,
                "test_tau_target_lpb": tau_target_lpb,
                "time_delta": time_delta,
            }

            result_records.append(result_dict)
            results_df = pd.DataFrame(result_records, columns=result_columns)

            save_results(SAVE_PATH, results_df)

/home/fre.gilad/miniforge3/envs/vllm4/lib/python3.11/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/fre.gilad/miniforge3/envs/vllm4/lib/python3.11 ...
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/fre.gilad/miniforge3/envs/vllm4/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, un

--------------------------------------------------
Running Global Budgeting Medium with budget 599 (run 1/1)
--------------------------------------------------
GPU 0 processing 4972 prompts
INFO 04-30 01:47:31 [__init__.py:239] Automatically detected platform cuda.
INFO: Overhead tokens:  100
INFO: Empty input tokens:  1
INFO: Total sequence tokens:  71
INFO 04-30 01:47:49 [config.py:585] This model supports multiple tasks: {'reward', 'embed', 'score', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 04-30 01:47:49 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=106600.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO 04-30 01:47:52 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='meta-llama/Llama-3.2-1B', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=171, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=meta-llama/Llama-3.2-1B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.70it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.70it/s]



INFO 04-30 01:47:55 [loader.py:447] Loading weights took 0.71 seconds
INFO 04-30 01:47:55 [gpu_model_runner.py:1186] Model loading took 2.3185 GB and 2.118487 seconds
INFO 04-30 01:48:02 [backends.py:415] Using cache directory: /home/fre.gilad/.cache/vllm/torch_compile_cache/8ea18a64bd/rank_0_0 for vLLM's torch.compile
INFO 04-30 01:48:02 [backends.py:425] Dynamo bytecode transform time: 6.87 s
INFO 04-30 01:48:03 [backends.py:115] Directly load the compiled graph for shape None from the cache
INFO 04-30 01:48:08 [monitor.py:33] torch.compile takes 6.87 s in total
INFO 04-30 01:48:10 [kv_cache_utils.py:566] GPU KV cache size: 432,448 tokens
INFO 04-30 01:48:10 [kv_cache_utils.py:569] Maximum concurrency for 171 tokens per request: 2528.94x
INFO 04-30 01:48:24 [gpu_model_runner.py:1534] Graph capturing finished in 14 secs, took 0.90 GiB
INFO 04-30 01:48:24 [core.py:151] init engine (profile, create kv cache, warmup model) took 28.23 seconds


Processing Prompts: 100%|██████████| 4972/4972 [1:58:09<00:00,  1.43s/it, batch_num=2120, batch_time=0.20, time_remaining=0:00:00]             
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Results saved to results2.csv


  results_df = pd.concat([results_df, pd.DataFrame([result_dict])], ignore_index=True)
