In [1]:
import pytorch_lightning as pl
from huggingface_hub.hf_api import HfFolder
import torch
import numpy as np
from torch.utils.data import DataLoader
import pandas as pd

from src.failure_model import ToxicClassifier
from src.datasets import PromptOnlyDataset, SurvivalDataset
from src.conformal import conformalize
from src import utils

import time
import os
import sys
import logging


import torch._dynamo


# NOTE: suppress TorchDynamo errors instead of letting them abort the run.
# NOTE(review): per the PyTorch docs this makes compilation failures fall back
# to eager execution rather than raising; confirm that hiding these failures
# is really intended, since it can mask genuine compile problems.
torch._dynamo.config.suppress_errors = True

INFO 05-07 22:52:44 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-07 22:52:45 [__init__.py:239] Automatically detected platform cuda.


In [2]:
def save_results(save_path, df):
    """Write ``df`` to ``save_path`` as CSV (without the index) and report it.

    Args:
        save_path: Destination path of the CSV file.
        df: DataFrame holding the results to persist.
    """
    df.to_csv(path_or_buf=save_path, index=False)
    message = f"Results saved to {save_path}"
    print(message)


def load_results(save_path):
    """Read a results CSV from ``save_path`` and return it as a DataFrame.

    No column is used as the index, so the frame gets a default integer index.

    Args:
        save_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame.
    """
    loaded = pd.read_csv(save_path, index_col=None)
    message = f"Results loaded from {save_path}"
    print(message)
    return loaded

In [4]:

# Input artifacts for the test split.
TEST_PROMPTS_PATH = "data/test_prompt_only.pkl"
TEST_SURV_TIME_PATH = "data/test_surv_times.npy"

# Checkpoint under evaluation; the commented-out entries are alternative runs.
# MODEL_PATH = "saved/Prop_rtp_500_ModernBERT/lightning_logs/version_0/checkpoints/epoch=4-step=495.ckpt"
MODEL_PATH = "saved/Prop_rtp_500_ModernBERT/lightning_logs/version_1/checkpoints/epoch=7-step=792.ckpt"
# MODEL_PATH = "saved/Jigsaw_BERT/lightning_logs/version_1/checkpoints/epoch=4-step=970.ckpt"

# TARGET_TAUS is the level we want to evaluate; TAUS_RANGE is the grid of
# levels handed to the model via `model.set_taus`.
TARGET_TAUS = torch.tensor([0.1])
TAUS_RANGE = torch.tensor([0.1])
# Position in TAUS_RANGE of the grid entry closest to the target level.
# Both operands are torch tensors, so use torch.argmin directly rather than
# relying on numpy implicitly converting a tensor (which breaks for tensors
# that live on the GPU or track gradients).
TAU_TARGET_IDX = torch.argmin(torch.abs(TAUS_RANGE - TARGET_TAUS)).item()

# Batch size used when running prediction over the test prompts.
BATCH_SIZE = 1500

# Destination CSV for the uncalibrated baseline results.
SAVE_PATH = "results_uncalib.csv"

In [5]:
# load data

# Prompts are served in their stored order (shuffle=False) so that predictions
# line up row-for-row with the survival times loaded below.
ds_test = PromptOnlyDataset(TEST_PROMPTS_PATH)
# Use the BATCH_SIZE constant from the config cell instead of repeating the
# literal, so changing the config actually changes the loader.
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False)
test_t_tilde = np.load(TEST_SURV_TIME_PATH)

# The coverage metrics compare predictions and survival times element-wise;
# fail early with a clear message if the two files are out of sync.
assert len(test_t_tilde) == len(ds_test), (
    f"Got {len(test_t_tilde)} survival times for {len(ds_test)} test prompts"
)

print(f"Loaded {len(ds_test)} test samples.")

Loaded 12000 test samples.


In [6]:
# load model
# Restore the classifier from the selected checkpoint and switch it to eval mode.
model = ToxicClassifier.load_from_checkpoint(MODEL_PATH)
_ = model.eval()  # assign the return value so the cell does not echo it

# Configure the levels to predict at and the minimum probability setting.
# NOTE(review): the exact meaning of `min_p_for_q_tau` lives in
# src.failure_model and is not visible here; confirm 1e-20 is intended.
min_p_for_q_tau = 1e-20
model.set_taus(TAUS_RANGE)
model.set_min_p_for_q_tau(min_p_for_q_tau)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# NOTE: do not enable multiple GPUs for inference, as it causes weird bugs
trainer = pl.Trainer(accelerator="gpu", devices=1)

# compute LPB on test set
test_pred_raw = trainer.predict(model, dataloaders=dl_test)

# Each batch output carries a "tau" entry; transpose it and stack the batches
# row-wise, then floor every estimate at 1.
# NOTE(review): presumably "tau" is (n_taus, batch) so that columns index the
# tau grid after the transpose; confirm against ToxicClassifier.predict_step.
per_batch_quantiles = [batch_out["tau"].T for batch_out in test_pred_raw]
test_quantile_est = np.vstack(per_batch_quantiles).clip(min=1)

# Keep only the column for the target level; the int64 cast truncates any
# fractional part of the estimates.
target_column = test_quantile_est[:, TAU_TARGET_IDX]
tau_hat_pred = target_column.flatten().astype(np.int64)
test_mean_lpb = tau_hat_pred.mean().item()

# compute LPB on test set for covered samples
# A sample counts as covered when its observed time is at least the predicted bound.
is_covered = test_t_tilde >= tau_hat_pred
test_mean_covered_lpb = tau_hat_pred[is_covered].mean().item()

# compute miscoverage upper-bound on test set
# Fraction of samples whose observed time falls strictly below the predicted bound.
is_miscovered = test_t_tilde < tau_hat_pred
test_miscoverage = np.mean(is_miscovered)

# add results to dataframe
# Calibration-related fields are NaN/False because this baseline uses the raw
# model predictions without any calibration step.
result_dict = {
    "exp_name": "Uncalibrated",
    "exp_min_sample_size": np.nan,
    "exp_share_budget": False,
    "exp_naive": False,
    "exp_budget": np.nan,
    "exp_run_num": 0,
    # TAU_TARGET_IDX is a position in TAUS_RANGE, not in TARGET_TAUS, so using
    # it to index TARGET_TAUS raises IndexError as soon as the grid has more
    # than one entry and the best match is not at position 0. Report the
    # target level itself instead.
    # NOTE(review): assumes TARGET_TAUS holds a single target level — confirm.
    "tau_hat": TARGET_TAUS[0].item(),
    "max_est": np.inf,
    "cal_hours": np.nan,
    "cal_mean_generated_samples": np.nan,
    "cal_mean_c_value": np.nan,
    "cal_miscoverage": np.nan,
    "test_miscoverage": test_miscoverage,
    "test_mean_lpb": test_mean_lpb,
    "test_mean_covered_lpb": test_mean_covered_lpb
}

print(f"Test miscoverage: {test_miscoverage:.4f}")
print(f"Test mean LPB: {test_mean_lpb:.4f}")
print(f"Test mean covered LPB: {test_mean_covered_lpb:.4f}")

# Single-row frame: one record per experiment run.
result_df = pd.DataFrame([result_dict])
save_results(SAVE_PATH, result_df)

/home/fre.gilad/miniforge3/envs/llm-survival-final/lib/python3.11/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/fre.gilad/miniforge3/envs/llm-survival-final/l ...
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/fre.gilad/miniforge3/envs/llm-survival-final/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=T

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/fre.gilad/miniforge3/envs/llm-survival-final/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

Test miscoverage: 0.1357
Test mean LPB: 532.2369
Results saved to results_uncalib.csv
