In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from tqdm import tqdm
import numpy as np
import pandas as pd
import os

In [None]:
"""
Verifier is used to check the quality of synthetic QA using Llama-Guard-3-8B
"""

In [None]:
# Directory holding the notebook's input/output files, and the guard model identifier.
out_dir = "output"
model_name = "meta-llama/Llama-Guard-3-8B"

# Read the responses file from out_dir; the trailing expression displays the frame.
responses_path = f"{out_dir}/all_responses.json"
df_data = pd.read_json(responses_path)
df_data

In [22]:
# Load the tokenizer for model_name; a later cell uses its chat template to render the guard prompts.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Instantiate the vLLM engine for the guard model.
llm = LLM(
    model=model_name,
    max_model_len=4096,
    tensor_parallel_size=4,
    gpu_memory_utilization=0.7,
)

In [24]:
# The guard model is used here as a classifier, so decode greedily (temperature=0).
# Sampling at temperature 0.9 / top_p 0.97 lets the safe/unsafe verdict change between
# runs on identical input. max_tokens=40 caps the length of the generated verdict.
sampling_params = SamplingParams(temperature=0.0, max_tokens=40)

In [25]:
def extract_resp(msg):
    """Return the agent's reply from a two-message conversation.

    Args:
        msg: Sequence of message dicts; the second entry (index 1) must have a
            "content" string.

    Returns:
        The text following the first "Agent:\\n" marker in that content. If the
        marker is absent, the content is returned unchanged.
    """
    assistant_content = msg[1]["content"]
    # partition() reports whether the marker was found. The earlier find()+7
    # arithmetic turned a miss (-1) into index 6 and silently dropped the first
    # six characters of the response.
    _before, marker, after = assistant_content.partition("Agent:\n")
    return after if marker else assistant_content

In [26]:
# Work on the full frame; append the commented-out sample call for a quick 100-row run.
subset = df_data  # .sample(100,random_state=10)

# Render one prompt string per row: the row's prompt as the user turn and the
# extracted agent reply as the assistant turn, passed through the chat template.
formatted = [
    tokenizer.apply_chat_template(
        [
            {"content": record["prompt"], "role": "user"},
            {"content": extract_resp(record["messages"]), "role": "assistant"},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    for _idx, record in subset.iterrows()
]

In [None]:
# Spot-check one rendered prompt. Index 76 is hard-coded, so this raises IndexError
# when fewer than 77 prompts were built (e.g. on a small sample).
print(formatted[76])

In [None]:
# Generate a guard verdict for every rendered prompt. The post-processing cell zips
# `outputs` with the rows of `subset`, so it relies on one output per prompt, in order.
outputs = llm.generate(formatted, sampling_params)

In [None]:
def set_out(row, out):
    """Store a guard label on ``row`` based on the generated text ``out``.

    ``row["Guard"]`` becomes "unsafe" if ``out`` contains "unsafe", otherwise
    "safe" if it contains "safe", otherwise "none". ``row`` is modified in
    place and also returned.
    """
    # Order matters: "unsafe" contains "safe" as a substring, so test it first.
    for label in ("unsafe", "safe"):
        if label in out:
            row["Guard"] = label
            break
    else:
        row["Guard"] = "none"
    return row

# zip() stops at the shorter input, so a stale `outputs` list would silently drop
# or misalign rows. Fail loudly instead.
if len(outputs) != len(subset):
    raise ValueError(
        f"expected one generation per row, got {len(outputs)} outputs for {len(subset)} rows"
    )

# Collect the labelled rows and build the frame once. Concatenating a one-row
# frame per iteration copied the whole accumulated frame each time (quadratic),
# and was presumably the source of the warnings that were being suppressed.
guard_rows = []
for (row_idx, row), out in tqdm(zip(subset.iterrows(), outputs), total=len(outputs), desc="post generation processing"):
    verdict_text = out.outputs[0].text.lower()
    new_row = set_out(row, verdict_text)
    new_row["Guard_prompt"] = out.prompt
    guard_rows.append(new_row)

# With no rows, keep the input's columns so later column lookups still work.
subset_out = pd.DataFrame(guard_rows) if guard_rows else pd.DataFrame(columns=subset.columns)

In [32]:
# Keep only the rows whose llm_judge value is something other than "none".
judged_mask = subset_out["llm_judge"] != "none"
subset_out = subset_out[judged_mask]

In [None]:
# Row counts per Guard label ("unsafe", "safe" or "none", as assigned by set_out).
subset_out["Guard"].value_counts()

In [None]:
# Row counts per safe_category value. This column is not created in this notebook,
# so it presumably comes from the loaded responses file (verify against its producer).
subset_out["safe_category"].value_counts()

In [None]:
# Row counts per llm_judge value; rows equal to "none" were removed by the filter cell.
subset_out["llm_judge"].value_counts()

In [36]:
# Write the labelled, filtered rows to JSON in out_dir.
guard_json_path = f"{out_dir}/full_responses_guard.json"
subset_out.to_json(guard_json_path)