# vLLM JSONSchemaBench + SchemaBench

Run JSONSchemaBench (epfl-dlab/jsonschemabench) and SchemaBench (thunlp/SchemaReinforcementLearning) with vLLM as the generation backend. Scoring uses the evaluation code imported directly from local checkouts of those two repositories.

## Setup
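Install the dependencies if needed, and clone both benchmark repositories under `BENCH_ROOT` (as `jsonschemabench-epfl` and `schemabench`); the path cell below adds those checkouts to `sys.path`. vLLM requires a compatible CUDA setup for GPU usage.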

In [None]:
# Optional: install dependencies
# !pip install -q vllm transformers datasets jsonschema jsonlines json5 tqdm numpy

In [None]:
from pathlib import Path
import sys

# Local checkouts of the two benchmark repositories, both under one root.
BENCH_ROOT = Path("/Users/songuijin/benchmarks")
JSONSCHEMA_REPO = BENCH_ROOT.joinpath("jsonschemabench-epfl")
SCHEMABENCH_REPO = BENCH_ROOT.joinpath("schemabench")

# Make both checkouts importable (the benchmark cells import `core.*` and
# `schemabench.*` from them). Appending keeps already-installed packages
# ahead of these directories on the search path.
for repo_dir in (JSONSCHEMA_REPO, SCHEMABENCH_REPO):
    sys.path.append(str(repo_dir))

# Echo the resolved locations, one per line.
print(JSONSCHEMA_REPO, SCHEMABENCH_REPO, sep="\n")

## Model configuration

In [None]:
# --- Engine settings: forwarded to the vLLM `LLM(...)` constructor ---
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
DTYPE = "auto"
TENSOR_PARALLEL_SIZE = 1
GPU_MEMORY_UTILIZATION = 0.9
# When left as None, `max_model_len` is not passed to the engine at all.
MAX_MODEL_LEN = None

# --- Sampling settings: forwarded to `SamplingParams(...)` ---
MAX_NEW_TOKENS = 512  # used as `max_tokens`
TEMPERATURE = 0.0
TOP_P = 0.95

In [None]:
import time
from vllm import LLM, SamplingParams

# `max_model_len` is only forwarded when it was explicitly configured.
optional_llm_kwargs = {} if MAX_MODEL_LEN is None else {"max_model_len": MAX_MODEL_LEN}
llm_kwargs = {
    "model": MODEL_ID,
    "tensor_parallel_size": TENSOR_PARALLEL_SIZE,
    "dtype": DTYPE,
    "gpu_memory_utilization": GPU_MEMORY_UTILIZATION,
    **optional_llm_kwargs,
}

# Build the engine once for the whole notebook; the tokenizer is kept around
# so prompts can be rendered with the model's chat template.
llm = LLM(**llm_kwargs)
tokenizer = llm.get_tokenizer()

# Default sampling configuration used by `generate_one` when none is supplied.
sampling_params = SamplingParams(temperature=TEMPERATURE, top_p=TOP_P, max_tokens=MAX_NEW_TOKENS)

def render_chat_messages(messages):
    """Render a list of chat messages as a single prompt string.

    When the tokenizer offers ``apply_chat_template`` it is used, untokenized
    and with the generation prompt appended. Otherwise each message becomes a
    ``role: content`` line and a final ``assistant:`` line is added.

    Args:
        messages: Sequence of mappings with ``"role"`` and ``"content"`` keys.

    Returns:
        The prompt text to hand to the engine.
    """
    if not hasattr(tokenizer, "apply_chat_template"):
        # Plain-text fallback for tokenizers without chat-template support.
        transcript = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)
        return transcript + "\nassistant:"

    # NOTE(review): some chat templates reject certain role sequences (for
    # example a leading system message); confirm the benchmark prompts are
    # accepted by the template of the configured model.
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

def extract_json_text(text):
    """Pull the JSON payload out of a raw model completion.

    Resolution order:

    1. If the text contains a ```` ```json ```` fence, return what sits between
       the first such fence and the next closing fence (or the end of the text
       when the fence is never closed).
    2. Otherwise, if the text contains any other code fence (bare ```` ``` ````
       or tagged with something like ``JSON``), return the content of the
       first fenced section, minus the tag line.
    3. Otherwise return the whole text.

    The result is always stripped of surrounding whitespace. No JSON parsing
    or validation happens here.

    Args:
        text: Raw completion text.

    Returns:
        The extracted candidate JSON string.
    """
    fence = "```"

    if "```json" in text:
        return text.split("```json", 1)[1].split(fence, 1)[0].strip()

    _, opened, after_open = text.partition(fence)
    if not opened:
        # No fence at all: the completion itself is the candidate.
        return text.strip()

    body = after_open.split(fence, 1)[0]

    # Drop a language tag on the opening-fence line (e.g. "JSON"), but only
    # when real content follows it, so a fenced one-line scalar such as
    # "```42\n```" is not mistaken for a tag and thrown away.
    tag, newline, remainder = body.partition("\n")
    if newline and tag.strip().isalnum() and remainder.strip():
        body = remainder

    # An empty fenced section (e.g. a stray trailing fence) carries no
    # payload; fall back to the full text rather than returning nothing.
    return body.strip() or text.strip()

def generate_one(messages, params=None):
    """Generate one completion for a chat prompt and collect basic stats.

    The messages are rendered to a prompt, sent to the engine as a
    single-element batch, and the first completion is post-processed with
    ``extract_json_text``.

    Args:
        messages: Chat messages accepted by ``render_chat_messages``.
        params: Sampling parameters for this call; the notebook-level
            ``sampling_params`` are used when omitted.

    Returns:
        A dict with keys ``prompt``, ``raw_text``, ``json``, ``elapsed``
        (seconds spent in the generate call), ``prompt_tokens``,
        ``output_tokens`` and ``tpot_ms``. ``tpot_ms`` is the elapsed time of
        the whole call divided by the number of output tokens, in
        milliseconds, or ``None`` when no tokens were produced.
    """
    prompt = render_chat_messages(messages)

    # Time the complete generate call (the measurement covers everything the
    # engine does for this request, not only decoding).
    started_at = time.perf_counter()
    batch = llm.generate([prompt], params or sampling_params)
    elapsed = time.perf_counter() - started_at

    request_output = batch[0]
    completion = request_output.outputs[0]
    raw_text = completion.text

    # Missing or empty token id sequences count as zero tokens.
    prompt_tokens = len(request_output.prompt_token_ids or ())
    output_tokens = len(completion.token_ids or ())

    if output_tokens:
        tpot_ms = (elapsed / output_tokens) * 1000
    else:
        tpot_ms = None

    return {
        "prompt": prompt,
        "raw_text": raw_text,
        "json": extract_json_text(raw_text),
        "elapsed": elapsed,
        "prompt_tokens": prompt_tokens,
        "output_tokens": output_tokens,
        "tpot_ms": tpot_ms,
    }

## JSONSchemaBench (epfl-dlab/jsonschemabench)

In [None]:
from core.dataset import Dataset, DatasetConfig, DATASET_NAMES
from core.messages import FEW_SHOTS_MESSAGES_FORMATTER
from core.evaluator import evaluate
from core.types import (
    GenerationOutput,
    TokenUsage,
    PerfMetrics,
    CompileStatus,
    CompileStatusCode,
)

# Task to run and the `limit` handed to DatasetConfig
# (presumably a cap on the number of samples; confirm against the repo).
TASK, LIMIT = "Kubernetes", 5

print("Available tasks:", DATASET_NAMES)

dataset_config = DatasetConfig(TASK, limit=LIMIT)
dataset = Dataset(dataset_config)

In [None]:
# Preview: render the first few-shot prompt exactly as it will be sent to the model.
first_sample = next(dataset.iter(FEW_SHOTS_MESSAGES_FORMATTER))
sample_messages, sample_schema = first_sample
sample_prompt = render_chat_messages(sample_messages)
print(sample_prompt)

In [None]:
# Generate one completion per dataset item and wrap it in the repo's
# GenerationOutput record so the repo's own `evaluate` can score it.
outputs = []
for messages, schema in dataset.iter(FEW_SHOTS_MESSAGES_FORMATTER):
    generated = generate_one(messages)

    output = GenerationOutput(
        task=TASK,
        messages=messages,
        generation=generated["json"],
        schema=schema,
    )

    # No schema compilation step happens in this notebook, so every sample is
    # recorded with an OK compile status.
    ok_status = CompileStatus(code=CompileStatusCode.OK)
    output.metadata.compile_status = ok_status

    usage = TokenUsage(
        input_tokens=generated["prompt_tokens"],
        output_tokens=generated["output_tokens"],
    )
    output.token_usage = usage

    # Only total generation time and time-per-output-token are measured here.
    timings = PerfMetrics(tgt=generated["elapsed"], tpot=generated["tpot_ms"])
    output.perf_metrics = timings

    outputs.append(output)

# Aggregate scoring is delegated to the benchmark repository.
dc, ec, compliance, perf_metrics, output_tokens = evaluate(outputs)

def metric_to_dict(metric):
    """Return the summary statistics of *metric* as a plain dict.

    Args:
        metric: Object exposing ``min``, ``max``, ``median`` and ``std``
            attributes.

    Returns:
        A dict with the keys ``min``, ``max``, ``median`` and ``std``, in that
        order, holding the corresponding attribute values.
    """
    summary_fields = ("min", "max", "median", "std")
    return {field: getattr(metric, field) for field in summary_fields}

# Quality metrics returned by `evaluate`.
for label, metric in (
    ("Declared coverage", dc),
    ("Empirical coverage", ec),
    ("Compliance", compliance),
):
    print(label, metric_to_dict(metric))

# Timing metrics; each attribute is looked up right before it is printed.
for label, attr in (("TTFT", "ttft"), ("TPOT", "tpot"), ("TGT", "tgt"), ("GCT", "gct")):
    print(label, metric_to_dict(getattr(perf_metrics, attr)))

print("Output tokens", metric_to_dict(output_tokens))

## SchemaBench (thunlp/SchemaReinforcementLearning)
SchemaBench requires data files in `schemabench/data` inside the SchemaBench checkout. Follow the download instructions in the repo README and place the files in the expected folders before running the cells below.

In [None]:
from schemabench.bench.union import UnionSyntaxBench

# Expected location of the benchmark data inside the SchemaBench checkout.
SCHEMABENCH_DATA = SCHEMABENCH_REPO.joinpath("schemabench", "data")
if not SCHEMABENCH_DATA.exists():
    # Warning only: the cell keeps going after printing this message.
    print("SchemaBench data not found. Download and place data into:", SCHEMABENCH_DATA)

BENCH_CATEGORY = "schema"  # all, schema, reasoning, complex, custom, escape, math500, mmlu, arc, gsm8k
N_SHOTS = 3
SUBSET = True
MAX_SAMPLES = 5  # number of benchmark items consumed by the run cell

bench = UnionSyntaxBench(
    n_shots=N_SHOTS,
    subset=SUBSET,
    test_category=BENCH_CATEGORY,
)

# Preview the prompt of the first benchmark item.
sample_item = next(iter(bench))
sample_messages = sample_item.get_prompt()
sample_prompt = render_chat_messages(sample_messages)
print(sample_prompt)

In [None]:
import asyncio
import itertools

async def run_schemabench(bench, max_samples):
    """Run the first *max_samples* benchmark items and score each answer.

    Items are processed one at a time: the prompt is generated in a worker
    thread via ``generate_one``, then the extracted answer is passed to the
    item's own ``validate`` coroutine.

    Args:
        bench: Iterable of benchmark items exposing ``get_prompt()``,
            ``validate()`` and a ``question`` attribute.
        max_samples: Maximum number of items to take from *bench*.

    Returns:
        A list of dicts with the keys ``dataset``, ``question``, ``schema``,
        ``answer``, ``score`` and ``error``. When validation raises, the item
        is recorded with ``score`` set to ``False`` and ``error`` set to
        ``"<ExceptionType>: <message>"``; otherwise ``error`` is ``None``.
    """
    records = []
    for item in itertools.islice(bench, max_samples):
        prompt_messages = item.get_prompt()
        # Generation is blocking, so it is moved off the event loop thread.
        generated = await asyncio.to_thread(generate_one, prompt_messages)
        answer = generated["json"]

        try:
            score = await item.validate(answer)
        except Exception as exc:
            # A validator failure counts as a wrong answer; the reason is
            # kept so it can be inspected afterwards.
            score, error = False, f"{type(exc).__name__}: {exc}"
        else:
            error = None

        records.append(
            {
                "dataset": item.question.dataset,
                "question": item.question.query,
                "schema": item.question.validate_schema,
                "answer": answer,
                "score": score,
                "error": error,
            }
        )
    return records

results = await run_schemabench(bench, MAX_SAMPLES)
accuracy = sum(1 for r in results if r["score"]) / max(len(results), 1)

print("SchemaBench accuracy", accuracy)
results[:2]