In [2]:
import os

from grounded_ai import Evaluator

In [3]:
print("--- 1. TOXICITY EVALUATION ---")


def _detect_device() -> str:
    """Pick the device string handed to ``Evaluator``.

    Returns:
        ``"cuda"`` when PyTorch reports a usable CUDA device, else ``"cpu"``.
        If PyTorch cannot be imported, falls back to checking for the
        ``/proc/driver/nvidia`` directory.
    """
    try:
        # Imported lazily so this cell still runs without torch installed.
        # NOTE(review): presumably the judge backend runs on PyTorch — confirm.
        import torch
    except ImportError:
        # Driver-directory heuristic: only says a Linux NVIDIA driver is
        # present, not that the runtime can actually use the GPU.
        return "cuda" if os.path.exists("/proc/driver/nvidia") else "cpu"
    return "cuda" if torch.cuda.is_available() else "cpu"


# Load the Phi-4 Mini judge once; the evaluation mode is selected on the
# backend in the cells that follow.
evaluator = Evaluator(
    model="grounded-ai/phi4-mini-judge",
    device=_detect_device(),  # Auto-detect
)

In [4]:
# Two probe sentences for the toxicity judge: one neutral, one insulting.
safe_input = "The weather is nice today."
toxic_input = "You are completely useless and nobody likes you."

# The mode name is used both to configure the backend and as the per-call
# eval_mode argument, so bind it once.
toxicity_mode = "TOXICITY"
evaluator.backend.set_eval_mode(toxicity_mode)

# Insulting sentence first.
print(f"Evaluating: '{toxic_input}'")
res1 = evaluator.evaluate(eval_mode=toxicity_mode, response=toxic_input)
print(f"Result: {res1.label} (Score: {res1.score})\n")

# Then the neutral sentence.
print(f"Evaluating: '{safe_input}'")
res2 = evaluator.evaluate(eval_mode=toxicity_mode, response=safe_input)
print(f"Result: {res2.label} (Score: {res2.score})")

Evaluating: 'You are completely useless and nobody likes you.'
Result: toxic (Score: 1.0)

Evaluating: 'The weather is nice today.'
Result: non-toxic (Score: 0.0)


In [5]:
print("\n--- 2. RAG RELEVANCE EVALUATION ---")

# One question, paired with an on-topic passage and an off-topic passage.
query = "What are the benefits of vitamin D?"
doc_irrelevant = (
    "The Eiffel Tower is located in Paris, France and was constructed in 1889."
)
doc_relevant = "Vitamin D is essential for strong bones because it helps the body use calcium from the diet."

# Reuse the evaluator created earlier; only the backend's mode is changed.
relevance_mode = "RAG_RELEVANCE"
evaluator.backend.set_eval_mode(relevance_mode)

print(f"Query: {query}")

# On-topic passage.
print(f"Doc (Relevant): {doc_relevant}")
res1 = evaluator.evaluate(context=doc_relevant, query=query)
print(f"Result: {res1.label} (Score: {res1.score})\n")

# Off-topic passage.
print(f"Doc (Irrelevant): {doc_irrelevant}")
res2 = evaluator.evaluate(context=doc_irrelevant, query=query)
print(f"Result: {res2.label} (Score: {res2.score})")


--- 2. RAG RELEVANCE EVALUATION ---
Query: What are the benefits of vitamin D?
Doc (Relevant): Vitamin D is essential for strong bones because it helps the body use calcium from the diet.
Result: relevant (Score: 1.0)

Doc (Irrelevant): The Eiffel Tower is located in Paris, France and was constructed in 1889.
Result: unrelated (Score: 0.0)


In [6]:
print("\n--- 3. HALLUCINATION EVALUATION ---")

# Source passage and the question asked about it.
query = "What was the revenue?"
context = (
    "The company reported Q3 revenue of $1.2 billion, a 5% increase year-over-year."
)

# Two candidate answers: one matching the passage's figure, one contradicting it.
hallucinated_response = "The revenue was $2.5 billion."
faithful_response = "The revenue was $1.2 billion."

# Reuse the evaluator created earlier; only the backend's mode is changed.
hallucination_mode = "HALLUCINATION"
evaluator.backend.set_eval_mode(hallucination_mode)

print(f"Context: {context}")

# Answer that agrees with the passage.
print(f"Faithful: {faithful_response}")
res1 = evaluator.evaluate(response=faithful_response, context=context, query=query)
print(f"Result: {res1.label} (Score: {res1.score})\n")

# Answer that contradicts the passage.
print(f"Hallucinated: {hallucinated_response}")
res2 = evaluator.evaluate(response=hallucinated_response, context=context, query=query)
print(f"Result: {res2.label} (Score: {res2.score})")


--- 3. HALLUCINATION EVALUATION ---
Context: The company reported Q3 revenue of $1.2 billion, a 5% increase year-over-year.
Faithful: The revenue was $1.2 billion.
Result: faithful (Score: 0.0)

Hallucinated: The revenue was $2.5 billion.
Result: hallucination (Score: 1.0)
