#### Pip and installs

In [None]:
!git clone https://github.com/gpapageorgiouedu/Evaluating-Faithfulness-in-Agentic-RAG-Systems-for-e-Governance-applications-LLM-Based.git

In [None]:
# Important: after the package installation below finishes, restart the session so the updated packages are loaded

!pip install -q \
neo4j-haystack==2.2.1 \
anthropic-haystack==3.1.0 \
google-ai-haystack==5.3.0 \
openai==1.72.0 \
sentence-transformers==3.4.1 \
yfiles_jupyter_graphs==1.10.2 \
trafilatura==2.0.0 \
demjson3==3.0.6 \
tiktoken==0.9.0

In [None]:
# core libs imports for data handling, file management, and type annotations
import json
import demjson3
import openai
import os
import re
import ast
import shutil
import time
from collections import defaultdict
from pathlib import Path
from typing import List, Dict, Any, Optional
import pandas as pd
import numpy as np
import tiktoken

# google colab utils for output and secure data storage
from google.colab import output, userdata
from google.colab.output import eval_js
from google.colab import drive
from google.colab import files

# neo4j database integration
from neo4j import GraphDatabase, basic_auth

# haystack/ neo4j integration components
from neo4j_haystack import Neo4jDocumentStore, Neo4jEmbeddingRetriever

# graph visualization in notebooks (explanatory)
from yfiles_jupyter_graphs import GraphWidget

# haystack components for pipeline construction and data processing
from haystack import Pipeline, component, Document
from haystack.core.component import component, Component
from haystack.components.agents import Agent
from haystack.components.builders import PromptBuilder
from haystack.components.converters import HTMLToDocument
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.generators import OpenAIGenerator
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack_integrations.components.generators.anthropic import AnthropicChatGenerator
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiChatGenerator
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.evaluators import FaithfulnessEvaluator, ContextRelevanceEvaluator
from haystack.core import SuperComponent
from haystack.dataclasses import ChatMessage, ToolCall, ToolCallResult, TextContent
from haystack.tools.component_tool import ComponentTool
from haystack.utils import Secret

In [None]:
# set environment variables from secure colab userdata and read environment variables into local constants
os.environ["NEO4J_URI"] = userdata.get("NEO4J_URI")
os.environ["NEO4J_USERNAME"] = userdata.get("NEO4J_USERNAME")
os.environ["NEO4J_PASSWORD"] = userdata.get("NEO4J_PASSWORD")
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI")
os.environ["ANTHROPIC_API_KEY"] = userdata.get("ANTHROPIC")
os.environ["GOOGLE_API_KEY"] = userdata.get("GEMINI")
os.environ["SERPERDEV_API_KEY"] = userdata.get("SERPER")

NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USER = os.environ["NEO4J_USERNAME"]
NEO4J_PASS = os.environ["NEO4J_PASSWORD"]
SERPERDEV_API_KEY = os.environ["SERPERDEV_API_KEY"]

# init and test neo4j connection
# The driver is deliberately left open when the check succeeds: later cells keep
# calling driver.session() on this same object. It is closed only if the
# connectivity check itself fails, and the error is re-raised.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
try:
    with driver.session() as session:
        info = session.run("RETURN 1 AS result").single()
        print("Neo4j connected, test query result:", info["result"])
except Exception:
    driver.close()
    raise

#### Delete Docs (Optional)

In [None]:
def delete_all(tx):
    """
    Remove every node in the graph together with its relationships.

    Args:
        tx: Neo4j transaction object used to run the Cypher statement.
    """
    wipe_statement = "MATCH (n) DETACH DELETE n"
    tx.run(wipe_statement)


def count_remaining(tx):
    """
    Report how many nodes are currently stored in the graph.

    Args:
        tx: Neo4j transaction object used to run the Cypher statement.

    Returns:
        int: The value of the `node_count` column of the single result row.
    """
    count_statement = "MATCH (n) RETURN count(n) AS node_count"
    row = tx.run(count_statement).single()
    return row["node_count"]


def list_node_labels(tx):
    """
    Collect the label names reported by `CALL db.labels()`.

    Args:
        tx: Neo4j transaction object used to run the procedure call.

    Returns:
        List[str]: The `label` field of every returned record, in result order.
    """
    label_names = []
    for row in tx.run("CALL db.labels()"):
        label_names.append(row["label"])
    return label_names


def count_by_label(tx, label):
    """
    Count the number of nodes for a specific label.

    The label is embedded in the query as a backtick-quoted identifier because
    Cypher does not allow labels to be passed as parameters. Any backtick inside
    the label is doubled so that it cannot terminate the quoted identifier.

    Args:
        tx: Neo4j transaction object.
        label (str): The label to count.

    Returns:
        int: Number of nodes with the given label.
    """
    quoted_label = label.replace("`", "``")
    result = tx.run(f"MATCH (n:`{quoted_label}`) RETURN count(n) AS count")
    return result.single()["count"]


# display node counts grouped by label
# execute_read is the managed-transaction API that supersedes the deprecated
# read_transaction in the Neo4j Python driver 5.x.
with driver.session() as session:
    labels = session.execute_read(list_node_labels)
    for label in labels:
        count = session.execute_read(count_by_label, label)
        print(f"Label: {label}, Count: {count}")


def force_delete_by_labels(tx, labels):
    """
    Forcefully delete nodes by specified labels.

    One `DETACH DELETE` statement is run per label. Each label is embedded as a
    backtick-quoted identifier, with embedded backticks doubled so that a label
    cannot break out of the quoting.

    Args:
        tx: Neo4j transaction object.
        labels (List[str]): A list of node labels to delete.
    """
    for label in labels:
        quoted_label = label.replace("`", "``")
        tx.run(f"MATCH (n:`{quoted_label}`) DETACH DELETE n")


# delete all nodes by label, and verify complete deletion
# execute_read / execute_write replace the deprecated read_transaction /
# write_transaction calls of the Neo4j Python driver 5.x.
with driver.session() as session:
    labels = session.execute_read(list_node_labels)
    session.execute_write(force_delete_by_labels, labels)

# second pass removes whatever is left, then the node count confirms the result
with driver.session() as session:
    session.execute_write(delete_all)
    remaining = session.execute_read(count_remaining)

    if remaining == 0:
        print("All nodes and relationships successfully deleted.")
    else:
        print(f"Deletion incomplete: {remaining} node(s) still exist.")

#### Index Docs (Optional)

In [None]:
# mount Google Drive and copy the source documents into a local working folder
# (alternatively point src_folder at a local repo folder or upload the files manually)
drive.mount('/content/drive')

src_folder = '/content/drive/My Drive/folder' # direct it into your folder or make a manual upload
dst_folder = 'json_docs'

os.makedirs(dst_folder, exist_ok=True)

# only regular files are copied; sub-directories of the source folder are ignored
with os.scandir(src_folder) as entries:
    for entry in entries:
        if not entry.is_file():
            continue
        shutil.copy(entry.path, os.path.join(dst_folder, entry.name))

In [None]:
def load_json_documents(json_folder_path):
    """
    Load JSON files from a folder and convert them into Haystack Document objects.

    Each file is expected to hold one JSON object. Its "content" value becomes the
    document content (empty string when absent) and every other key becomes
    metadata. A "url" key is renamed to "source_url" unless the file already
    provides "source_url". Files whose top-level JSON value is not an object are
    skipped with a message.

    Args:
        json_folder_path (str or Path): Path to the folder containing .json files.

    Returns:
        List[Document]: A list of Haystack Document objects with content and metadata.
    """
    documents = []
    for json_file in Path(json_folder_path).glob("*.json"):
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, dict):
            print(f"Skipping {json_file}: expected a JSON object, got {type(data).__name__}")
            continue

        # standardize metadata keys -> add any that fit your data use case
        # (popping "url" here; popping "source_url" raised KeyError for files
        # that only carried "url")
        if "url" in data and "source_url" not in data:
            data["source_url"] = data.pop("url")

        content = data.get("content", "")
        metadata = {k: v for k, v in data.items() if k != "content"}

        documents.append(Document(content=content, meta=metadata))
    return documents


# read the JSON files copied into the local working folder
raw_docs = load_json_documents("json_docs")

# normalise the text before embedding: drop empty lines, collapse extra
# whitespace and strip literal "..." substrings
cleaner = DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_substrings=["..."],
)
cleaned_docs = cleaner.run(documents=raw_docs)["documents"]

# neo4j-backed document store that holds the documents and their embeddings
document_store = Neo4jDocumentStore(
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASS,
    database="neo4j",
    index="document-embeddings",
    embedding_field="embedding",
    embedding_dim=1536,
    node_label="Document",
)

# compute one embedding per cleaned document (ada-002 in our use case)
embedder = OpenAIDocumentEmbedder(model="text-embedding-ada-002")
documents_with_emb = embedder.run(documents=cleaned_docs)["documents"]

# persist the embedded documents and report how many the store now contains
document_store.write_documents(documents_with_emb)
print(f"Indexed {document_store.count_documents()} documents in Neo4j.")

In [None]:
def count_tokens(text, model="gpt-4.1-mini"):
    """
    Return how many tokens `text` encodes to.

    The tokenizer is looked up from the model name; if that lookup raises, the
    "cl100k_base" encoding is used instead.

    Parameters:
        text (str): The input text to tokenize.
        model (str): Model name used to select the tokenizer. Defaults to "gpt-4.1-mini".

    Returns:
        int: The number of tokens in the input text.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except Exception:
        encoder = tiktoken.get_encoding("cl100k_base")
    token_ids = encoder.encode(text)
    return len(token_ids)

In [None]:
def evaluate_triples_with_gpt_metrics(text, extracted_triples, doc_id, model="gpt-4.1"):
    """
    Ask an OpenAI chat model to judge extracted triples against their source text.

    The model is prompted to label each triple 'Correct' or 'Hallucinated', explain
    the verdict, and list factual triples it considers missing. The reply is parsed
    as JSON after stripping an optional surrounding Markdown code fence.

    Args:
        text (str): The passage the triples were extracted from.
        extracted_triples: JSON-serialisable triples to evaluate (embedded in the prompt).
        doc_id: Identifier stored under 'doc_id' in the returned evaluation.
        model (str): OpenAI chat model name. Defaults to "gpt-4.1".

    Returns:
        dict | None: The parsed evaluation object with 'doc_id', 'input_tokens' and
        'latency_seconds' added, or None when the reply is not a valid JSON object.
    """
    system_prompt = (
        "You are a rigorous information extraction evaluator. You receive a passage of text and a set of knowledge triples. "
        "For each triple, determine if it is 'Correct' (supported by the text) or 'Hallucinated' (not fully supported by the text). "
        "Give a brief explanation for each. If there are factual triples in the text that are missing from the set, list them as 'missed_triples'. "
        "Return your evaluation as JSON in this format: "
        "{ 'results': [ {'triple': {...}, 'evaluation': 'Correct' or 'Hallucinated', 'explanation': '...' } ], 'missed_triples': [ {...}, ... ] }"
    )

    user_prompt = f"Text:\n{text}\n\nExtracted Triples:\n{json.dumps(extracted_triples, indent=2)}"

    # count input tokens
    input_tokens = count_tokens(system_prompt + user_prompt, model)

    # time only the API round trip
    start_time = time.perf_counter()
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    latency = time.perf_counter() - start_time

    # message.content may be None; treat that like an unparsable reply
    content = (response.choices[0].message.content or "").strip()
    # drop a surrounding ``` / ```json fence, as the extraction step also does
    content = re.sub(r"^```(?:json)?\s*", "", content)
    content = re.sub(r"\s*```$", "", content)

    try:
        evaluation = json.loads(content)
    except json.JSONDecodeError as e:
        print("Error parsing evaluation response:", e)
        print("Raw response was:\n", content)
        return None

    # the metrics below are attached as keys, so anything but an object is a failure
    if not isinstance(evaluation, dict):
        print("Error parsing evaluation response: expected a JSON object, got", type(evaluation).__name__)
        print("Raw response was:\n", content)
        return None

    evaluation['doc_id'] = doc_id
    evaluation['input_tokens'] = input_tokens
    evaluation['latency_seconds'] = latency

    return evaluation

In [None]:
def extract_structured_triples_with_metrics(text_chunk, doc_id, model="gpt-4.1-mini"):
    """
    Extract structured knowledge triples from a given text using an OpenAI language model.

    Each extracted triple is a dictionary with:
    - 'head': subject of the triple
    - 'head_type': type/classification of the head
    - 'relation': relationship between head and tail (in UPPER_SNAKE_CASE)
    - 'tail': object of the triple
    - 'tail_type': type/classification of the tail

    Args:
        text_chunk (str): A passage of text from which to extract triples.
        doc_id: Identifier echoed back under 'doc_id' in the result.
        model (str): OpenAI chat model name. Defaults to "gpt-4.1-mini".

    Returns:
        dict: A dictionary with the keys
            'doc_id' (the given identifier),
            'triples' (list of triple dictionaries; empty when the reply cannot
            be parsed as a JSON list),
            'latency_seconds' (duration of the API call), and
            'input_tokens' (token count of system + user prompt).
    """
    system_prompt = (
        "You are an information extraction assistant."
        "You are an expert in European Union's news, policies, laws, and actions. "
        "Extract all factual knowledge triples from the text in a structured format. "
        "Return the results as a JSON list where each item is an object with the keys: "
        "'head', 'head_type', 'relation', 'tail', 'tail_type'.\n\n"
        "Guidelines:\n"
        "Resolve vague pronouns (like 'I', 'we', 'they', 'he/she') to actual entities based on context.\n"
        "Use the standard full format, even when abbreviations are used in the text. For example, when 'EU' is used, write it as 'European Union'."
        "Use the standard full format for names, even if the full name is not used entirely in a specific sentence."
        "If provide information please include the full context on triples.\n"
        "Maintain consistency: refer to entities by their full and most complete identifiers.\n"
        "Use concise relation phrases written in UPPER_SNAKE_CASE.\n"
        "Avoid vague, incomplete, or uninformative triples. Use full context to provide informative and comprehensive triples.\n"
        "Return only the JSON list of objects. Do not include any explanations, additional knowledge, or markdown.\n"
        "If an entity type is unclear, make a reasonable guess or use a general type like 'Entity'."
    )

    user_prompt = f"Text: ```{text_chunk}```"
    # token count for full input
    input_tokens = count_tokens(system_prompt + user_prompt, model)

    # time only the API round trip
    start_time = time.perf_counter()
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )
    latency = time.perf_counter() - start_time

    # message.content may be None; an empty string then fails JSON parsing below
    content = (response.choices[0].message.content or "").strip()
    # strip a surrounding Markdown fence, with or without the "json" tag
    content = re.sub(r"^```(?:json)?\s*", "", content)
    content = re.sub(r"\s*```$", "", content)

    try:
        structured_triples = json.loads(content)
    except json.JSONDecodeError as json_err:
        print("JSON decoding error:", json_err)
        print("Raw output was:\n", content[:500])
        structured_triples = []
    except Exception as e:
        print("Error:", e)
        structured_triples = []

    # callers iterate the triples and call .get() on each one, so keep only
    # dictionaries and treat any non-list payload as a failed extraction
    if not isinstance(structured_triples, list):
        print("Unexpected extraction payload type:", type(structured_triples).__name__)
        structured_triples = []
    structured_triples = [t for t in structured_triples if isinstance(t, dict)]

    # return with metrics and doc_id
    return {
        "doc_id": doc_id,
        "triples": structured_triples,
        "latency_seconds": latency,
        "input_tokens": input_tokens,
    }

In [None]:
# best-effort creation of a full-text index on the `id` property; a failure is
# reported instead of being silently discarded
# NOTE(review): the pattern `FOR (n)` carries no label — confirm this statement is
# accepted by the target Neo4j version, otherwise the index is never created.
with driver.session() as session:
    try:
        session.run("CREATE FULLTEXT INDEX entity_index IF NOT EXISTS FOR (n) ON EACH [n.id]")
    except Exception as e:
        print("Could not create index:", e)

In [None]:
def sanitize_label(label):
    """
    Sanitize a string to be a valid Neo4j label.

    Surrounding whitespace is stripped, every character outside [a-zA-Z0-9] is
    replaced by an underscore, and the first character is upper-cased. A label
    that would start with a digit is prefixed with "Entity_" so it remains usable
    as an unquoted Cypher identifier.

    Args:
        label (str): The label string to sanitize. Non-string values (for example
            None coming from a JSON null) are treated as missing.

    Returns:
        str: A sanitized, Neo4j-safe label; "Entity" when the input is missing
        or empty after stripping.
    """
    if not isinstance(label, str):
        return "Entity"
    label = re.sub(r"[^a-zA-Z0-9]", "_", label.strip())
    if not label:
        return "Entity"
    if label[0].isdigit():
        return "Entity_" + label
    return label[0].upper() + label[1:]


def retry_on_json_error(fn, max_retries=10, wait_sec=1):
    """
    Retry a function multiple times when it raises or returns None.

    `fn` is called until it returns a non-None result or `max_retries` attempts
    have been made. Exceptions raised by `fn` are printed and count as a failed
    attempt. The wait is applied only between attempts, not after the last one.

    Args:
        fn (Callable): Zero-argument function. It should return a non-None value
            on success, or None on failure (e.g., due to JSON decode issues).
        max_retries (int, optional): The maximum number of attempts before giving up.
            Defaults to 10.
        wait_sec (int or float, optional): The number of seconds to wait between attempts.
            Defaults to 1.

    Returns:
        Any: The first non-None result returned by `fn`.
        None: If all retries fail or return None.
    """
    for attempt in range(1, max_retries + 1):
        try:
            result = fn()
        except Exception as e:
            print(f"Attempt {attempt} failed with exception: {e}")
        else:
            if result is not None:
                return result
            print(f"Attempt {attempt} returned None (likely JSON decode failure or empty result)")
        # no point in waiting once the final attempt has been used up
        if attempt < max_retries:
            time.sleep(wait_sec)
    print(f"All {max_retries} attempts failed.")
    return None

# output folders for the per-document extraction and evaluation records
for results_dir in ("triple_extraction_results", "triple_evaluation_results"):
    os.makedirs(results_dir, exist_ok=True)

extraction_path = "triple_extraction_results/all_extractions.jsonl"
evaluation_path = "triple_evaluation_results/all_evaluations.jsonl"

# start from empty files so a re-run does not append to earlier results
for results_file in (extraction_path, evaluation_path):
    with open(results_file, "w"):
        pass

# best-effort index creation; a failure is reported and the run continues
index_statement = "CREATE FULLTEXT INDEX entity_index IF NOT EXISTS FOR (n) ON EACH [n.id]"
with driver.session() as session:
    try:
        session.run(index_statement)
    except Exception as e:
        print("Could not create index:", e)

# For every embedded document: extract triples, merge them into the graph,
# append the extraction record, then evaluate the triples and append that record.
with driver.session() as session:
    for doc in documents_with_emb:
        doc_id = doc.id
        text = doc.content

        # extraction with retry
        def do_extraction():
            return extract_structured_triples_with_metrics(text, doc_id)
        extraction = retry_on_json_error(do_extraction)
        if extraction is None:
            print(f"Extraction failed for doc {doc_id}, skipping.")
            continue
        triples = extraction['triples']

        # Neo4j indexing
        for triple in triples:
            # model output is untrusted: ignore anything that is not an object
            if not isinstance(triple, dict):
                continue
            subj = triple.get("head")
            pred = triple.get("relation")
            obj = triple.get("tail")
            if not subj or not pred or not obj:
                continue
            # `or "Entity"` also covers an explicit JSON null for the type
            subj_type = sanitize_label(str(triple.get("head_type") or "Entity"))
            obj_type = sanitize_label(str(triple.get("tail_type") or "Entity"))
            rel_type = "_".join(str(pred).strip().split()).upper()
            rel_type = re.sub(r"[^A-Z0-9_]", "_", rel_type)
            if not rel_type:
                # a whitespace-only relation would produce an empty relationship type
                continue
            # Labels and the relationship type contain only [A-Za-z0-9_] at this
            # point; backtick quoting additionally keeps names that start with a
            # digit valid in Cypher.
            cypher = f"""
            MERGE (s:`{subj_type}` {{id: $subj}})
            MERGE (o:`{obj_type}` {{id: $obj}})
            MERGE (s)-[r:`{rel_type}`]->(o)
            MERGE (d:Document {{id: $doc_id}})
            MERGE (d)-[:MENTIONS]->(s)
            MERGE (d)-[:MENTIONS]->(o)
            """
            session.run(cypher, {
                "subj": subj,
                "obj": obj,
                "doc_id": doc_id
            })

        # save extraction dynamically (append as a line)
        with open(extraction_path, "a") as f:
            f.write(json.dumps(extraction) + "\n")

        print(
            f"Doc {doc_id}: {len(triples)} triples, "
            f"{extraction['latency_seconds']:.2f}s, "
            f"{extraction['input_tokens']} tokens"
        )

        # evaluation with retry
        def do_evaluation():
            return evaluate_triples_with_gpt_metrics(text, triples, doc_id)
        evaluation = retry_on_json_error(do_evaluation)
        if evaluation:
            # save evaluation dynamically (append as a line)
            with open(evaluation_path, "a") as f:
                f.write(json.dumps(evaluation) + "\n")
            print(
                f"Eval {doc_id}: {len(evaluation.get('results', []))} evaluated, "
                f"{evaluation['latency_seconds']:.2f}s, "
                f"{evaluation['input_tokens']} tokens"
            )
        else:
            print(f"Evaluation failed for doc {doc_id}")


# gather the line-delimited extraction records into one pretty-printed JSON array
all_extractions = []
with open("triple_extraction_results/all_extractions.jsonl") as fin:
    for line in fin:
        all_extractions.append(json.loads(line))

with open("triple_extraction_results/all_extractions.json", "w") as fout:
    fout.write(json.dumps(all_extractions, indent=2))

# same conversion for the evaluation records
all_evaluations = []
with open("triple_evaluation_results/all_evaluations.jsonl") as fin:
    for line in fin:
        all_evaluations.append(json.loads(line))

with open("triple_evaluation_results/all_evaluations.json", "w") as fout:
    fout.write(json.dumps(all_evaluations, indent=2))

#### RAG Pipelines

In [None]:
@component
class KnowledgeGraphRetriever():
    """
    A custom Haystack component for retrieving context-rich documents from a Neo4j knowledge graph
    based on search terms extracted by an OpenAI model.
    """

    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_pass: str, openai_model="gpt-4.1-mini"):
        """
        Open a Neo4j driver and remember which OpenAI model extracts search terms.

        Args:
            neo4j_uri (str): Connection URI of the Neo4j instance.
            neo4j_user (str): Neo4j user name.
            neo4j_pass (str): Neo4j password.
            openai_model (str): Chat model used for search-term extraction.
        """
        self._driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_pass))
        self._model = openai_model

    def _extract_search_terms(self, query: str) -> List[str]:
        """Ask the chat model for search terms; fall back to the raw query if none are usable."""
        system_prompt = (
            "You are a search term extractor. Based on the user question, return a list of 1–10 keywords or named entities "
            "that should be used to search a knowledge graph. Use lowercase, and return only a clean list in JSON like [\"term1\", \"term2\"]"
        )

        response = openai.chat.completions.create(
            model=self._model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query}
            ],
            temperature=0
        )

        # message.content may be None; treat it as "no terms returned"
        raw_content = (response.choices[0].message.content or "").strip()
        quoted = (term.strip() for term in re.findall(r'"(.*?)"', raw_content))
        # Blank terms are dropped because `CONTAINS ""` matches every node id;
        # duplicates are dropped (order kept) so no term is queried twice.
        terms = list(dict.fromkeys(term for term in quoted if term))
        return terms or [query]

    @component.output_types(documents=List[Document])
    def run(self, query: str) -> Dict[str, List[Document]]:
        """
        Run retrieval based on an input query.

        Uses a language model to extract search terms, runs Cypher queries against Neo4j,
        and formats the results into Haystack Document objects. For every search term the
        matching rows are grouped by the id of the associated Document node (or "no_doc"),
        and one Haystack Document is emitted per group.

        Args:
            query (str): The natural language query from the user.

        Returns:
            Dict[str, List[Document]]: A dictionary with a single key "documents" containing the result set.
            When nothing matches, the list holds one placeholder Document "(No results found)".
        """
        terms = self._extract_search_terms(query)

        documents = []

        with self._driver.session() as session:
            for term in terms:
                # NOTE(review): the relationship pattern is undirected, while the rendered
                # lines below always draw the arrow from n to connected — confirm that the
                # displayed direction is acceptable for relationships stored the other way.
                cypher = """
                    MATCH (n)-[r]-(connected)
                    WHERE toLower(n.id) CONTAINS toLower($query)
                    OPTIONAL MATCH (n)<-[:MENTIONS]-(d:Document)
                    OPTIONAL MATCH (connected)<-[:MENTIONS]-(d2:Document)
                    RETURN n, r, connected, coalesce(d, d2) AS doc
                """
                result = session.run(cypher, {"query": term})

                # per document id: lines pointing at a Document node vs. all other triples
                grouped_output = defaultdict(lambda: {"to_doc": [], "other": []})
                doc_text_lookup = {}

                for record in result:
                    n = record["n"]
                    r = record["r"]
                    connected = record["connected"]
                    doc_node = record.get("doc")

                    n_label = list(n.labels)[0] if n.labels else "Entity"
                    connected_label = list(connected.labels)[0] if connected.labels else "Entity"
                    n_id = n.get("id", "[no-id]")

                    if doc_node:
                        doc_id = doc_node.get("id", "unknown")
                        doc_content = doc_node.get("content", "[No content]")
                        doc_title = doc_node.get("title", "[No Title]")
                        doc_url = doc_node.get("source_url", "[No URL]")
                        doc_date = doc_node.get("date", "[No Date]")
                        full_doc_text = f"Title: {doc_title}\nDate: {doc_date}\nURL: {doc_url}\n\n{doc_content}"
                        doc_text_lookup[doc_id] = full_doc_text
                    else:
                        doc_id = "no_doc"

                    is_connected_doc = "Document" in connected.labels

                    if is_connected_doc:
                        triple_line = f"({n_label}: {n_id}) -[{r.type}]-> In Document below:"
                        grouped_output[doc_id]["to_doc"].append(triple_line)
                    else:
                        connected_value = connected.get("id", "[no-id]")
                        triple_line = f"({n_label}: {n_id}) -[{r.type}]-> ({connected_label}: {connected_value})"
                        grouped_output[doc_id]["other"].append(triple_line)

                for doc_id, groups in grouped_output.items():
                    doc_lines = groups["to_doc"]
                    other_lines = groups["other"]
                    content_parts = []

                    # the full document text is attached only when a line refers to it
                    if doc_lines:
                        content_parts.extend(doc_lines)
                        doc_text = doc_text_lookup.get(doc_id, "[No document content]")
                        content_parts.append("\n\nDocument:\n" + doc_text)

                    if other_lines:
                        content_parts.append("")
                        content_parts.extend(other_lines)

                    final_content = "\n\n".join(content_parts).strip()
                    meta = {"source_doc_id": doc_id} if doc_id != "no_doc" else {}
                    documents.append(Document(content=final_content, meta=meta))

        if not documents:
            documents.append(Document(content="(No results found)"))

        return {"documents": documents}

@component
class DocumentPassthrough:
    """
    Haystack component that returns the documents it receives unchanged.

    In the pipelines of this notebook it is fed the reranker output, so the
    reranked documents are available under its own component name.
    """

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Forward the given documents.

        Args:
            documents (List[Document]): Documents to pass through.

        Returns:
            Dict[str, List[Document]]: The same list under the key "documents".
        """
        return {"documents": documents}

In [None]:
# neo4j document store used on the retrieval side
document_store = Neo4jDocumentStore(
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASS,
    database="neo4j",
    index="document-embeddings",
    embedding_field="embedding",
    embedding_dim=1536,
    node_label="Document",
)

# query-side components: embed the question, retrieve from the store, keep the top 5 after reranking
embedder_emb = OpenAITextEmbedder(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="text-embedding-ada-002",
)

retriever_emb = Neo4jEmbeddingRetriever(document_store=document_store)

ranker_emb = TransformersSimilarityRanker(
    top_k=5,
    model="intfloat/simlm-msmarco-reranker",
)

# Jinja prompt for the embedding-based RAG pipeline; rendered with `query` and
# `documents`. Each document is listed with a well-formed source link followed by
# its title and date as plain text (title and date are not URLs, so they are not
# wrapped in anchors).
prompt_template_emb = """
You are an AI Assistant with access to official Documents about the European Union's news, policies, laws, and actions.

Your task is to answer user questions **STRICTLY** based on the Documents provided below.

Question: {{ query }}

Guidelines:
- Use only the content from the provided  Documents.
- Do NOT rely on prior or external knowledge.
- Answer strictly by **copying**, **quoting**, or **paraphrasing** only what is written in the provided Documents.
- **Never generate information, conclusions, or details that are not explicitly present in the documents.**
- Do NOT ask the user for additional information.
- Include inline HTML links for referencing URL sources in the answer, using the URLs provided in the Documents.
  - Use the document’s title as the anchor text.
  - If the title is missing, use the domain name of the document’s URL as the anchor text.
- Each fact you refer to should be followed by the corresponding reference.
- Output the answer in a structured markdown format.
- Use bullet lists whenever it makes sense.
- Do not add a references section at the end of the answer, just use references within the body of text.

If a definitive answer cannot be found in the Documents, respond with:
Final Answer: inconclusive

Always end your answer with this disclaimer:
Disclaimer: This is AI generated content — please use it with caution.

Documents:
{% for doc in documents %}
Source: <a href="{{ doc.meta.source_url }}">{{ doc.meta.source_url }}</a><br>
Title: {{ doc.meta.title }}<br>
Date: {{ doc.meta.date }}<br>

{{ doc.content }}
{% endfor %}

Question: {{ query }}

Answer:
"""

# prompt builder: both template variables must be supplied
prompt_builder_emb = PromptBuilder(
    required_variables=["documents", "query"],
    template=prompt_template_emb,
)

# answer generator configured with temperature 0 and top_p 0
generator_emb = OpenAIGenerator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4.1-mini",
    generation_kwargs={"temperature": 0, "top_p": 0},
)

# assemble the embedding pipeline: register the components in order ...
emb_pipeline = Pipeline()
for step_name, step in (
    ("embedder", embedder_emb),
    ("retriever", retriever_emb),
    ("reranker", ranker_emb),
    ("output_docs", DocumentPassthrough()),
    ("prompt_builder", prompt_builder_emb),
    ("generator", generator_emb),
):
    emb_pipeline.add_component(step_name, step)

# ... then wire them; the reranker output feeds both the passthrough and the prompt
for sender, receiver in (
    ("embedder.embedding", "retriever.query_embedding"),
    ("retriever.documents", "reranker.documents"),
    ("reranker.documents", "output_docs.documents"),
    ("reranker.documents", "prompt_builder.documents"),
    ("prompt_builder", "generator"),
):
    emb_pipeline.connect(sender, receiver)

# prepare pipeline for use
emb_pipeline.warm_up()

In [None]:
# knowledge graph retriever connected to the same Neo4j instance
kg_retriever = KnowledgeGraphRetriever(
    neo4j_pass=NEO4J_PASS,
    neo4j_user=NEO4J_USER,
    neo4j_uri=NEO4J_URI,
)

# reranker that keeps the five best graph-derived documents
ranker_graph = TransformersSimilarityRanker(
    top_k=5,
    model="intfloat/simlm-msmarco-reranker",
)

# Jinja prompt template for the knowledge-graph pipeline. It is rendered with two
# variables: `query` (inserted twice, before the guidelines and again before the
# answer) and `documents` (each item's `content` is inlined as one "Source" entry).
# The model is told to answer only from the supplied triples/documents, to reply
# "Final Answer: inconclusive" when no answer is found, and to end with a disclaimer.
prompt_template_graph = """
You are an AI Assistant with access to official Knowledge Graphs and Documents about the European Union's news, policies, laws, and actions.

Your task is to answer user questions **STRICTLY** based on the Knowledge Graphs and Documents provided below.

Generate your answer based on the presented extracted **Triples**, using the Document passage as supplementary information, without making any assumptions.

Question: {{ query }}

Guidelines:
- Use only the content from the provided Knowledge Graphs and Documents.
- Do NOT rely on prior or external knowledge.
- Answer strictly by **copying**, **quoting**, or **paraphrasing** only what is written in the provided Knowledge Graphs and Documents.
- **Never generate information, conclusions, or details that are not explicitly present in the documents.**
- Do NOT ask the user for additional information.
- Include inline HTML links for referencing URL sources in the answer, using the URLs provided in the Knowledge Graphs and Documents.
  - Use the document’s title as the anchor text.
  - If the title is missing, use the domain name of the document’s URL as the anchor text.
- Each fact you refer to should be followed by the corresponding reference.
- Output the answer in a structured markdown format.
- Use bullet lists whenever it makes sense.
- Do not add a references section at the end of the answer, just use references within the body of text.

If a definitive answer cannot be found in the Knowledge Graphs and Documents, respond with:
Final Answer: inconclusive

Always end your answer with this disclaimer:
Disclaimer: This is AI generated content — please use it with caution.

Knowledge Graphs and Documents:

Each source below may contain both structured (graph/triple) information and unstructured (document/text) content.

{% for doc in documents %}
- <b>Source:</b><br>
{{ doc.content }}<br><br>
{% endfor %}

Question: {{ query }}

Answer (with references using HTML links):
"""

# prompt builder: both template variables must be supplied
prompt_builder_graph = PromptBuilder(
    required_variables=["documents", "query"],
    template=prompt_template_graph,
)

# answer generator configured with temperature 0 and top_p 0
generator_graph = OpenAIGenerator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4.1-mini",
    generation_kwargs={"temperature": 0, "top_p": 0},
)

# assemble the knowledge graph pipeline: register the components in order ...
graph_pipeline = Pipeline()
for step_name, step in (
    ("kg_retriever", kg_retriever),
    ("ranker", ranker_graph),
    ("output_docs", DocumentPassthrough()),
    ("prompt_builder", prompt_builder_graph),
    ("generator", generator_graph),
):
    graph_pipeline.add_component(step_name, step)

# ... then wire them; the ranker output feeds both the passthrough and the prompt
for sender, receiver in (
    ("kg_retriever.documents", "ranker.documents"),
    ("ranker.documents", "output_docs.documents"),
    ("ranker.documents", "prompt_builder.documents"),
    ("prompt_builder", "generator"),
):
    graph_pipeline.connect(sender, receiver)

# Prepare the pipeline
graph_pipeline.warm_up()

In [None]:
# building blocks of the web pipeline: search, page download, HTML conversion, reranking
web_search = SerperDevWebSearch(
    api_key=Secret.from_env_var("SERPERDEV_API_KEY"),
    top_k=5,
)
fetcher = LinkContentFetcher()
converter = HTMLToDocument()
ranker_web = TransformersSimilarityRanker(
    top_k=5,
    model="intfloat/simlm-msmarco-reranker",
)

# Jinja prompt template for the web-search pipeline. It is rendered with two
# variables: `query` (inserted twice, before the guidelines and again before the
# answer) and `documents` (each item contributes a link built from `meta.url`
# followed by its `content`). The model is told to answer only from the supplied
# documents, to reply "Final Answer: inconclusive" when no answer is found, and
# to end with a disclaimer.
prompt_template_web = """
You are an AI Assistant with access to Web Search Documents about the European Union's news, policies, laws, and actions.

Your task is to answer user questions **STRICTLY** based on the Documents provided below.

Question: {{ query }}

Guidelines:
- Use only the content from the provided  Documents.
- Do NOT rely on prior or external knowledge.
- Answer strictly by **copying**, **quoting**, or **paraphrasing** only what is written in the provided Documents.
- **Never generate information, conclusions, or details that are not explicitly present in the documents.**
- Do NOT ask the user for additional information.
- Include inline HTML links for referencing URL sources in the answer, using the URLs provided in the Documents.
  - Use the document’s title as the anchor text.
  - If the title is missing, use the domain name of the document’s URL as the anchor text.
- Each fact you refer to should be followed by the corresponding reference.
- Output the answer in a structured markdown format.
- Use bullet lists whenever it makes sense.
- Do not add a references section at the end of the answer, just use references within the body of text.

If a definitive answer cannot be found in the Documents, respond with:
Final Answer: inconclusive

Always end your answer with this disclaimer:
Disclaimer: This is AI generated content — please use it with caution.

Documents:
{% for doc in documents %}
- <b>Source:</b> <a href="{{ doc.meta.url }}">{{ doc.meta.url }}</a><br>
<p>{{ doc.content }}</p><br>
{% endfor %}

Question: {{ query }}

Answer:
"""

prompt_builder_web = PromptBuilder(
    template=prompt_template_web,
    required_variables=["documents", "query"]
)

# Generator for the web pipeline; same model and sampling settings as the
# knowledge-graph generator (gpt-4.1-mini, temperature 0, top_p 0).
generator_web = OpenAIGenerator(
    model="gpt-4.1-mini", api_key=Secret.from_env_var("OPENAI_API_KEY"),
    generation_kwargs={
        "temperature": 0,
        "top_p": 0
    }
)

# create, config the web search pipeline and connect the components
web_pipeline = Pipeline()
web_pipeline.add_component("search", web_search)
web_pipeline.add_component("fetcher", fetcher)
web_pipeline.add_component("converter", converter)
web_pipeline.add_component("ranker", ranker_web)
# output_docs receives the same ranked documents as the prompt builder, so the
# documents used for generation can be read back from the pipeline result.
web_pipeline.add_component("output_docs", DocumentPassthrough())
web_pipeline.add_component("prompt_builder", prompt_builder_web)
web_pipeline.add_component("generator", generator_web)

# Data flow: search -> fetcher -> converter -> ranker -> (output_docs, prompt_builder) -> generator
web_pipeline.connect("search.links", "fetcher.urls")
web_pipeline.connect("fetcher.streams", "converter.sources")
web_pipeline.connect("converter.documents", "ranker.documents")
web_pipeline.connect("ranker.documents", "output_docs.documents")
web_pipeline.connect("ranker.documents", "prompt_builder.documents")
web_pipeline.connect("prompt_builder", "generator")

# prepare the pipeline for inference
web_pipeline.warm_up()

#### Agent Pipeline

In [None]:
# init the chat generator for multi-turn interaction
# (gpt-4.1 here, while the per-pipeline generators use gpt-4.1-mini)
chat_generator = OpenAIChatGenerator(
    model="gpt-4.1",
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    generation_kwargs={
        "temperature": 0,
        "top_p": 0
    }
)

# wrap the pipelines with input/output mappings
# Each wrapper takes a single "query" input that is mapped onto every pipeline
# input needing the question text, and exposes the generator's replies as "replies".
graph_super = SuperComponent(
    pipeline=graph_pipeline,
    input_mapping={
        "query": ["kg_retriever.query", "ranker.query", "prompt_builder.query"]
    },
    output_mapping={"generator.replies": "replies"}
)

# NOTE: emb_pipeline names its query inputs differently ("embedder.text",
# "reranker.query") from the graph and web pipelines.
embedding_super = SuperComponent(
    pipeline=emb_pipeline,
    input_mapping={
        "query": ["embedder.text", "reranker.query", "prompt_builder.query"]
    },
    output_mapping={"generator.replies": "replies"}
)

web_super = SuperComponent(
    pipeline=web_pipeline,
    input_mapping={
        "query": ["search.query", "ranker.query", "prompt_builder.query"]
    },
    output_mapping={"generator.replies": "replies"}
)

# define tools based on the wrapped pipelines
# The tool names below (graph_search, embedding_search, web_search) are the same
# names the agent's system prompt refers to; keep them in sync when renaming.
graph_tool = ComponentTool(
    name="graph_search",
    component=graph_super,
    description=(
        "Answer questions using structured information from a knowledge graph containing factual relationships "
        "about the European Union’s news, policies, laws, and actions. The graph includes relationships (triples) "
        "and the original source documents. Answers are grounded in these facts, with references provided as HTML links."
    )
)

embedding_tool = ComponentTool(
    name="embedding_search",
    component=embedding_super,
    description=(
        "Answer questions using information retrieved from an internal document store containing content "
        "about the European Union’s news, policies, laws, and actions. Answers are based strictly on "
        "retrieved documents using semantic similarity, with no assumptions. References are included as HTML links."
    )
)

# The web tool's description explicitly states that its content is not guaranteed to be factual.
web_tool = ComponentTool(
    name="web_search",
    component=web_super,
    description=(
        "Retrieve potentially relevant information from the web. Results are based on live internet content and may "
        "include a variety of sources. The retrieved information is not guaranteed to be factual. References are provided "
        "as HTML links using either inferred titles or domain names."
    )
)

# define the agent's system behavior and reasoning instructions
# NOTE(review): the prompt text is runtime input to the model and is left untouched
# here, but it contains typos ("separetely", "inshights", "sentece",
# "web_search too") and numbers two consecutive task items "3." — consider
# cleaning it up together with a re-run of the evaluation.
system_prompt = """
You are a highly intelligent assistant with access to 3 specialized tools for answering questions
about the European Union’s news, policies, laws, and actions.

You have access to:

- graph_search: Uses a knowledge graph containing factual relationships (triples) and their source documents.
  Answers should be grounded in these structured relationships, using HTML links for citations.

- embedding_search: Retrieves semantically relevant information from an internal document store.
  Answers must be based strictly on the retrieved documents, using inline HTML links for references.

- web_search: Retrieves the most recent and relevant information from the web.
  Answers should reflect real-time sources, with references using HTML links. If no title is available,
  use the domain name of the URL as the anchor text.

Your task:
1. Use all three tools to answer the user's query.
2. Combine insights from graph_search and embedding_search tools to create a complete and informative response in the Internal Search Answer section.
3. Provide separetely inshights from web_search tool to complete the informative response in the Web Search Insights section.
3. In each sentece of your answer add the references you were based on.
4. Ensure all references are included as inline HTML anchor tags, using titles or domain names as specified.
5. If there is a conflict between the information retrieved from the Web Search and the other tools, highlight the discrepancy separetely if there is one in the Conflicts for Internal and Web Search section.
6. For any part of the answer generated from web_search too, always clearly indicate that the information comes from the web.
7. Output the answer in a structured markdown format.
8. Use bullet lists whenever it makes sense.
9. Do not add a references section at the end of the answer, just use references within the body of text.

Your output should have three sections if there are no conflicts or four sections if there are conflicts:

Thought Process:
- Describe step-by-step how each tool contributed to your reasoning and answer.

Internal Search Answer:
- Provide a clear, concise answer supported by insights from graph_search and embedding_search tools, indicating from which tool the answer is based on.

Web Search Insights:
- Any content derived from a web search must be explicitly identified as such in the response here.

Conflicts for Internal and Web Search:
- Any conflict of information derived from the internal compared with the web search be explicitly identified as such in the response here.

Always include this disclaimer at the end of the final answer:
Disclaimer: This is AI generated content — please use it with caution.
"""

# create the agent with the toolset and system prompt
# All three pipeline-backed tools are registered on the agent.
agent = Agent(
    chat_generator=chat_generator,
    tools=[embedding_tool, graph_tool, web_tool],
    system_prompt=system_prompt
)

# prepare the agent for interaction
agent.warm_up()

In [None]:
def run_qa_turn(agent, messages, user_input):
    """
    Execute one question/answer exchange with the agent and summarise the outcome.

    The user input is appended to ``messages`` (the caller's list is mutated),
    the agent is run on the whole history, and the messages it returns are
    scanned to collect tool calls, tool outputs and the answer text.

    Args:
        agent (Agent): The configured multi-tool agent.
        messages (List[ChatMessage]): Conversation history so far.
        user_input (str): The new user message.

    Returns:
        Tuple[List[ChatMessage], Dict]: the message list returned by the agent
        and a summary dictionary with:
            - user_input: text of the last user message found in the history
            - tool_calls: ToolCall objects of every assistant tool-call message
            - tool_results: tool name -> output text (a later result of the
              same tool replaces an earlier one)
            - final_answer: text of the last assistant text message
    """
    # add user input to message history and execute the agent pipeline
    messages.append(ChatMessage.from_user(user_input))
    messages = agent.run(messages=messages, max_steps=10)["messages"]

    # summary of the turn, filled in while walking over the returned messages
    summary = {
        "user_input": None,
        "tool_calls": [],
        "tool_results": {},
        "final_answer": None
    }

    for message in messages:
        role = message._role.value.lower()
        parts = message._content

        # messages without content carry nothing to record
        if not parts:
            continue

        if role == "user":
            if isinstance(parts[0], TextContent):
                summary["user_input"] = parts[0].text

        elif role == "assistant":
            # an assistant message is either a batch of tool calls or answer text
            if isinstance(parts[0], ToolCall):
                summary["tool_calls"].extend(parts)
            elif isinstance(parts[0], TextContent):
                summary["final_answer"] = parts[0].text

        elif role == "tool":
            for part in parts:
                if isinstance(part, ToolCallResult):
                    name = part.origin.tool_name
                    summary["tool_results"][name] = _tool_output_as_text(name, part.result)

    return messages, summary


def _tool_output_as_text(tool_name, raw):
    """
    Convert a raw tool result into display text.

    The raw value is parsed as a Python literal; for a dict with a "replies"
    key the replies are joined with newlines (or stringified when not a list),
    any other parsed value is stringified. If anything fails, the failure is
    printed and the raw value is returned unchanged.
    """
    try:
        parsed = ast.literal_eval(raw)
        if not (isinstance(parsed, dict) and "replies" in parsed):
            return str(parsed)
        replies = parsed["replies"]
        return "\n".join(replies) if isinstance(replies, list) else str(replies)
    except Exception as e:
        print(f"Failed to parse tool result for {tool_name}: {e}")
        return raw  # fallback

#### RAG Evaluation Config

##### Helpers


In [None]:
def fix_inner_double_quotes_in_single_quoted_values(json_str, verbose=False):
    """
    Turn double quotes found inside single-quoted values into single quotes.

    Only spans of the form ``: '...'`` (colon, optional whitespace, a
    single-quoted run without inner single quotes) are touched; each one is
    re-emitted as ``: '<value>'`` with exactly one space after the colon.
    With ``verbose`` set, every value that was changed is printed.
    """
    def _swap_quotes(found):
        original_value = found.group(1)
        cleaned_value = original_value.replace('"', "'")
        if verbose and cleaned_value != original_value:
            print(f"Replaced inner double quotes in value: {original_value}")
        return f": '{cleaned_value}'"

    # single-quoted values that follow a colon and optional whitespace
    return re.sub(r":\s*'([^']*)'", _swap_quotes, json_str, flags=re.DOTALL)

def replace_double_quote_pair_near_position(json_str, error_pos, verbose=False):
    """
    Replace the double quotes nearest to ``error_pos`` with single quotes.

    The string is scanned leftwards and rightwards from ``error_pos`` for the
    first double quote in each direction. A quote that is found is replaced
    unless it is directly preceded by a backslash (i.e. already escaped).

    Args:
        json_str: Text to repair.
        error_pos: Index to search around, typically ``JSONDecodeError.pos``.
            Positions outside the string (including ``len(json_str)``, which
            the JSON decoder reports for errors at end of input, and any
            position on an empty string) are clamped instead of raising.
        verbose: Print each replacement, or a notice when nothing was replaced.

    Returns:
        The text with up to two double quotes replaced by single quotes.
    """
    chars = list(json_str)
    size = len(chars)

    # find the first " to the left; start no further right than the last
    # character so that error_pos == len(json_str) cannot index past the end
    left = min(error_pos, size - 1)
    while left >= 0 and chars[left] != '"':
        left -= 1

    # find the first " to the right; never start at a negative index, which
    # would silently wrap around to the end of the string
    right = max(error_pos, 0)
    while right < size and chars[right] != '"':
        right += 1

    replaced = 0
    if left >= 0 and chars[left] == '"' and (left == 0 or chars[left-1] != '\\'):
        chars[left] = "'"
        replaced += 1
        if verbose:
            print(f"Replaced \" with ' at position {left}")
    # if both scans landed on the same quote it has already been replaced
    # above, so this check fails and the quote is not counted twice
    if right < size and chars[right] == '"' and (right == 0 or chars[right-1] != '\\'):
        chars[right] = "'"
        replaced += 1
        if verbose:
            print(f"Replaced \" with ' at position {right}")
    if replaced == 0 and verbose:
        print("No double quotes found to replace near error position.")
    return ''.join(chars)


def find_json_block(text: str) -> str:
    """
    Extract the JSON object embedded in an LLM reply.

    Markdown code fences are removed and typographic quotes are replaced by
    plain ones. The span from the first '{' to the last '}' is returned when it
    parses as JSON; otherwise the cleaned text as a whole is returned.
    """
    cleaned = re.sub(r"```json|```", "", text)
    for fancy, plain in (("“", '"'), ("”", '"'), ("’", "'")):
        cleaned = cleaned.replace(fancy, plain)

    first_brace = cleaned.find('{')
    last_brace = cleaned.rfind('}')
    if first_brace == -1 or last_brace == -1 or last_brace <= first_brace:
        return cleaned  # fallback

    block = cleaned[first_brace:last_brace + 1]
    try:
        json.loads(block)
    except Exception:
        return cleaned  # fallback
    return block

def unwrap_evaluations(parsed):
    """
    Normalise a parsed LLM reply into a list of evaluation entries.

    Accepted shapes:
    - a list: returned as is
    - {'evaluations': ...}: the value stored under 'evaluations' is returned
    - a single evaluation dict (has a 'statement', 'score' or 'justification'
      key): wrapped in a one-element list
    Anything else yields an empty list.
    """
    if isinstance(parsed, list):
        return parsed
    if not isinstance(parsed, dict):
        return []
    if "evaluations" in parsed:
        return parsed["evaluations"]

    # defensive: a bare dict only counts when it looks like one evaluation
    evaluation_keys = ("statement", "score", "justification")
    looks_like_evaluation = any(key in parsed for key in evaluation_keys)
    return [parsed] if looks_like_evaluation else []

def _missing_closers(json_str: str) -> str:
    """
    Return the '}' / ']' characters needed to balance ``json_str``.

    The number of each closer is the surplus of opening over closing
    characters. When a scan of the bracket characters agrees with those counts,
    the closers are ordered innermost-first so nested structures close
    correctly; otherwise all braces are emitted before all brackets.
    Bracket characters inside string values are not distinguished from
    structural ones.
    """
    missing_braces = max(json_str.count('{') - json_str.count('}'), 0)
    missing_brackets = max(json_str.count('[') - json_str.count(']'), 0)
    if not (missing_braces or missing_brackets):
        return ''

    closer_of = {'{': '}', '[': ']'}
    pending = []  # closers still owed, outermost first
    for ch in json_str:
        if ch in closer_of:
            pending.append(closer_of[ch])
        elif pending and ch == pending[-1]:
            pending.pop()

    if pending.count('}') == missing_braces and pending.count(']') == missing_brackets:
        return ''.join(reversed(pending))
    return '}' * missing_braces + ']' * missing_brackets


def try_fix_json(json_str: str, verbose: bool = False) -> str:
    """
    Attempt to repair common defects in JSON produced by an LLM.

    Steps, in order:
    - remove markdown code fences
    - replace typographic quotes with plain quotes
    - drop trailing commas before a closing brace/bracket
    - collapse runs of newlines and replace tabs with spaces
    - append missing closing braces/brackets in nesting order
    - collapse doubled commas
    - strip ``//`` line comments (only when the ``//`` starts a line or follows
      whitespace, a comma or a bracket, so URLs such as ``https://...`` inside
      string values are left intact)
    - remove byte-order marks
    - turn Python ``None`` into ``null`` when it stands in value position
      (after ``:``, ``[`` or ``,`` and before ``,``, ``}`` or ``]``), leaving
      the word "None" inside ordinary text alone

    Args:
        json_str: Candidate JSON text.
        verbose: Accepted for interface compatibility; this function prints nothing.

    Returns:
        The repaired text. It is not guaranteed to be valid JSON.
    """
    json_str = re.sub(r"```json|```", "", json_str)
    json_str = json_str.replace("“", '"').replace("”", '"').replace("’", "'")
    json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
    json_str = re.sub(r'\n+', '\n', json_str)
    json_str = json_str.replace("\t", " ")

    closers = _missing_closers(json_str)
    if closers:
        json_str += closers
        # truncated output often ends on a comma, which is now a trailing comma
        json_str = re.sub(r',\s*([}\]])', r'\1', json_str)

    json_str = re.sub(r',\s*,', ',', json_str)
    json_str = re.sub(r'(^|[\s,\[\]{}])//[^\n]*\n', r'\1', json_str, flags=re.MULTILINE)
    json_str = json_str.replace('\ufeff', '')
    json_str = re.sub(r'(?<=[:\[,])(\s*)None(?=\s*[,}\]])', r'\1null', json_str)
    return json_str

def parse_llm_json_reply(reply_text: str, verbose: bool = False, unwrap_evaluations: bool = False):
    """
    Extract and parse the JSON payload of an LLM reply, repairing it if needed.

    Parsing strategies are tried in order until one succeeds:
    1. ``json.loads`` on the block returned by ``find_json_block``
    2. ``json.loads`` on the output of ``try_fix_json``
    3. ``demjson3.decode`` on the repaired text
    4. ``json.loads`` after replacing the double quotes around the reported
       error position with single quotes
    5. ``json.loads`` after replacing double quotes inside single-quoted values

    Args:
        reply_text: Raw reply text.
        verbose: Print diagnostics for every failed strategy.
        unwrap_evaluations: When True, normalise the result to a list of
            evaluation entries; when False, return the parsed value as is.

    Returns:
        - ``[]`` if every strategy fails (regardless of ``unwrap_evaluations``)
        - with ``unwrap_evaluations=True``: the value under "evaluations", a
          parsed list, a single evaluation dict wrapped in a list, or ``[]``
        - with ``unwrap_evaluations=False``: the parsed JSON value (usually a dict)
    """
    def _context(text, pos, radius=40):
        # clamp the start: a negative slice start would wrap around and print
        # an empty or unrelated excerpt for errors near the beginning
        return text[max(0, pos - radius):pos + radius]

    candidate = find_json_block(reply_text)
    parsed = None
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as e:
        if verbose:
            print("First JSON decode failed:", e)
            print("Context around error:", _context(candidate, e.pos))
        fixed = try_fix_json(candidate, verbose=verbose)
        try:
            parsed = json.loads(fixed)
        except json.JSONDecodeError as e2:
            if verbose:
                print("Second JSON decode failed:", e2)
                print("Context around error:", _context(fixed, e2.pos))
            try:
                parsed = demjson3.decode(fixed)
            except Exception as e3:
                if verbose:
                    print("demjson3 also failed:", e3)
                try:
                    # e2.pos already locates the failure in `fixed`; the quote
                    # repair runs inside the try so that a failure in the
                    # helper falls through to the last strategy
                    fixed2 = replace_double_quote_pair_near_position(fixed, e2.pos, verbose=verbose)
                    parsed = json.loads(fixed2)
                except Exception as e5:
                    if verbose:
                        print("replace_double_quote_pair_near_position also failed:", e5)
                    fixed3 = fix_inner_double_quotes_in_single_quoted_values(fixed, verbose=verbose)
                    try:
                        parsed = json.loads(fixed3)
                    except Exception as e6:
                        if verbose:
                            print("fix_inner_double_quotes_in_single_quoted_values also failed:", e6)
                        return []

    if not unwrap_evaluations:
        # for statements extraction, just return the parsed JSON (dict)
        return parsed

    # for eval: the boolean parameter shadows the module-level function of the
    # same name inside this body, so the unwrapping is spelled out here
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        if "evaluations" in parsed:
            return parsed["evaluations"]
        if any(k in parsed for k in ("statement", "score", "justification")):
            return [parsed]
    return []

def has_valid_results(results):
    """
    Tell whether an evaluation output holds any usable content.

    Returns True when ``results`` is a dict whose 'results' value is a
    non-empty list and at least one entry of that list has a non-empty list
    under 'statements', 'statement_scores' or 'justifications'.
    """
    if not isinstance(results, dict):
        return False

    entries = results.get("results")
    if not isinstance(entries, list) or not entries:
        return False

    def _is_filled_list(value):
        return isinstance(value, list) and len(value) > 0

    tracked_keys = ("statements", "statement_scores", "justifications")
    return any(
        _is_filled_list(entry.get(key, []))
        for entry in entries
        for key in tracked_keys
    )


##### Statements extractor config

In [None]:
class StatementsExtractor(Component):
    """
    Splits a generated answer into its factual statements with an LLM.

    The model is shown the question and the generated answer only; contexts can
    be passed through ``run`` so they stay attached to each sample, but they
    are never included in the prompt.
    """

    def __init__(
        self,
        model="gpt-4.1",
        provider="openai",
        instructions: str = None
    ):
        super().__init__()
        self.provider = provider.lower()
        self.model = model

        # only the OpenAI provider is available for statement extraction
        if self.provider != "openai":
            raise ValueError("provider must be 'openai'")

        self.chat_generator = OpenAIChatGenerator(
            model=model,
            generation_kwargs={"temperature": 0, "top_p": 0},
            api_key=Secret.from_env_var("OPENAI_API_KEY")
        )

        # prompt used when the caller does not supply custom instructions
        default_instructions = (
            "You will receive a question and a LLM-based generated answer.\n"
            "Your task is to break the generated answer into a list of factual statements by splitting it into sentences, "
            "leaving each sentence unchanged and exactly as it appears in the generated answer, only replace double with single quotes if double quotes are present. "
            "Do not rewrite, paraphrase, or alter the sentences in any way. "
            "Do not include the Disclaimer: This is AI generated content in the statements. "
            "Do not include sentences like For more detailed information.., since there are no factual statements. "
            "Only use information from the answer itself (do NOT use any external knowledge or assumptions).\n\n"
            "All JSON keys and string values must use double quotes (\").\n"
            "If it is required to quote something inside a statement, use ONLY single quotes (').\n"
            "Output only a valid JSON object with the key 'statements', which is a list of strings. Do not add any text before or after the JSON.\n"
            "Example:\n"
            "{\n"
            '  \"statements\": [\n'
            '    \"The Clean Industrial Deal is designed to reduce industrial emissions.\",\n'
            '    \"Affordable energy is the foundation of competitiveness.\"\n'
            "  ]\n"
            "}\n"
            "If the answer contains no factual statements, output an empty list."
        )
        self.instructions = instructions or default_instructions

    @component.output_types(
        question=str,
        generated_answer=str,
        statements=List[str],
        contexts=Optional[List[str]]
    )
    def run(
        self,
        questions: List[str],
        generated_answers: List[str],
        contexts: Optional[List[List[str]]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract statements for every (question, generated answer) pair.

        Args:
            questions: One question per sample.
            generated_answers: The generated answer for each question.
            contexts: Optional contexts per sample, copied into the output.

        Returns:
            One dict per sample with the keys 'question', 'generated_answer',
            'statements' and 'contexts'. Samples are paired with ``zip``, so
            the shortest input determines how many are processed.
        """
        if contexts is None:
            contexts = [None] * len(questions)

        max_attempts = 3
        collected = []

        for question, generated_answer, context_list in zip(questions, generated_answers, contexts):
            prompt = (
                f"{self.instructions}\n\n"
                f"Question: {question}\n\n"
                f"Generated Answer:\n{generated_answer}\n\n"
                f'Respond with a JSON object: {{"statements": [...]}}'
            )

            statements_list = []
            for attempt in range(max_attempts):
                try:
                    response = self.chat_generator.run([ChatMessage.from_user(prompt)])
                    parsed = parse_llm_json_reply(
                        response["replies"][0].text,
                        verbose=True,
                        unwrap_evaluations=False
                    )

                    # sometimes the model just outputs a list
                    if isinstance(parsed, list):
                        statements_list = parsed
                        break
                    # the expected shape: {"statements": [...]}
                    if isinstance(parsed, dict) and "statements" in parsed:
                        statements_list = parsed["statements"]
                        break
                except Exception as e:
                    print(f"[Attempt {attempt+1}] Failed: {str(e)}")
                    statements_list = []
                    time.sleep(1)
            else:
                # reached only when no attempt produced an accepted shape
                print("Failed to get a valid response after retries.")
                statements_list = []

            # PATCH: when nothing was extracted from an answer that declares
            # itself inconclusive, keep the whole answer as its only statement
            nothing_extracted = not statements_list or len(statements_list) == 0
            if (
                nothing_extracted
                and isinstance(generated_answer, str)
                and "final answer: inconclusive" in generated_answer.lower()
            ):
                statements_list = [generated_answer]

            collected.append({
                "question": question,
                "generated_answer": generated_answer,
                "statements": statements_list,
                "contexts": context_list,
            })

        return collected

# instantiation:
# module-level extractor instance, configured with gpt-4.1 via the OpenAI provider
statements_extractor = StatementsExtractor(model="gpt-4.1", provider="openai")

##### RAG Faithfulness evaluator config

In [None]:
class CustomFaithfulnessEvaluator(Component):
    """
    Faithfulness evaluator for QA pipelines, modular for OpenAI, Anthropic or Google providers.
    Expects statements to be provided (does not generate them).
    """
    def __init__(self,
                 model="gpt-4.1",
                 provider="openai",
                 instructions: str = None):
        """
        Args:
            model: Model name passed to the provider's chat generator.
            provider: 'openai', 'anthropic' or 'google' (case-insensitive).
            instructions: Optional replacement for the default judging prompt.

        Raises:
            ValueError: If ``provider`` is not one of the supported values.
        """
        super().__init__()
        self.provider = provider.lower()
        self.model = model

        if self.provider == "openai":
            self.chat_generator = OpenAIChatGenerator(
                model=model,
                generation_kwargs={"temperature": 0, "top_p": 0}
            )
        elif self.provider == "anthropic":
            self.chat_generator = AnthropicChatGenerator(
                model=model,
                generation_kwargs={"temperature": 0, "max_tokens": 8192}
            )
        elif self.provider == "google":
            self.chat_generator = GoogleAIGeminiChatGenerator(
                model=model,
                generation_config={"temperature": 0}
            )
        else:
            raise ValueError("provider must be 'openai', 'anthropic' or 'google'")

        self.instructions = instructions or (
            "You are evaluating the faithfulness of a list of factual statements with respect to a provided context.\n\n"
            "You will receive:\n"
            "- a question\n"
            "- a context: a set of retrieved documents or passages used to generate the answer\n"
            "- a list of factual statements (already extracted from a predicted answer)\n\n"
            "TASK:\n"
            "For each statement in the list, output an object with:\n"
            "   - \"statement\": the statement, **repeated exactly as given, with no modifications**\n"
            "   - \"score\": (see below)\n"
            "   - \"justification\": a short explanation\n\n"
            "Scoring:\n"
            "   a. If the statement is clearly supported by the context → score = 1\n"
            "   b. If the statement is not supported or the context is silent → score = 0\n"
            "   c. If the statement includes or is equivalent to 'Final Answer: inconclusive' AND this is justified by the lack of support in context → score = -1\n"
            "   d. If the statement includes or is equivalent to 'Final Answer: inconclusive' BUT the context does contain sufficient information to answer the question → score = -2\n\n"
            "Formatting instructions:\n"
            "- Output a single valid JSON block, and **do not add any text before or after**.\n"
            "- Format the output as follows:\n"
            '{\n'
            '  \"evaluations\": [\n'
            '    {\"statement\": \"...\", \"score\": 1, \"justification\": \"...\"},\n'
            '    {\"statement\": \"...\", \"score\": 1, \"justification\": \"...\"}\n'
            '  ]\n'
            '}\n'
            "- There MUST be one object for each statement, in the same order as given, and each 'statement' field must match the input exactly.\n"
            "- If you cannot score a statement, set \"score\": null and \"justification\": \"MISSING\".\n\n"
            "JSON Output Format instructions:\n"
            "- **All JSON keys and string values must use double quotes (\").**\n"
            "- **If you must quote text inside any JSON string value (in either statement or justification), you MUST use single quotes ('), NEVER double quotes (\"). Double quotes are reserved for JSON formatting only.**\n"
            "- **This also applies for nested quoting; even if you are quoting a phrase inside another quoted phrase. Use single quotes for all levels of quoting/ referencing in the justifications.**\n"
            "- Do NOT add any commentary, markdown, or preamble; output only **VALID** JSON as shown above.\n\n"
            "Assess and score **every** statement.\n\n"
            "You should ALWAYS output the same number of statements given, along with their assessment.\n"
            "Be sure that you did not miss any given statement.\n"
        )

    @component.output_types(
        individual_scores=List[float],
        score=float,
        results=List[Dict[str, Any]]
    )
    def run(self, questions: List[str], contexts: List[List[str]], statements: List[List[str]]) -> Dict[str, Any]:
        """
        Score every statement of every sample against that sample's contexts.

        For each sample the model is asked (up to 3 attempts) for one
        evaluation object per statement. A reply is accepted only when it is a
        list of JSON objects whose length equals the number of statements;
        otherwise every statement of the sample is recorded with score None,
        justification "MISSING" and the sample's 'error' set to "count_mismatch".

        Args:
            questions: One question per sample.
            contexts: Retrieved documents per sample; a None entry is treated
                as "no documents".
            statements: Extracted statements per sample.

        Returns:
            Dict with:
                - results: per sample 'statements', 'statement_scores',
                  'justifications', 'score' and 'error'
                - individual_scores: per-sample mean of the numeric statement
                  scores (0.0 when a sample has none)
                - score: mean of individual_scores (0.0 when there are no samples)
        """
        results = []
        individual_scores = []

        for question, context_list, statements_list in zip(questions, contexts, statements):
            # a sample without retrieved contexts is judged against an empty context
            full_context = "\n\n".join(
                f"Document {i+1}:\n{doc}" for i, doc in enumerate(context_list or [])
            )
            statements_json = json.dumps(statements_list, ensure_ascii=False)
            # the prompt is identical for every retry, so build it once
            prompt = (
                f"{self.instructions}\n\n"
                f"Question: {question}\n\n"
                f"Context:\n{full_context}\n\n"
                f"Statements:\n{statements_json}\n\n"
                f'Respond with a JSON object as specified.'
            )
            evals = []
            error_flag = False

            for attempt in range(3):
                try:
                    response = self.chat_generator.run([ChatMessage.from_user(prompt)])
                    reply_text = response["replies"][0].text

                    # coerce reply_text to string if not already
                    if isinstance(reply_text, dict):
                        if len(reply_text) == 1:
                            reply_text = list(reply_text.values())[0]
                        else:
                            reply_text = json.dumps(reply_text)
                    elif not isinstance(reply_text, str):
                        reply_text = str(reply_text)

                    evals = parse_llm_json_reply(reply_text, verbose=True, unwrap_evaluations=True)

                    # every entry is read with .get() below, so anything other
                    # than a list of dicts must be rejected here
                    well_formed = isinstance(evals, list) and all(isinstance(item, dict) for item in evals)
                    if well_formed and len(evals) == len(statements_list):
                        # Success!
                        break
                    if well_formed:
                        print(f"Attempt {attempt+1}: Number of output evaluations ({len(evals)}) does not match number of input statements ({len(statements_list)}). Retrying...")
                    else:
                        print(f"Attempt {attempt+1}: Evaluations are not a list of JSON objects. Retrying...")
                    time.sleep(1)
                except Exception as e:
                    print(f"[Attempt {attempt+1}] Failed: {str(e)}")
                    evals = []
                    time.sleep(1)
            else:
                # all attempts failed or count mismatch
                error_flag = True
                print("Failed to get a valid response with matching statement count after 3 attempts.")
                evals = [{"statement": s, "score": None, "justification": "MISSING"} for s in statements_list]

            statement_scores = []
            justifications = []
            output_statements = []

            for eval_obj in evals:
                output_statements.append(eval_obj.get("statement", ""))
                statement_scores.append(eval_obj.get("score", None))
                justifications.append(eval_obj.get("justification", ""))

            # compute mean only on non-null scores
            numeric_scores = [s for s in statement_scores if isinstance(s, (int, float))]
            score = float(np.mean(numeric_scores)) if numeric_scores else 0.0
            result_dict = {
                "statements": output_statements,
                "statement_scores": statement_scores,
                "justifications": justifications,
                "score": score,
                "error": "count_mismatch" if error_flag else None,
            }
            results.append(result_dict)
            individual_scores.append(score)

        final_score = float(np.mean(individual_scores)) if individual_scores else 0.0

        return {
            "results": results,
            "individual_scores": individual_scores,
            "score": final_score
        }

# instantiations:
# one evaluator per judge provider; all use the evaluator's default instructions
standard_evaluator_gpt = CustomFaithfulnessEvaluator(model="gpt-4.1", provider="openai")
standard_evaluator_anthropic = CustomFaithfulnessEvaluator(model="claude-sonnet-4-0", provider="anthropic")
standard_evaluator_google = CustomFaithfulnessEvaluator(model="gemini-2.5-pro", provider="google")

#### Standard Pipelines Evaluation

##### Generate answers

In [None]:
def generate_answers(pipeline_type, pipeline_obj, questions_json_path):
    """
    Generate answers for all questions using the selected pipeline (emb, graph, web).

    Every question is run through ``pipeline_obj`` and the outcome is stored in
    ``{pipeline_type}_generated_answers.json`` in the current working directory.
    The file is rewritten after each question so partial progress survives an
    interruption, and it is also written when the question list is empty.

    Args:
        pipeline_type: "emb", "graph", or "web"
        pipeline_obj:   the corresponding pipeline object
        questions_json_path: path to the input JSON with {"questions": [...]}

    Raises:
        ValueError: if ``pipeline_type`` is not one of the supported types.
    """
    # Explicit check instead of `assert`: assertions are stripped under `python -O`.
    if pipeline_type not in ("emb", "graph", "web"):
        raise ValueError("Unknown pipeline type")
    answers_file = f"{pipeline_type}_generated_answers.json"

    with open(questions_json_path, "r", encoding="utf-8") as f:
        questions = json.load(f).get("questions", [])

    results = [{} for _ in questions]

    def _pipeline_inputs(question):
        # Component names differ per pipeline; every pipeline gets the question
        # at its entry component, its ranker and its prompt builder.
        if pipeline_type == "emb":
            return {
                "embedder": {"text": question},
                "retriever": {"top_k": 5},
                "reranker": {"query": question},
                "prompt_builder": {"query": question},
            }
        entry_component = "kg_retriever" if pipeline_type == "graph" else "search"
        return {
            entry_component: {"query": question},
            "ranker": {"query": question},
            "prompt_builder": {"query": question},
        }

    def _save():
        with open(answers_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)

    for idx, question in enumerate(questions):
        print(f"[{pipeline_type}] Generating answer for question {idx+1}/{len(questions)}")
        try:
            start_time = time.time()
            result = pipeline_obj.run(_pipeline_inputs(question))
            latency = time.time() - start_time

            entry = {
                "question": question,
                "generated_answer": result["generator"]["replies"][0],
                "contexts": [doc.content for doc in result["output_docs"]["documents"]],
                "latency_seconds": latency,
            }
        except Exception as e:
            # Best effort: one failing question must not abort the whole batch;
            # the error text is recorded so later stages can skip this entry.
            entry = {
                "question": question,
                "error": str(e),
            }

        results[idx] = entry
        _save()  # checkpoint after every question

    # Guarantees the file exists even when there were no questions at all.
    _save()

    print(f"Results saved to {answers_file}")

# usage
# one run per standard pipeline, all fed from the same question set
generate_answers(
    pipeline_type="emb",
    pipeline_obj=emb_pipeline,
    questions_json_path="eval/eval_questions.json",
)
generate_answers(
    pipeline_type="graph",
    pipeline_obj=graph_pipeline,
    questions_json_path="eval/eval_questions.json",
)
generate_answers(
    pipeline_type="web",
    pipeline_obj=web_pipeline,
    questions_json_path="eval/eval_questions.json",
)

##### Extract statements

In [None]:
def extract_statements(answers_file, statements_file, statements_extractor):
    """
    For each entry in answers_file, use statements_extractor to extract statements,
    and write the results (with statements) to statements_file.

    Entries that carry an "error" key, lack a "generated_answer", or already
    have "statements" are copied through unchanged. The output is rewritten
    after every processed entry (checkpoint) and once more at the end, so
    statements_file exists even when no entry needed processing.

    Args:
        answers_file: path to a JSON list of answer entries
        statements_file: path of the JSON file to write
        statements_extractor: object whose run(questions=..., generated_answers=...,
            contexts=...) result is indexed as result[0]["statements"]
    """
    with open(answers_file, "r", encoding="utf-8") as f:
        statements_entries = json.load(f)

    def _save():
        with open(statements_file, "w", encoding="utf-8") as f:
            json.dump(statements_entries, f, indent=2)

    for idx, entry in enumerate(statements_entries):
        if "error" in entry or "generated_answer" not in entry:
            continue  # skip failed/empty entries

        if "statements" in entry:  # already done
            continue

        print(f"Extracting statements for question {idx+1}")

        statements_result = statements_extractor.run(
            questions=[entry["question"]],
            generated_answers=[entry["generated_answer"]],
            contexts=[entry.get("contexts", [])]  # for tracking, not sent to LLM
        )

        entry["statements"] = statements_result[0]["statements"]

        _save()  # checkpoint after every extraction

    # Without this, a run where every entry was skipped would leave no output
    # file behind and the next stage would fail when opening it.
    _save()

    print(f"Statements saved to {statements_file}")

# usage
# turn each pipeline's generated answers into per-answer statement lists
extract_statements(
    answers_file="emb_generated_answers.json",
    statements_file="emb_statements.json",
    statements_extractor=statements_extractor,
)
extract_statements(
    answers_file="graph_generated_answers.json",
    statements_file="graph_statements.json",
    statements_extractor=statements_extractor,
)
extract_statements(
    answers_file="web_generated_answers.json",
    statements_file="web_statements.json",
    statements_extractor=statements_extractor,
)

##### Faithfulness eval

In [None]:
def run_faithfulness_eval(statements_file, eval_results_file, evaluators):
    """
    Runs faithfulness evaluation for each entry in the statements_file,
    and saves enriched results (with eval outputs) to eval_results_file.

    For every entry that has statements, each evaluator whose result is not yet
    present is run and its output stored under "results_<evaluator_name>".
    Entries with an "error" key or without statements are copied through
    unchanged. The output is rewritten after every evaluated entry (checkpoint)
    and once more at the end, so eval_results_file exists even when nothing
    had to be evaluated.

    Args:
        statements_file: input file with 'statements' for each entry
        eval_results_file: output file with evaluator results
        evaluators: dict of {"evaluator_name": evaluator_object, ...}
    """
    with open(statements_file, "r", encoding="utf-8") as f:
        results_entries = json.load(f)

    def _save():
        with open(eval_results_file, "w", encoding="utf-8") as f:
            json.dump(results_entries, f, indent=2)

    for idx, entry in enumerate(results_entries):
        if "error" in entry or "statements" not in entry or not entry["statements"]:
            continue  # skip if no statements

        # only the evaluators that have not produced a result for this entry yet
        pending = {
            name: evaluator
            for name, evaluator in evaluators.items()
            if f"results_{name}" not in entry
        }
        if not pending:
            continue

        print(f"Evaluating faithfulness for question {idx+1}")

        questions = [entry["question"]]
        # drop non-string and blank contexts before handing them to the judge
        contexts = [[c for c in entry.get("contexts", []) if isinstance(c, str) and c.strip()]]
        statements = [entry["statements"]]

        for eval_name, eval_obj in pending.items():
            entry[f"results_{eval_name}"] = eval_obj.run(
                questions=questions,
                contexts=contexts,
                statements=statements
            )

        _save()  # checkpoint after every evaluated entry

    # Ensure the output exists even if every entry was skipped.
    _save()

    print(f"Faithfulness eval results saved to {eval_results_file}")

# usage
# define which evaluators you want to use (any subset)
# judge registry: each key ends up in the "results_<key>" field of the output
evaluators = dict(
    gpt=standard_evaluator_gpt,
    anthropic=standard_evaluator_anthropic,
    google=standard_evaluator_google,
)

run_faithfulness_eval(
    statements_file="emb_statements.json",
    eval_results_file="emb_faithfulness_evaluation_results.json",
    evaluators=evaluators,
)
run_faithfulness_eval(
    statements_file="graph_statements.json",
    eval_results_file="graph_faithfulness_evaluation_results.json",
    evaluators=evaluators,
)
run_faithfulness_eval(
    statements_file="web_statements.json",
    eval_results_file="web_faithfulness_evaluation_results.json",
    evaluators=evaluators,
)

#### Agent Evaluation

##### Agent evaluator config

In [None]:
class CustomAgentFaithfulnessEvaluator(Component):
    """
    Faithfulness and attribution evaluator for agent answers, robust and standard-output style.

    An LLM judge (OpenAI, Anthropic or Google chat generator) receives the question,
    the three tool contexts, the agent's answer and the pre-extracted statements,
    and is asked to return one rubric code plus justification per statement.
    """
    def __init__(self,
                 model="gpt-4.1",
                 provider="openai",
                 instructions: Optional[str] = None):
        """
        Args:
            model: model identifier passed to the provider's chat generator
            provider: "openai", "anthropic" or "google" (case-insensitive)
            instructions: optional replacement for the default judging prompt

        Raises:
            ValueError: if ``provider`` is not one of the supported providers.
        """
        super().__init__()
        self.provider = provider.lower()
        self.model = model

        if self.provider == "openai":
            self.chat_generator = OpenAIChatGenerator(
                model=model,
                generation_kwargs={"temperature": 0, "top_p": 0}
            )
        elif self.provider == "anthropic":
            self.chat_generator = AnthropicChatGenerator(
                model=model,
                generation_kwargs={"temperature": 0, "max_tokens": 8192}
            )
        elif self.provider == "google":
            self.chat_generator = GoogleAIGeminiChatGenerator(
                model=model,
                generation_config={"temperature": 0}
            )
        else:
            raise ValueError("provider must be 'openai', 'anthropic' or 'google'")

        self.instructions = instructions or (
            "You are evaluating the faithfulness and attribution of a list of factual statements, given specific tool contexts and a predicted answer from an agent.\n\n"
            "You will receive:\n"
            "- a question\n"
            "- embedding_search_context: information from internal knowledge from the embedding_search tool\n"
            "- graph_search_context: information from internal knowledge from the graph_search tool\n"
            "- web_search_context: information retrieved from the web_search tool\n"
            "- a predicted answer from an agent\n"
            "- a list of factual statements (already extracted from the answer; do NOT extract statements yourself)\n\n"
            "The predicted answer you receive SHOULD include:\n"
            "   - an Internal Search Answer section, produced with embedding_search and graph_search tool results\n"
            "   - a Web Search Insights section, produced with web_search tool results\n\n"
            "Your task is:\n"
            "For each provided statement, output an object with:\n"
            "   - \"statement\": the statement, **repeated exactly as given, with no modifications**\n"
            "   - \"score\": (see detailed rubric below)\n"
            "   - \"justification\": a short explanation citing which context(s) support the statement and why that score applies\n\n"
            "Scoring rubric:\n"
            "   a. If the statement is supported ONLY by embedding_search_context:\n"
            "      → score = 1\n"
            "      justification: reference the specific content from embedding_search_context that supports the statement\n\n"
            "   b. If the statement is supported ONLY by graph_search_context:\n"
            "      → score = 2\n"
            "      justification: reference the specific content from graph_search_context that supports the statement\n\n"
            "   c. If the statement is supported by BOTH embedding_search_context and graph_search_context:\n"
            "      → score = 3\n"
            "      justification: show support from both contexts and explain how they jointly confirm the statement\n\n"
            "   d. If the statement is a conflict statement AND is correctly identified because of conflict information from embedding_search_context or graph_search_context compared with web_search_context AND is clearly indicated/marked as conflict:\n"
            "      → score = 4\n"
            "      justification: describe the conflict and how the answer clearly signals it as such\n\n"
            "   e. If the statement is a conflict statement BUT is incorrectly identified (because there is no actual conflict in the contexts) AND is clearly indicated/marked as conflict:\n"
            "      → score = 5\n"
            "      justification: note the false marking of conflict and explain how the sources do not disagree\n\n"
            "   f. If the statement is supported by embedding_search_context or graph_search_context, but there is a conflict with web_search_context AND it is NOT clearly indicated/marked as conflict:\n"
            "      → score = 6\n"
            "      justification: explain the internal vs. web disagreement and the lack of conflict signaling in the answer\n\n"
            "   g. If there is a conflict between internal (embedding or graph) and web context BUT it is NOT clearly indicated/marked as conflict:\n"
            "      → score = 7\n"
            "      justification: highlight the missed conflict and explain how it should have been addressed\n\n"
            "   h. If the statement reflects an explicit indication that no conflicts were found:\n"
            "      → score = 8\n"
            "      justification: explain how the answer claims consensus and the contexts indeed show agreement\n\n"
            "   i. If the statement is not supported by any context:\n"
            "      → score = 0\n"
            "      justification: explain the absence of supporting evidence in all contexts\n\n"
            "   j. If the statement is supported by web_search_context AND is clearly indicated/marked as coming from web search in the answer:\n"
            "      → score = 9\n"
            "      justification: show the web-based support and cite how it was properly marked as web-sourced\n\n"
            "   k. If the statement is supported by web_search_context BUT is NOT clearly indicated/marked as from the web:\n"
            "      → score = -9\n"
            "      justification: provide evidence from the web_search_context and explain how the answer failed to mark it as web-sourced\n\n"
            "Formatting instructions:\n"
            "- Output a single valid JSON block, and **do not add any text before or after**.\n"
            "- Format the output as follows:\n"
            "{\n"
            '  \"evaluations\": [\n'
            '    {\"statement\": \"...\", \"score\": 1, \"justification\": \"...\"},\n'
            '    {\"statement\": \"...\", \"score\": 2, \"justification\": \"...\"}\n'
            '  ]\n'
            "}\n"
            "- There MUST be one object for each statement, in the same order as given, and each 'statement' field must match the input exactly.\n"
            "- If you cannot score a statement, set \"score\": null and \"justification\": \"MISSING\".\n"
            "- **All JSON keys and string values must use double quotes (\").**\n"
            "- **If you need to quote text inside a string value, use only single quotes (').**\n"
            "- Assess and score **every** statement.\n"
            "- Do NOT add any commentary, markdown, or preamble; output only valid JSON as shown above.\n\n"
        )

    @staticmethod
    def _split_evaluations(parsed):
        """Turn a parsed judge reply into (statements, scores, justifications), or None if unusable."""
        if isinstance(parsed, dict) and "statements" in parsed:
            # column-oriented reply: three parallel lists that must line up
            statements_list = parsed.get("statements", [])
            scores_list = parsed.get("statement_scores", [])
            justifications_list = parsed.get("justifications", [])
            if len(statements_list) == len(scores_list) == len(justifications_list):
                return statements_list, scores_list, justifications_list
            return None
        if isinstance(parsed, list):
            # row-oriented reply: one dict per statement (a non-dict item raises
            # AttributeError here, which the caller treats as a failed attempt)
            return (
                [item.get("statement", "") for item in parsed],
                [item.get("score", None) for item in parsed],
                [item.get("justification", "") for item in parsed],
            )
        return None

    @component.output_types(
        individual_scores=List[float],
        score=float,
        results=List[Dict[str, Any]]
    )
    def run(
        self,
        questions: List[str],
        predicted_answers: List[str],
        emb_contexts: List[List[str]],
        graph_contexts: List[List[str]],
        web_search_contexts: List[List[str]],
        statements: List[List[str]],
    ) -> Dict[str, Any]:
        """
        Judge every (question, contexts, answer, statements) tuple with the LLM.

        The six input lists are consumed in lockstep. For each item the judge is
        called up to three times until its reply can be parsed into per-statement
        results.

        Returns:
            dict with
            - "results": one dict per item with "statements", "statement_scores",
              "justifications", "score" and "error"
            - "individual_scores": per-item mean of the numeric rubric codes
              (0.0 when there are none)
            - "score": mean of individual_scores (0.0 when empty)
        """
        results = []
        individual_scores = []

        for question, emb_retrieved, graph_retrieved, web_search, stmts, answer in zip(
            questions, emb_contexts, graph_contexts, web_search_contexts, statements, predicted_answers
        ):
            # str() guards against non-string tool results (e.g. dicts or None
            # loaded back from JSON), which would make str.join raise TypeError.
            full_emb = " ".join(str(c) for c in emb_retrieved)
            full_graph = " ".join(str(c) for c in graph_retrieved)
            full_web = " ".join(str(c) for c in web_search)
            statements_json = json.dumps(stmts, ensure_ascii=False, indent=2)

            # Short-circuit if answer is inconclusive (optional)
            if "final answer: inconclusive" in answer.lower():
                result_dict = {
                    "statements": [],
                    "statement_scores": [],
                    "justifications": [],
                    "score": 0.0,
                    "error": "inconclusive"
                }
                results.append(result_dict)
                individual_scores.append(0.0)
                continue

            prompt = (
                f"{self.instructions}\n\n"
                f"Question:\n{question}\n\n"
                f"embedding_search_context:\n{full_emb}\n\n"
                f"graph_search_context:\n{full_graph}\n\n"
                f"web_search_context:\n{full_web}\n\n"
                f"Predicted Answer:\n{answer}\n"
                f"Statements:\n{statements_json}\n\n"
                f"Respond with a JSON object as specified."
            )

            accepted = None
            error_flag = False

            for attempt in range(3):
                try:
                    response = self.chat_generator.run([ChatMessage.from_user(prompt)])
                    reply_text = response["replies"][0].text
                    # presumably unwrap_evaluations=True returns the list stored
                    # under "evaluations" — confirm against parse_llm_json_reply
                    parsed = parse_llm_json_reply(reply_text, verbose=True, unwrap_evaluations=True)

                    # NOTE(review): the number of returned evaluations is not
                    # compared with len(stmts); a reply covering only part of
                    # the statements is accepted as-is.
                    accepted = self._split_evaluations(parsed)
                    if accepted is not None:
                        break
                except Exception as e:
                    print(f"Attempt {attempt+1} failed: {str(e)}")
                    time.sleep(1)
            else:
                # all three attempts were unusable
                error_flag = True
                print("Failed to get valid response after retries.")

            # if nothing usable came back, fall back to empty lists
            if accepted is None:
                statements_list, scores_list, justifications_list = [], [], []
            else:
                statements_list, scores_list, justifications_list = accepted

            # compute mean only on non-null scores
            numeric_scores = [s for s in scores_list if isinstance(s, (int, float))]
            mean_score = float(np.mean(numeric_scores)) if numeric_scores else 0.0
            result_dict = {
                "statements": statements_list,
                "statement_scores": scores_list,
                "justifications": justifications_list,
                "score": mean_score,
                "error": "count_mismatch" if error_flag else None,

            }
            results.append(result_dict)
            individual_scores.append(mean_score)

        final_score = float(np.mean(individual_scores)) if individual_scores else 0.0

        return {
            "results": results,
            "individual_scores": individual_scores,
            "score": final_score
        }

# instantiations:
# one agent-answer judge per provider (provider picks the chat generator, model the LLM)
agent_evaluator_gpt = CustomAgentFaithfulnessEvaluator(
    provider="openai",
    model="gpt-4.1",
)
agent_evaluator_anthropic = CustomAgentFaithfulnessEvaluator(
    provider="anthropic",
    model="claude-sonnet-4-0",
)
agent_evaluator_google = CustomAgentFaithfulnessEvaluator(
    provider="google",
    model="gemini-2.5-pro",
)


##### Generate agentic answers

In [None]:
def generate_agentic_answers(agent, questions_json_path, results_file="agent_generated_answers.json"):
    """
    Generate an agent answer for every question, resuming from results_file if it exists.

    Questions whose stored entry already has a "generated_answer" are skipped;
    entries that previously failed (only "error") are attempted again. Each
    question is asked in a fresh conversation. The results file is rewritten
    after every processed question (checkpoint) and once more at the end.

    Args:
        agent: agent object handed to run_qa_turn
        questions_json_path: path to the input JSON with {"questions": [...]}
        results_file: JSON file used both to resume from and to store results
    """
    # load questions
    with open(questions_json_path, "r", encoding="utf-8") as f:
        questions = json.load(f).get("questions", [])

    # load previous results if any
    if os.path.exists(results_file):
        with open(results_file, "r", encoding="utf-8") as f:
            results = json.load(f)
    else:
        results = []
    # A saved file may hold fewer entries than there are questions now (e.g. the
    # question set grew); pad so that results[idx] is always valid.
    results.extend({} for _ in range(len(questions) - len(results)))

    def _as_list(value):
        # tool results are stored as lists; wrap anything else
        return value if isinstance(value, list) else [value]

    def _save():
        with open(results_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)

    for idx, question in enumerate(questions):
        if results[idx] and "generated_answer" in results[idx]:
            continue
        print(f"[agent] Generating answer for question {idx+1}/{len(questions)}")
        try:
            start_time = time.time()
            # empty history: no messages are carried over between questions
            _, result = run_qa_turn(agent, [], question)

            tool_results = result.get("tool_results", {})
            answer = result.get("final_answer", "No answer generated.")
            if isinstance(answer, tuple):  # sometimes it's a tuple
                answer = answer[0]

            emb_context = _as_list(tool_results.get("embedding_search", []))
            graph_context = _as_list(tool_results.get("graph_search", []))
            web_context = _as_list(tool_results.get("web_search", []))

            latency = time.time() - start_time

            entry = {
                "question": question,
                "generated_answer": answer,
                "emb_based_context": emb_context,
                "graph_based_context": graph_context,
                "web_based_context": web_context,
                "latency_seconds": latency
            }
        except Exception as e:
            # Best effort: record the failure and move on to the next question.
            entry = {
                "question": question,
                "error": str(e)
            }
        results[idx] = entry

        _save()  # checkpoint after every question

    # Make sure the file exists even when nothing had to be generated.
    _save()
    print(f"Agentic results saved to {results_file}")

# usage
generate_agentic_answers(agent, questions_json_path="eval/eval_questions.json")

##### Extract agentic statements

In [None]:
def extract_statements_agentic(answers_file, statements_file, statements_extractor):
    """
    Extract statements for every agent answer in answers_file and write the
    enriched entries to statements_file.

    Entries with an "error" key, without a "generated_answer", or that already
    have "statements" are copied through unchanged. Processed entries also get
    the three context keys (defaulting to empty lists) so the evaluation step
    can rely on them. The output is rewritten after every processed entry
    (checkpoint) and once more at the end, so statements_file exists even when
    no entry needed processing.

    Args:
        answers_file: path to a JSON list of agent answer entries
        statements_file: path of the JSON file to write
        statements_extractor: object whose run(questions=..., generated_answers=...,
            contexts=...) result is indexed as result[0]["statements"]
    """
    with open(answers_file, "r", encoding="utf-8") as f:
        statements_entries = json.load(f)

    def _save():
        with open(statements_file, "w", encoding="utf-8") as f:
            json.dump(statements_entries, f, indent=2)

    for idx, entry in enumerate(statements_entries):
        if "error" in entry or "generated_answer" not in entry:
            continue
        if "statements" in entry:
            continue

        print(f"Extracting statements for agentic question {idx+1}")

        statements_result = statements_extractor.run(
            questions=[entry["question"]],
            generated_answers=[entry["generated_answer"]],
            contexts=[[]]  # or you could pass some tracking info, but it's not sent to LLM
        )

        entry["statements"] = statements_result[0]["statements"]

        # attach contexts for later evaluation (not used by extractor)
        entry.setdefault("emb_based_context", [])
        entry.setdefault("graph_based_context", [])
        entry.setdefault("web_based_context", [])

        _save()  # checkpoint after every extraction

    # Without this, a run where every entry was skipped would leave no output
    # file behind and the evaluation step would fail when opening it.
    _save()

    print(f"Statements saved to {statements_file}")

# usage:
extract_statements_agentic(
    answers_file="agent_generated_answers.json",
    statements_file="agent_statements.json",
    statements_extractor=statements_extractor,
)

##### Faithfulness agentic evaluation

In [None]:
def run_agentic_faithfulness_eval(statements_file, eval_results_file, evaluators):
    """
    Runs agentic faithfulness evaluation for each entry in the statements_file,
    and saves enriched results (with eval outputs) to eval_results_file.

    For every entry that has statements, each evaluator whose result is not yet
    present is run and its output stored under "results_<evaluator_name>".
    Entries with an "error" key or without statements are copied through
    unchanged. The output is rewritten after every evaluated entry (checkpoint)
    and once more at the end, so eval_results_file exists even when nothing
    had to be evaluated.

    Args:
        statements_file: input file with 'statements' and all contexts for each entry
        eval_results_file: output file with agentic evaluator results
        evaluators: dict of {"evaluator_name": evaluator_object, ...}
    """
    with open(statements_file, "r", encoding="utf-8") as f:
        results_entries = json.load(f)

    def _save():
        with open(eval_results_file, "w", encoding="utf-8") as f:
            json.dump(results_entries, f, indent=2)

    for idx, entry in enumerate(results_entries):
        if (
            "error" in entry
            or "statements" not in entry
            or not entry["statements"]
        ):
            continue  # skip if no statements

        # only the evaluators that have not produced a result for this entry yet
        pending = {
            name: evaluator
            for name, evaluator in evaluators.items()
            if f"results_{name}" not in entry
        }
        if not pending:
            continue

        print(f"Evaluating agentic faithfulness for question {idx+1}")

        # the evaluator API is batch-shaped, so each field is wrapped in a one-item list
        eval_inputs = {
            "questions": [entry["question"]],
            "predicted_answers": [entry["generated_answer"]],
            "emb_contexts": [entry.get("emb_based_context", [])],
            "graph_contexts": [entry.get("graph_based_context", [])],
            "web_search_contexts": [entry.get("web_based_context", [])],
            "statements": [entry["statements"]],
        }

        for eval_name, eval_obj in pending.items():
            entry[f"results_{eval_name}"] = eval_obj.run(**eval_inputs)

        _save()  # checkpoint after every evaluated entry

    # Ensure the output exists even if every entry was skipped.
    _save()

    print(f"Agentic faithfulness eval results saved to {eval_results_file}")

# usage

# define your agent evaluators (use your instantiated ones)
# judge registry: each key ends up in the "results_<key>" field of the output
agent_evaluators = dict(
    gpt=agent_evaluator_gpt,
    anthropic=agent_evaluator_anthropic,
    google=agent_evaluator_google,
)

run_agentic_faithfulness_eval(
    statements_file="agent_statements.json",
    eval_results_file="agent_faithfulness_evaluation_results.json",
    evaluators=agent_evaluators,
)