diff --git a/public/images/docs/evaluation/mcp-connectors/1.png b/public/images/docs/evaluation/mcp-connectors/1.png new file mode 100644 index 00000000..a8188a01 Binary files /dev/null and b/public/images/docs/evaluation/mcp-connectors/1.png differ diff --git a/public/images/docs/evaluation/mcp-connectors/2.png b/public/images/docs/evaluation/mcp-connectors/2.png new file mode 100644 index 00000000..fee9d645 Binary files /dev/null and b/public/images/docs/evaluation/mcp-connectors/2.png differ diff --git a/public/images/docs/evaluation/mcp-connectors/3.png b/public/images/docs/evaluation/mcp-connectors/3.png new file mode 100644 index 00000000..4ef31b55 Binary files /dev/null and b/public/images/docs/evaluation/mcp-connectors/3.png differ diff --git a/public/images/docs/evaluation/mcp-connectors/4.png b/public/images/docs/evaluation/mcp-connectors/4.png new file mode 100644 index 00000000..147ae113 Binary files /dev/null and b/public/images/docs/evaluation/mcp-connectors/4.png differ diff --git a/public/images/docs/evaluation/mcp-connectors/5.png b/public/images/docs/evaluation/mcp-connectors/5.png new file mode 100644 index 00000000..bb7da177 Binary files /dev/null and b/public/images/docs/evaluation/mcp-connectors/5.png differ diff --git a/src/components/docs/Mermaid.astro b/src/components/docs/Mermaid.astro new file mode 100644 index 00000000..e0eef6ab --- /dev/null +++ b/src/components/docs/Mermaid.astro @@ -0,0 +1,165 @@ +--- +// Renders a Mermaid diagram. 
Source is passed via the `code` prop: +// +// B +// `} /> +// +// Mermaid is loaded once per page via a hoisted, deduplicated + + diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts index 43e2806f..07e1e2e5 100644 --- a/src/lib/navigation.ts +++ b/src/lib/navigation.ts @@ -289,34 +289,78 @@ export const tabNavigation: NavTab[] = [ icon: 'chart', items: [ { title: 'Overview', href: '/docs/evaluation' }, + { title: 'Quickstart', href: '/docs/quickstart/evals' }, { title: 'Concepts', items: [ - { title: 'Understanding Evaluation', href: '/docs/evaluation/concepts/understanding-evaluation' }, { title: 'Eval Types', href: '/docs/evaluation/concepts/eval-types' }, { title: 'Eval Templates', href: '/docs/evaluation/concepts/eval-templates' }, { title: 'Output Types', href: '/docs/evaluation/concepts/output-types' }, - { title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' }, - { title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' }, - { title: 'Versioning', href: '/docs/evaluation/concepts/versioning' }, { title: 'Judge Models', href: '/docs/evaluation/concepts/judge-models' }, { title: 'Eval Results', href: '/docs/evaluation/concepts/eval-results' }, + { title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' }, + { title: 'Versioning', href: '/docs/evaluation/concepts/versioning' }, + { title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' }, { title: 'MCP Connectors in Evaluation', href: '/docs/evaluation/concepts/mcp-connectors' }, ] }, { - title: 'Features', + title: 'Run evals', + items: [ + { title: 'Run evals in the UI', href: '/docs/evaluation/run/in-the-ui' }, + { title: 'Run evals with the Python SDK', href: '/docs/evaluation/run/python-sdk' }, + { title: 'Run evals with TypeScript', href: '/docs/evaluation/run/typescript-sdk' }, + { title: 'Run evals with the API', href: '/docs/evaluation/run/api' }, + { title: 'Run evals in CI/CD', href: '/docs/evaluation/run/cicd' }, 
+ ] + }, + { + title: 'Build evals', + items: [ + { title: 'Create custom evals', href: '/docs/evaluation/build/custom' }, + { title: 'Test playground', href: '/docs/evaluation/build/test-playground' }, + { title: 'Ground truth', href: '/docs/evaluation/build/ground-truth' }, + { title: 'Error localization', href: '/docs/evaluation/build/error-localization' }, + { title: 'Configure MCP connectors', href: '/docs/evaluation/build/mcp-connectors' }, + ] + }, + { + title: 'Judge models', + items: [ + { title: 'FutureAGI models', href: '/docs/evaluation/judge-models/futureagi' }, + { title: 'Use custom models', href: '/docs/evaluation/judge-models/custom' }, + ] + }, + { + title: 'Evaluator catalog', + items: [ + { title: 'All evaluators', href: '/docs/evaluation/builtin' }, + { title: 'RAG & retrieval', href: '/docs/evaluation/builtin/categories/rag' }, + { title: 'Agent & conversation', href: '/docs/evaluation/builtin/categories/agent' }, + { title: 'Safety & policy', href: '/docs/evaluation/builtin/categories/safety' }, + { title: 'Text quality & NLP metrics', href: '/docs/evaluation/builtin/categories/text' }, + { title: 'Format & validation', href: '/docs/evaluation/builtin/categories/format' }, + { title: 'Code', href: '/docs/evaluation/builtin/categories/code' }, + { title: 'Multimodal', href: '/docs/evaluation/builtin/categories/multimodal' }, + { title: 'Audio', href: '/docs/evaluation/builtin/categories/audio' }, + ] + }, + { + title: 'Reference', + items: [ + { title: 'Eval result schema', href: '/docs/evaluation/reference/result-schema' }, + { title: 'Evaluator input schema', href: '/docs/evaluation/reference/input-schema' }, + { title: 'Score types', href: '/docs/evaluation/reference/score-types' }, + ] + }, + { + title: 'Troubleshooting', items: [ - { title: 'Built-in Evals', href: '/docs/evaluation/builtin' }, - { title: 'Evaluate via Platform & SDK', href: '/docs/evaluation/features/evaluate' }, - { title: 'Create Custom Evals', href: 
'/docs/evaluation/features/custom' }, - { title: 'Test Playground', href: '/docs/evaluation/features/test-playground' }, - { title: 'Ground Truth', href: '/docs/evaluation/features/ground-truth' }, - { title: 'Error Localization', href: '/docs/evaluation/features/error-localization' }, - { title: 'Configure MCP Connectors for an Eval', href: '/docs/evaluation/features/mcp-connectors' }, - { title: 'Use Custom Models', href: '/docs/evaluation/features/custom-models' }, - { title: 'Future AGI Models', href: '/docs/evaluation/features/futureagi-models' }, - { title: 'Evaluate CI/CD Pipeline', href: '/docs/evaluation/features/cicd' }, + { title: 'Scores changed unexpectedly', href: '/docs/evaluation/troubleshooting/score-drift' }, + { title: 'Judge output is inconsistent', href: '/docs/evaluation/troubleshooting/judge-variance' }, + { title: 'Eval run is slow', href: '/docs/evaluation/troubleshooting/slow-runs' }, + { title: "Dataset fields don't match", href: '/docs/evaluation/troubleshooting/mapping' }, + { title: 'CI eval gate failed', href: '/docs/evaluation/troubleshooting/ci-failures' }, ] }, ] diff --git a/src/lib/redirects.ts b/src/lib/redirects.ts index 100ced32..c3d78268 100644 --- a/src/lib/redirects.ts +++ b/src/lib/redirects.ts @@ -114,7 +114,7 @@ export const redirectMap: Record = { '/future-agi/get-started/evaluation/builtin-evals/fuzzy-match': '/docs/evaluation/builtin/fuzzy-match', '/future-agi/get-started/evaluation/builtin-evals/groundedness': '/docs/evaluation/builtin/groundedness', '/future-agi/get-started/evaluation/builtin-evals/hit-rate': '/docs/evaluation/builtin/hit-rate', - '/future-agi/get-started/evaluation/builtin-evals/instruction-adherence': '/docs/evaluation/builtin/instruction-adherence', + '/future-agi/get-started/evaluation/builtin-evals/instruction-adherence': '/docs/evaluation/builtin/prompt-adherence', '/future-agi/get-started/evaluation/builtin-evals/is-compliant': '/docs/evaluation/builtin', 
'/future-agi/get-started/evaluation/builtin-evals/is-concise': '/docs/evaluation/builtin/is-concise', '/future-agi/get-started/evaluation/builtin-evals/is-email': '/docs/evaluation/builtin/is-email', @@ -154,12 +154,12 @@ export const redirectMap: Record = { '/future-agi/get-started/evaluation/builtin-evals/toxicity': '/docs/evaluation/builtin/toxicity', '/future-agi/get-started/evaluation/builtin-evals/translation-accuracy': '/docs/evaluation/builtin/translation-accuracy', '/future-agi/get-started/evaluation/builtin-evals/valid-links': '/docs/evaluation/builtin', - '/future-agi/get-started/evaluation/create-custom-evals': '/docs/evaluation/features/custom', - '/future-agi/get-started/evaluation/evaluate-ci-cd-pipeline': '/docs/evaluation/features/cicd', - '/future-agi/get-started/evaluation/evaluate-patterns': '/docs/evaluation/features/evaluate', - '/future-agi/get-started/evaluation/future-agi-models': '/docs/evaluation/features/futureagi-models', - '/future-agi/get-started/evaluation/running-your-first-eval': '/docs/evaluation/features/evaluate', - '/future-agi/get-started/evaluation/use-custom-models': '/docs/evaluation/features/custom-models', + '/future-agi/get-started/evaluation/create-custom-evals': '/docs/evaluation/build/custom', + '/future-agi/get-started/evaluation/evaluate-ci-cd-pipeline': '/docs/evaluation/run/cicd', + '/future-agi/get-started/evaluation/evaluate-patterns': '/docs/evaluation/run/in-the-ui', + '/future-agi/get-started/evaluation/future-agi-models': '/docs/evaluation/judge-models/futureagi', + '/future-agi/get-started/evaluation/running-your-first-eval': '/docs/evaluation/run/in-the-ui', + '/future-agi/get-started/evaluation/use-custom-models': '/docs/evaluation/judge-models/custom', '/future-agi/get-started/knowledge-base/concept': '/docs/knowledge-base/concepts/concept', '/future-agi/get-started/knowledge-base/how-to/create-kb-using-sdk': '/docs/knowledge-base/features/sdk', 
'/future-agi/get-started/knowledge-base/how-to/create-kb-using-ui': '/docs/knowledge-base/features/ui', diff --git a/src/pages/docs/cookbook/decrease-hallucination.mdx b/src/pages/docs/cookbook/decrease-hallucination.mdx index d4677500..984c936d 100644 --- a/src/pages/docs/cookbook/decrease-hallucination.mdx +++ b/src/pages/docs/cookbook/decrease-hallucination.mdx @@ -592,7 +592,7 @@ The winner configuration was CharacterTextSplitter_mmr_map_rerank, which combine - **Can I create custom evaluations tailored to my RAG use case in Future AGI?** - Yes. The Deterministic Eval template in Future AGI supports custom evaluations (***Click [here](/docs/evaluation/features/custom) to learn more about deterministic eval***). This lets you apply stringent criteria to your RAG outputs minimising variability. + Yes. The Deterministic Eval template in Future AGI supports custom evaluations (***Click [here](/docs/evaluation/build/custom) to learn more about deterministic eval***). This lets you apply stringent criteria to your RAG outputs minimising variability. ## Ready to Reduce Hallucinations in Your RAG Applications? 
diff --git a/src/pages/docs/cookbook/evaluation/eval-correction-loop.mdx b/src/pages/docs/cookbook/evaluation/eval-correction-loop.mdx index 4141f59e..53d37cd0 100644 --- a/src/pages/docs/cookbook/evaluation/eval-correction-loop.mdx +++ b/src/pages/docs/cookbook/evaluation/eval-correction-loop.mdx @@ -250,10 +250,10 @@ You ran a built-in eval, found rows where it disagreed with human judgment, enco ## Explore further - + Full reference for the custom eval template API - + Pick the right judge model: turing_small, turing_flash, turing_large diff --git a/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx b/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx index 22bce32f..2ec0cdab 100644 --- a/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx +++ b/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx @@ -58,7 +58,7 @@ pip install fastmcp python crm_mcp_server.py ``` -Expose it through ngrok or your own tunnel so Future AGI can reach it: +Expose it through ngrok or your own tunnel so FutureAGI can reach it: ```bash ngrok http 8000 @@ -68,7 +68,7 @@ Grab the public URL (e.g. `https://abc123.ngrok-free.app`). --- -## Step 2: Register the connector in Future AGI +## Step 2: Register the connector in FutureAGI 1. Open **Settings → Connectors → Add Connector**. 2. Fill in: @@ -162,7 +162,7 @@ Open any row to see the **Tool Trace** — the exact tool call and response the - **Add more tools**: Extend the MCP server with `get_subscription`, `get_invoices`, `get_feature_flags`. The judge will pick the right tool per claim. - **Use a real CRM**: Replace the stub with HubSpot, Stripe, or Linear MCP servers. Set the connector auth to OAuth or bearer. -- **Score traces, not datasets**: The same Agent-mode eval can run against [traced](/docs/observe) production conversations — wire it as a [continuous evaluation](/docs/evaluation/features/cicd). 
+- **Score traces, not datasets**: The same Agent-mode eval can run against [traced](/docs/observe) production conversations — wire it as a [continuous evaluation](/docs/evaluation/run/cicd). - **Cross-source evals**: Enable two connectors at once (CRM + Notion docs). The judge calls both when the reply cites a product feature *and* a customer record. --- @@ -179,6 +179,6 @@ Open any row to see the **Tool Trace** — the exact tool call and response the ## Next steps - [MCP Connectors concept](/docs/evaluation/concepts/mcp-connectors): The model behind Agent-mode evals. -- [Configure MCP Connectors for an Eval](/docs/evaluation/features/mcp-connectors): The UI walkthrough. -- [Create custom evals](/docs/evaluation/features/custom): Building eval templates without connectors. +- [Configure MCP Connectors for an Eval](/docs/evaluation/build/mcp-connectors): The UI walkthrough. +- [Create custom evals](/docs/evaluation/build/custom): Building eval templates without connectors. - [Falcon AI MCP Connectors](/docs/falcon-ai/features/mcp-connectors): Registering and authenticating a connector. diff --git a/src/pages/docs/dataset/features/experiments.mdx b/src/pages/docs/dataset/features/experiments.mdx index 5b026a59..0e1b4629 100644 --- a/src/pages/docs/dataset/features/experiments.mdx +++ b/src/pages/docs/dataset/features/experiments.mdx @@ -77,7 +77,7 @@ Experiment creation is a guided three-step flow: **Basic Info → Configuration Models you've added through Custom Models show up in the model picker for prompt configurations across all experiment types. - See [Custom Models](/docs/evaluation/features/custom-models) for how to register a custom or self-hosted model. + See [Custom Models](/docs/evaluation/judge-models/custom) for how to register a custom or self-hosted model. 
@@ -90,7 +90,7 @@ Experiment creation is a guided three-step flow: **Basic Info → Configuration **Compare against baseline (optional)**: pick a column from the dataset to compare model outputs against (typically a ground-truth or existing run-prompt column). Skip it if you don't have a reference output yet; you can still run the experiment, attach evals that don't need a baseline, and add a base column later by editing the experiment. - **Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/features/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable. + **Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/build/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable. ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/7.png) For each eval, map its inputs (e.g. `output`, `input`, `expected`) to the model output or to dataset columns. Mapping is required before the experiment can run. diff --git a/src/pages/docs/dataset/features/run-prompt.mdx b/src/pages/docs/dataset/features/run-prompt.mdx index 7a09a37e..437b548f 100644 --- a/src/pages/docs/dataset/features/run-prompt.mdx +++ b/src/pages/docs/dataset/features/run-prompt.mdx @@ -36,28 +36,28 @@ Run Prompt lets you add a new column to your dataset that is filled by a model ( Choose **LLM** for text generation (chat). Use for Q&A, summarization, or any text-in, text-out task. Select a chat model from the list; ensure the provider has an API key configured. ![LLM](/screenshot/product/dataset/how-to/run-prompt-in-dataset/3.png) - Click [here](/docs/evaluation/features/custom-models) to learn how to create custom models. 
+ Click [here](/docs/evaluation/judge-models/custom) to learn how to create custom models. Choose **Text-to-Speech** to generate audio from text. The prompt output column will store audio (e.g. URLs). You can configure voice and format for supported TTS models. ![Text-to-Speech](/screenshot/product/dataset/how-to/run-prompt-in-dataset/4.png) - Click [here](/docs/evaluation/features/custom-models) to learn how to create custom models. + Click [here](/docs/evaluation/judge-models/custom) to learn how to create custom models. Choose **Speech-to-Text** to transcribe audio into text. Use when a column contains audio; the model output will be text in the new column. ![Speech-to-Text](/screenshot/product/dataset/how-to/run-prompt-in-dataset/5.png) - Click [here](/docs/evaluation/features/custom-models) to learn how to create custom models. + Click [here](/docs/evaluation/judge-models/custom) to learn how to create custom models. Choose **Image Generation** to create images from text (or image + text) prompts. The prompt output column will store image URLs. Select an image-generation model and ensure the provider has an API key configured. ![Image Generation](/screenshot/product/dataset/how-to/run-prompt-in-dataset/6.png) - Click [here](/docs/evaluation/features/custom-models) to learn how to create custom models. + Click [here](/docs/evaluation/judge-models/custom) to learn how to create custom models. diff --git a/src/pages/docs/evaluation/features/custom.mdx b/src/pages/docs/evaluation/build/custom.mdx similarity index 86% rename from src/pages/docs/evaluation/features/custom.mdx rename to src/pages/docs/evaluation/build/custom.mdx index f25169ad..f74eafb7 100644 --- a/src/pages/docs/evaluation/features/custom.mdx +++ b/src/pages/docs/evaluation/build/custom.mdx @@ -1,6 +1,6 @@ --- title: "Create Custom Evals" -description: "Create your own eval templates in Future AGI as Agents, LLM-As-A-Judge, or Code, including all configuration options shown in the UI." 
+description: "Create your own eval templates in FutureAGI as Agents, LLM-As-A-Judge, or Code, including all configuration options shown in the UI." --- ## About @@ -15,12 +15,18 @@ Once a template is saved, use it the same way as any built-in: apply it to a dat ## Prerequisites -- A Future AGI workspace -- A Future AGI API key (Settings → API Keys) for SDK or API use +- A FutureAGI workspace +- A FutureAGI API key (Settings → API Keys) for SDK or API use --- -## Create from the UI +## Create a custom eval + +Pick the interface that matches how you work. The UI is fastest for one-off creation and visual iteration; the SDK is the right path when you want to version evals with your application code or generate them from configuration. + + + + @@ -167,7 +173,7 @@ Pick what the eval returns. See [Output types](/docs/evaluation/concepts/output- -**Error Localization** is a toggle right below the output type. When on, the eval analyses why a row failed and surfaces the offending field. Available for Agent and LLM-As-A-Judge evals. See [Error Localization](/docs/evaluation/features/error-localization). +**Error Localization** is a toggle right below the output type. When on, the eval analyses why a row failed and surfaces the offending field. Available for Agent and LLM-As-A-Judge evals. See [Error Localization](/docs/evaluation/build/error-localization). The **Advanced** section is collapsible. Open it to set: @@ -180,7 +186,7 @@ The **Advanced** section is collapsible. Open it to set: -Click **Test Evaluation** to run the eval against a sample input without saving. The test panel lets you fill in the variables manually or pull a row from a dataset, span, or simulation. See [Test Playground](/docs/evaluation/features/test-playground). +Click **Test Evaluation** to run the eval against a sample input without saving. The test panel lets you fill in the variables manually or pull a row from a dataset, span, or simulation. 
See [Test Playground](/docs/evaluation/build/test-playground). The Test Evaluation button is the right way to validate the criteria before committing the template. Nothing is persisted by Test. @@ -190,17 +196,17 @@ The Test Evaluation button is the right way to validate the criteria before comm Click **Save**. The template is created and the first version (V1) is recorded. The eval appears in the evaluation list. -To use it: apply it to a dataset, attach it to a trace project, run it from the SDK, or include it in a composite. See [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate). +To use it: apply it to a dataset, attach it to a trace project, run it from the SDK, or include it in a composite. See [Run evals in the UI](/docs/evaluation/run/in-the-ui). ---- + -## Create from the SDK + -The SDK calls the same API the UI uses. Use this when you want to define evals in code, version them with your application, or generate them from configuration. +Define evals in code when you want to version them with your application, generate them from configuration, or keep template authoring inside a CI flow. The SDK calls the same API the UI uses. 
@@ -300,29 +306,29 @@ curl -X POST https://api.futureagi.com/model-hub/eval-templates/create-v2/ \ Once created, run the eval by referencing its `name`: ```python -from fi.evals import Evaluator - -evaluator = Evaluator( - fi_api_key="YOUR_API_KEY", - fi_secret_key="YOUR_SECRET_KEY", -) +from fi.evals import evaluate -result = evaluator.evaluate( - eval_templates="response_groundedness", - inputs={ - "context": "Paris is the capital of France.", - "output": "The capital of France is Paris.", - }, +result = evaluate( + "response_groundedness", + context="Paris is the capital of France.", + output="The capital of France is Paris.", ) -print(result.eval_results[0].output) # "Passed" -print(result.eval_results[0].reason) # explanation +print(result.passed) # True +print(result.score) # 1.0 +print(result.reason) ``` +See [Run evals with the Python SDK](/docs/evaluation/run/python-sdk) for the full SDK reference. + + + + + --- ## Field reference @@ -367,19 +373,9 @@ Editing the template later creates `V2`, `V3`, ... with `V1` preserved as histor --- -## Next Steps - - - - Try your eval against a sample input before applying it. - - - Apply the eval to a dataset, trace project, simulation, or SDK call. - - - Bundle this eval with others into a single composite check. - - - Pick the right type for your use case. - - +## Next steps + +- [Test playground](/docs/evaluation/build/test-playground): try your eval against a sample input before applying it. +- [Run evals in the UI](/docs/evaluation/run/in-the-ui): apply the eval to a dataset, trace project, simulation, or SDK call. +- [Composite evals](/docs/evaluation/concepts/composite-evals): bundle this eval with others into a single composite check. +- [Eval types](/docs/evaluation/concepts/eval-types): pick the right type for your use case. 
diff --git a/src/pages/docs/evaluation/features/error-localization.mdx b/src/pages/docs/evaluation/build/error-localization.mdx similarity index 97% rename from src/pages/docs/evaluation/features/error-localization.mdx rename to src/pages/docs/evaluation/build/error-localization.mdx index aa4f99c9..bc034f79 100644 --- a/src/pages/docs/evaluation/features/error-localization.mdx +++ b/src/pages/docs/evaluation/build/error-localization.mdx @@ -154,19 +154,19 @@ print(result.eval_results[0].error_localizer) # selected_input_key + error_ana --- -## Next Steps +## Next steps How verdicts and reasons are reported. - + Toggle Error Localization when authoring a template. - + Apply an eval with Error Localization to a dataset or trace. - + Pair Error Localization with ground truth data for richer analysis. diff --git a/src/pages/docs/evaluation/features/ground-truth.mdx b/src/pages/docs/evaluation/build/ground-truth.mdx similarity index 98% rename from src/pages/docs/evaluation/features/ground-truth.mdx rename to src/pages/docs/evaluation/build/ground-truth.mdx index 82968f65..144ea56d 100644 --- a/src/pages/docs/evaluation/features/ground-truth.mdx +++ b/src/pages/docs/evaluation/build/ground-truth.mdx @@ -180,19 +180,19 @@ For high-volume runs, prefer concise examples to keep the per-row cost down. --- -## Next Steps +## Next steps - + Author the template that ground truth attaches to. Templates and the role of ground truth in their config. - + Test the eval with ground truth on before saving. - + Pair ground truth with localization for deeper failure analysis. 
diff --git a/src/pages/docs/evaluation/features/mcp-connectors.mdx b/src/pages/docs/evaluation/build/mcp-connectors.mdx similarity index 88% rename from src/pages/docs/evaluation/features/mcp-connectors.mdx rename to src/pages/docs/evaluation/build/mcp-connectors.mdx index c406e2a8..312b772b 100644 --- a/src/pages/docs/evaluation/features/mcp-connectors.mdx +++ b/src/pages/docs/evaluation/build/mcp-connectors.mdx @@ -1,6 +1,6 @@ --- title: "Configure MCP Connectors for an Eval" -description: "Step-by-step guide to attach MCP connectors to an Agent-mode eval in the Future AGI platform." +description: "Step-by-step guide to attach MCP connectors to an Agent-mode eval in the FutureAGI platform." --- ## About @@ -23,16 +23,13 @@ This page shows how to attach MCP connectors to an eval so the judge can call ex From the dataset, simulation, or trace view click **Evaluate**. The eval configuration panel opens on the right. - ![Eval configuration panel](/screenshot/product/evaluation/mcp-connectors/1.png) - - Image placeholder — replace with the actual eval picker screenshot. + ![Eval configuration panel](/images/docs/evaluation/mcp-connectors/1.png) Click **Add Evaluation**, then **Create your own eval** for a new template, or pick a built-in template that already runs in Agent mode (any **Customer Agent** template). - ![Add a custom eval](/screenshot/product/evaluation/mcp-connectors/2.png) - Image placeholder. + ![Add a custom eval](/images/docs/evaluation/mcp-connectors/2.png) @@ -47,9 +44,7 @@ This page shows how to attach MCP connectors to an eval so the judge can call ex In **Tools**, toggle on the connectors you want this eval to use. Each connector lists the discovered tools below its name; you can disable individual tools to keep the judge focused. - ![Connectors selection in eval](/screenshot/product/evaluation/mcp-connectors/3.png) - Image placeholder. 
- + ![Connectors selection in eval](/images/docs/evaluation/mcp-connectors/3.png) Toggle **Internet** on if the judge should be able to fetch public web pages. @@ -69,16 +64,13 @@ This page shows how to attach MCP connectors to an eval so the judge can call ex Choose a model that supports tool calling. `turing_large` is recommended; `turing_large_xl` handles long tool outputs better when the connector returns large payloads. - ![Pick judge model](/screenshot/product/evaluation/mcp-connectors/4.png) - Image placeholder. + ![Pick judge model](/images/docs/evaluation/mcp-connectors/4.png) Use **Test on Sample** with the eval picker open to score one row against your tools before you commit to the full run. The test panel shows the tool calls the judge made, the responses, and the final score. - ![Test on sample](/screenshot/product/evaluation/mcp-connectors/5.png) - Image placeholder. - + ![Test on sample](/images/docs/evaluation/mcp-connectors/5.png) If the judge did not call the expected tool, refine the rule prompt. If the tool call failed, fix the connector configuration in Settings before running at scale. @@ -160,4 +152,4 @@ Pair with a GitHub MCP connector with `get_pr_diff` enabled. - [Eval with MCP connectors cookbook](/docs/cookbook/evaluation/eval-with-mcp-connectors): A full end-to-end example. - [MCP Connectors concept](/docs/evaluation/concepts/mcp-connectors): What runs under the hood. -- [Create custom evals](/docs/evaluation/features/custom): For non-Agent eval templates. +- [Create custom evals](/docs/evaluation/build/custom): For non-Agent eval templates. 
diff --git a/src/pages/docs/evaluation/features/test-playground.mdx b/src/pages/docs/evaluation/build/test-playground.mdx similarity index 96% rename from src/pages/docs/evaluation/features/test-playground.mdx rename to src/pages/docs/evaluation/build/test-playground.mdx index 669e1986..f8eafb13 100644 --- a/src/pages/docs/evaluation/features/test-playground.mdx +++ b/src/pages/docs/evaluation/build/test-playground.mdx @@ -123,16 +123,16 @@ This means you can edit the criteria, click Test, see the new verdict, edit agai --- -## Next Steps +## Next steps - + Author a template and test it as you go. - + Apply a tested eval to a dataset or trace project. - + Save creates a new version. Old versions stay for rollback. diff --git a/src/pages/docs/evaluation/builtin/answer-similarity.mdx b/src/pages/docs/evaluation/builtin/answer-similarity.mdx deleted file mode 100644 index 5a5c9e4c..00000000 --- a/src/pages/docs/evaluation/builtin/answer-similarity.mdx +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Answer Similarity: Built-in Evaluation" -description: "Evaluates the similarity between the expected and actual responses" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="answer_similarity", - inputs={ - "expected_response": "...", - "response": "..." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "answer_similarity", - { - expected_response: "...", - response: "..." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `expected_response` | `string` | The expected correct response. | -| | `response` | `string` | The actual response to be evaluated. 
| - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/api-call.mdx b/src/pages/docs/evaluation/builtin/api-call.mdx deleted file mode 100644 index f023bbc8..00000000 --- a/src/pages/docs/evaluation/builtin/api-call.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Api Call: Built-in Evaluation" -description: "Makes an API call and evaluates the response" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="api_call", - inputs={ - "response": "..." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "api_call", - { - response: "..." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `response` | `string` | The response to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Code`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/categories/agent.mdx b/src/pages/docs/evaluation/builtin/categories/agent.mdx new file mode 100644 index 00000000..a0b2ddcc --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/agent.mdx @@ -0,0 +1,67 @@ +--- +title: "Agent & conversation evaluators" +description: "Score multi-turn agent behavior, tool calling, and conversation quality. 
These evaluators are the right pick when you are building chatbots, support agents, or tool-using agents and need to assess behavior across a whole interaction." +--- + +## About + +Score multi-turn agent behavior, tool calling, and conversation quality. These evaluators are the right pick when you are building chatbots, support agents, or tool-using agents and need to assess behavior across a whole interaction. + +This category contains 30 eval templates. Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on the RAG page and on every other category page it is tagged for. + +--- + +## When to use + +- You're evaluating a customer-support agent across full conversations. +- You need to check whether an agent called the right tool with the right arguments. +- You want to detect loops, missed escalations, or premature terminations. +- You're scoring task completion and helpfulness end to end. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`answer_refusal`](/docs/evaluation/builtin/answer-refusal) | Checks if the model correctly refuses to answer when prompted with harmful, sensitive, or restricted queries. | +| [`bias_detection`](/docs/evaluation/builtin/bias-detection) | Identifies various forms of bias including gender, racial, cultural, or ideological bias in the output content. | +| [`completeness`](/docs/evaluation/builtin/completeness) | Evaluates if the response completely answers the query | +| [`conversation_coherence`](/docs/evaluation/builtin/conversation-coherence) | Evaluates if a conversation flows logically and maintains context throughout | +| [`conversation_resolution`](/docs/evaluation/builtin/conversation-resolution) | Checks if the conversation reaches a satisfactory conclusion or resolution. 
The conversation must have at least | +| [`cultural_sensitivity`](/docs/evaluation/builtin/cultural-sensitivity) | Analyzes output for cultural appropriateness, inclusive language, and awareness of cultural nuances. Identifies | +| [`customer_agent_clarification_seeking`](/docs/evaluation/builtin/customer-agent-clarification-seeking) | Assesses if the bot seeks clarification when needed rather than guessing. | +| [`customer_agent_context_retention`](/docs/evaluation/builtin/customer-agent-context-retention) | Checks if the bot remembers context from earlier in the conversation | +| [`customer_agent_conversation_quality`](/docs/evaluation/builtin/customer-agent-conversation-quality) | Conversation-level quality metric that assesses overall user experience. | +| [`customer_agent_human_escalation`](/docs/evaluation/builtin/customer-agent-human-escalation) | Tracks if the bot escalates to a human agent appropriately based on user frustration, complexity of queries, | +| [`customer_agent_interruption_handling`](/docs/evaluation/builtin/customer-agent-interruption-handling) | Monitors whether the bot talks over the user. Uses barge-in detection logs to confirm the bot waits for user | +| [`customer_agent_language_handling`](/docs/evaluation/builtin/customer-agent-language-handling) | Verifies the bot correctly detects the language/dialect and responds appropriately, including mid-call language | +| [`customer_agent_loop_detection`](/docs/evaluation/builtin/customer-agent-loop-detection) | Identifies if the bot gets stuck asking the same question repeatedly or circling back in loops. | +| [`customer_agent_objection_handling`](/docs/evaluation/builtin/customer-agent-objection-handling) | Monitors the agent's ability to handle customer objections effectively. 
| +| [`customer_agent_prompt_conformance`](/docs/evaluation/builtin/customer-agent-prompt-conformance) | Measures how well the bot adheres to system prompt constraints across the conversation, including persona consistency, | +| [`customer_agent_query_handling`](/docs/evaluation/builtin/customer-agent-query-handling) | Checks if the bot correctly interprets user queries and gives relevant answers. | +| [`customer_agent_termination_handling`](/docs/evaluation/builtin/customer-agent-termination-handling) | Tracks occurrences of bot freezing, hanging up abruptly, crashes, or early cut-offs. | +| [`evaluate_function_calling`](/docs/evaluation/builtin/llm-function-calling) | Tests if the model correctly identifies when to trigger a tool/function and includes the right arguments in the | +| [`is_concise`](/docs/evaluation/builtin/is-concise) | Measures whether the answer is brief and to the point, avoiding redundancy. | +| [`is_helpful`](/docs/evaluation/builtin/is-helpful) | Evaluates whether the response answers the user's question effectively. | +| [`is_informal_tone`](/docs/evaluation/builtin/is-informal-tone) | Detects whether the tone is informal or casual (e.g., use of slang, contractions, emoji). | +| [`is_polite`](/docs/evaluation/builtin/is-polite) | Ensures that the output maintains a respectful, kind, and non-aggressive tone. | +| [`no_apologies`](/docs/evaluation/builtin/no-apologies) | Checks if the model unnecessarily apologizes, e.g., 'I'm sorry, but…' | +| [`no_llm_reference`](/docs/evaluation/builtin/no-llm-reference) | Ensures that the model response does not mention being an OpenAI model or reference its training data or providers. | +| [`step_count`](/docs/evaluation/builtin/step-count) | Counts and validates the number of steps/actions in an agent trajectory. Can check against exact count, minimum, | +| [`task_completion`](/docs/evaluation/builtin/task-completion) | Measures whether the model fulfilled the user's request accurately and completely. 
| +| [`tone`](/docs/evaluation/builtin/tone) | Analyzes the tone and sentiment of content | +| [`tool_call_accuracy`](/docs/evaluation/builtin/tool-call-accuracy) | Evaluates accuracy of agent tool/function calls by comparing actual vs expected calls. Checks function names and | +| [`toxicity`](/docs/evaluation/builtin/toxicity) | Evaluates content for toxic or harmful language | +| [`trajectory_match`](/docs/evaluation/builtin/trajectory-match) | Validates agent action/tool call sequences. Supports strict (same order), unordered (any order), subset (expected | + +--- + +## Next steps + +- [Run evals on traces](/docs/observe/features/evals) +- [Run evals in simulation](/docs/quickstart/running-evals-in-simulation) +- [Task completion (eval template)](/docs/evaluation/builtin/task-completion) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. diff --git a/src/pages/docs/evaluation/builtin/categories/audio.mdx b/src/pages/docs/evaluation/builtin/categories/audio.mdx new file mode 100644 index 00000000..0755b6fd --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/audio.mdx @@ -0,0 +1,44 @@ +--- +title: "Audio evaluators" +description: "Score speech, transcription, and audio quality. Use these for voice agents, ASR/STT pipelines, and TTS output." +--- + +## About + +Score speech, transcription, and audio quality. Use these for voice agents, ASR/STT pipelines, and TTS output. + +This category contains 9 eval templates. Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on the RAG page and on every other category page it is tagged for. + +--- + +## When to use + +- You're evaluating speech-to-text accuracy against reference transcripts (WER, CER, MER). 
+- You need text-to-speech accuracy or audio quality scoring. +- You're running a voice agent and need conversation-level scoring on the audio modality. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`ASR/STT_accuracy`](/docs/evaluation/builtin/audio-transcription) | Analyzes the accuracy of transcriptions generated from audio inputs by Automatic Speech Recognition (ASR) or | +| [`audio_quality`](/docs/evaluation/builtin/audio-quality) | Evaluates the overall quality of the given audio, like MOS (Mean Opinion Score) evaluation | +| [`character_error_rate`](/docs/evaluation/builtin/character-error-rate) | Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between | +| [`customer_agent_interruption_handling`](/docs/evaluation/builtin/customer-agent-interruption-handling) | Monitors whether the bot talks over the user. Uses barge-in detection logs to confirm the bot waits for user | +| [`match_error_rate`](/docs/evaluation/builtin/match-error-rate) | Computes Match Error Rate (MER) for speech recognition. MER = edits / (hits + edits). Returns 1-MER as score. | +| [`TTS_accuracy`](/docs/evaluation/builtin/tts-accuracy) | Analyzes if the text-to-speech output accurately reflects the intended message, including pronunciation, emphasis, | +| [`word_error_rate`](/docs/evaluation/builtin/word-error-rate) | Computes Word Error Rate (WER) for ASR/STT evaluation. WER measures the edit distance at the word level between | +| [`word_info_lost`](/docs/evaluation/builtin/word-info-lost) | Computes Word Information Lost (WIL) for speech. WIL = 1 - (hits/ref * hits/hyp). Returns 1-WIL as score. | +| [`word_info_preserved`](/docs/evaluation/builtin/word-info-preserved) | Computes Word Information Preserved (WIP) for speech. WIP = (hits/ref) * (hits/hyp). Higher = better. 
| + +--- + +## Next steps + +- [Voice observability](/docs/observe/features/voice) +- [Word error rate (eval template)](/docs/evaluation/builtin/word-error-rate) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. diff --git a/src/pages/docs/evaluation/builtin/categories/code.mdx b/src/pages/docs/evaluation/builtin/categories/code.mdx new file mode 100644 index 00000000..56a61b34 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/code.mdx @@ -0,0 +1,47 @@ +--- +title: "Code evaluators" +description: "Score code generation, tool calling, and SQL output. These evaluators check syntax, semantic correctness, and tool-use behavior for code-producing agents." +--- + +## About + +Score code generation, tool calling, and SQL output. These evaluators check syntax, semantic correctness, and tool-use behavior for code-producing agents. + +This category contains 11 eval templates. Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on the RAG page and on every other category page it is tagged for. + +--- + +## When to use + +- You're evaluating a code generation model and need syntax-aware comparison. +- You're checking whether an agent's tool/function calls are correct. +- You're scoring text-to-SQL output for accuracy and validity. +- You need to validate code complexity or syntax without execution. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`code_bleu`](/docs/evaluation/builtin/code-bleu) | Computes CodeBLEU - a code-aware BLEU variant that combines standard n-gram BLEU with keyword matching for code-specific | +| [`code_complexity`](/docs/evaluation/builtin/code-complexity) | Computes cyclomatic complexity of Python code using AST analysis. 
Counts decision points (if, for, while, except, | +| [`contains_code`](/docs/evaluation/builtin/is-code) | Checks whether the output is valid code or contains expected code snippets. | +| [`evaluate_function_calling`](/docs/evaluation/builtin/llm-function-calling) | Tests if the model correctly identifies when to trigger a tool/function and includes the right arguments in the | +| [`is_html`](/docs/evaluation/builtin/is-html) | Validates if text contains well-formed HTML with proper tag nesting. Checks for the presence of HTML tags and validates | +| [`is_json`](/docs/evaluation/builtin/is-json) | Validates if content is proper JSON format | +| [`is_sql`](/docs/evaluation/builtin/is-sql) | Validates if text appears to be syntactically valid SQL. Checks for recognized SQL keywords, balanced parentheses, | +| [`is_xml`](/docs/evaluation/builtin/is-xml) | Validates if text is well-formed XML. Checks that the text can be parsed as a valid XML document. | +| [`syntax_validation`](/docs/evaluation/builtin/syntax-validation) | Validates code syntax without executing it. Supports Python (via ast.parse), JSON, and basic JavaScript bracket | +| [`text_to_sql`](/docs/evaluation/builtin/text-to-sql) | Evaluates if the model correctly converts natural language text into valid and accurate SQL queries. | +| [`tool_call_accuracy`](/docs/evaluation/builtin/tool-call-accuracy) | Evaluates accuracy of agent tool/function calls by comparing actual vs expected calls. Checks function names and | + +--- + +## Next steps + +- [Tool call accuracy (eval template)](/docs/evaluation/builtin/tool-call-accuracy) +- [CodeBLEU (eval template)](/docs/evaluation/builtin/code-bleu) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. 
diff --git a/src/pages/docs/evaluation/builtin/categories/format.mdx b/src/pages/docs/evaluation/builtin/categories/format.mdx new file mode 100644 index 00000000..46a045ce --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/format.mdx @@ -0,0 +1,78 @@ +--- +title: "Format & validation evaluators" +description: "Validate that outputs conform to expected formats, schemas, or structural rules. These are deterministic Code-based evaluators that run without a model call, making them fast and free." +--- + +## About + +Validate that outputs conform to expected formats, schemas, or structural rules. These are deterministic Code-based evaluators that run without a model call, making them fast and free. + +This category contains 42 eval templates. Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on the RAG page and on every other category page it is tagged for. + +--- + +## When to use + +- You need to check whether an output is valid JSON, XML, HTML, SQL, or matches a regex. +- You're validating output length, sentence count, or word count. +- You're checking exact match, contains, or starts-with rules. +- You need cheap, deterministic gates before sending output to downstream systems. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`accuracy`](/docs/evaluation/builtin/accuracy) | Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or | +| [`balanced_accuracy`](/docs/evaluation/builtin/balanced-accuracy) | Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy. | +| [`cohen_kappa`](/docs/evaluation/builtin/cohen-kappa) | Computes Cohen's Kappa coefficient for inter-rater agreement. Accounts for agreement occurring by chance. 
Range | +| [`contains_code`](/docs/evaluation/builtin/is-code) | Checks whether the output is valid code or contains expected code snippets. | +| [`contains_valid_link`](/docs/evaluation/builtin/contains-valid-link) | Checks for presence of valid URLs | +| [`f1_score`](/docs/evaluation/builtin/f1-score) | Computes token-level F1 score between output and expected text. Treats both texts as bags of tokens and calculates | +| [`f_beta_score`](/docs/evaluation/builtin/f-beta-score) | Computes F-beta score with configurable beta for precision/recall weighting. `beta < 1` favors precision, `beta > 1` favors recall. | +| [`fleiss_kappa`](/docs/evaluation/builtin/fleiss-kappa) | Computes Fleiss' Kappa for multi-rater agreement. Extends Cohen's Kappa to N raters. Input: matrix where rows=subjects,… | +| [`fuzzy_match`](/docs/evaluation/builtin/fuzzy-match) | Computes fuzzy string matching score using SequenceMatcher (difflib). Returns a similarity ratio between 0 and 1 | +| [`ground_truth_match`](/docs/evaluation/builtin/ground-truth-match) | Evaluates whether the model-generated output matches the provided ground-truth expected output. | +| [`hamming_similarity`](/docs/evaluation/builtin/hamming-similarity) | Computes Hamming similarity between two strings. Counts matching character positions normalized by the longer string | +| [`image_instruction_adherence`](/docs/evaluation/builtin/image-instruction-adherence) | Measures how well generated images adhere to the given text instruction. Evaluates whether the image(s) accurately | +| [`image_properties`](/docs/evaluation/builtin/image-properties) | Validates image properties including dimensions, format, and file size. Useful for ensuring generated images meet | +| [`is_email`](/docs/evaluation/builtin/is-email) | Validates email address format | +| [`is_html`](/docs/evaluation/builtin/is-html) | Validates if text contains well-formed HTML with proper tag nesting. 
Checks for the presence of HTML tags and validates | +| [`is_json`](/docs/evaluation/builtin/is-json) | Validates if content is proper JSON format | +| [`is_refusal`](/docs/evaluation/builtin/is-refusal) | Detects if LLM output is a refusal to answer using common refusal pattern matching. Returns True if refusal detected. | +| [`is_sql`](/docs/evaluation/builtin/is-sql) | Validates if text appears to be syntactically valid SQL. Checks for recognized SQL keywords, balanced parentheses, | +| [`is_url`](/docs/evaluation/builtin/is-url) | Validates if text is a properly formatted URL with a valid scheme and network location. | +| [`is_xml`](/docs/evaluation/builtin/is-xml) | Validates if text is well-formed XML. Checks that the text can be parsed as a valid XML document. | +| [`jaccard_similarity`](/docs/evaluation/builtin/jaccard-similarity) | Computes Jaccard similarity (intersection over union) between token sets of two texts. Useful for measuring set-level | +| [`jaro_winkler_similarity`](/docs/evaluation/builtin/jaro-winkler-similarity) | Computes Jaro-Winkler similarity between two strings. Particularly effective for short strings like names, labels, | +| [`json_diff`](/docs/evaluation/builtin/json-diff) | Deep structural comparison between two JSON objects. Recursively compares keys and values at all levels, returning | +| [`latency_check`](/docs/evaluation/builtin/latency-check) | Validates that response latency is within acceptable bounds. Pass if `latency <= max_latency_ms`. | +| [`levenshtein_similarity`](/docs/evaluation/builtin/lavenshtein-similarity) | Measures the number of edits (insertions, deletions, or substitutions) to transform generated text to reference | +| [`log_loss`](/docs/evaluation/builtin/log-loss) | Computes log loss (cross-entropy) for probability predictions. Returns 1/(1+loss) as score. Lower loss = higher score. 
| +| [`matthews_correlation`](/docs/evaluation/builtin/matthews-correlation) | Computes Matthews Correlation Coefficient (MCC). A balanced metric for binary and multiclass classification that | +| [`no_invalid_links`](/docs/evaluation/builtin/no-invalid-links) | Checks if the text contains no invalid URLs | +| [`numeric_similarity`](/docs/evaluation/builtin/numeric-similarity) | Extracts numeric values from generated text and computes the normalized difference from the reference number. | +| [`one_line`](/docs/evaluation/builtin/one-line) | Checks if the text is a single line | +| [`pearson_correlation`](/docs/evaluation/builtin/pearson-correlation) | Computes Pearson correlation coefficient between two sets of numeric values. Measures linear relationship strength (-1 to 1, normalized… | +| [`precision_score`](/docs/evaluation/builtin/precision-score) | Computes classification precision (TP / (TP + FP)) for binary or multiclass tasks. Measures how many positive predictions | +| [`prompt_instruction_adherence`](/docs/evaluation/builtin/prompt-adherence) | Evaluates whether the output follows the prompt’s instructions and required format. | +| [`r2_score`](/docs/evaluation/builtin/r2-score) | Computes R-squared (coefficient of determination). Measures proportion of variance explained by predictions. | +| [`rmse`](/docs/evaluation/builtin/rmse) | Computes Root Mean Squared Error between predicted and actual values. Returns 1/(1+RMSE) as score (higher=better). | +| [`semantic_list_contains`](/docs/evaluation/builtin/semantic-list-contains) | Checks if the generated response semantically contains one or more phrases from a reference list. | +| [`sentence_count`](/docs/evaluation/builtin/sentence-count) | Counts sentences in text and optionally validates against a min/max range. Useful for enforcing structural constraints | +| [`spearman_correlation`](/docs/evaluation/builtin/spearman-correlation) | Computes Spearman rank correlation coefficient. 
Measures monotonic relationship between two sets of values (-1 to 1, normalized to 0-1). | +| [`step_count`](/docs/evaluation/builtin/step-count) | Counts and validates the number of steps/actions in an agent trajectory. Can check against exact count, minimum, | +| [`syntax_validation`](/docs/evaluation/builtin/syntax-validation) | Validates code syntax without executing it. Supports Python (via ast.parse), JSON, and basic JavaScript bracket | +| [`trajectory_match`](/docs/evaluation/builtin/trajectory-match) | Validates agent action/tool call sequences. Supports strict (same order), unordered (any order), subset (expected | +| [`word_count_in_range`](/docs/evaluation/builtin/word-count-in-range) | Checks if the word count of text falls within a specified range. Useful for enforcing length constraints on generated | + +--- + +## Next steps + +- [Code eval type](/docs/evaluation/concepts/eval-types#code) +- [Is JSON (eval template)](/docs/evaluation/builtin/is-json) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. diff --git a/src/pages/docs/evaluation/builtin/categories/multimodal.mdx b/src/pages/docs/evaluation/builtin/categories/multimodal.mdx new file mode 100644 index 00000000..c9642c41 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/multimodal.mdx @@ -0,0 +1,45 @@ +--- +title: "Multimodal evaluators" +description: "Score image, OCR, and PDF outputs. These evaluators handle the modalities your text-only metrics can't reach." +--- + +## About + +Score image, OCR, and PDF outputs. These evaluators handle the modalities your text-only metrics can't reach. + +This category contains 9 eval templates. 
Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on the RAG page and on every other category page it is tagged for. + +--- + +## When to use + +- You're generating images and need to check adherence to a text instruction. +- You need image quality metrics (CLIP score, FID, SSIM, PSNR). +- You're validating OCR output against ground truth. +- You need to detect AI-generated images. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`caption_hallucination`](/docs/evaluation/builtin/caption-hallucination) | Evaluates whether image captions or descriptions contain factual inaccuracies or hallucinated details that are | +| [`clip_score`](/docs/evaluation/builtin/clip-score) | Computes the CLIP Score between images and text prompts. CLIP Score measures how well images match their text | +| [`fid_score`](/docs/evaluation/builtin/fid-score) | Computes the Frechet Inception Distance (FID) between two sets of images. FID measures the similarity between | +| [`image_instruction_adherence`](/docs/evaluation/builtin/image-instruction-adherence) | Measures how well generated images adhere to the given text instruction. Evaluates whether the image(s) accurately | +| [`image_properties`](/docs/evaluation/builtin/image-properties) | Validates image properties including dimensions, format, and file size. Useful for ensuring generated images meet | +| [`ocr_evaluation`](/docs/evaluation/builtin/ocr-evaluation) | Evaluates the quality of the given OCR output | +| [`psnr`](/docs/evaluation/builtin/psnr) | Computes Peak Signal-to-Noise Ratio (PSNR) between two images. Higher PSNR indicates more similar images. Returns | +| [`ssim`](/docs/evaluation/builtin/ssim) | Computes Structural Similarity Index (SSIM) between two images. 
Measures perceptual similarity based on luminance, | +| [`synthetic_image_evaluator`](/docs/evaluation/builtin/synthetic-image-evaluator) | Evaluates if the given image is generated by AI/Computer Graphics or authentically created. | + +--- + +## Next steps + +- [CLIP score (eval template)](/docs/evaluation/builtin/clip-score) +- [Image instruction adherence (eval template)](/docs/evaluation/builtin/image-instruction-adherence) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. diff --git a/src/pages/docs/evaluation/builtin/categories/rag.mdx b/src/pages/docs/evaluation/builtin/categories/rag.mdx new file mode 100644 index 00000000..91af35b7 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/rag.mdx @@ -0,0 +1,55 @@ +--- +title: "RAG & retrieval evaluators" +description: "Score retrieval-augmented generation pipelines and ranking quality. These evaluators check whether retrieved context is relevant, whether responses stay grounded in that context, and how well retrieval algorithms rank relevant chunks." +--- + +## About + +Score retrieval-augmented generation pipelines and ranking quality. These evaluators check whether retrieved context is relevant, whether responses stay grounded in that context, and how well retrieval algorithms rank relevant chunks. + +This category contains 18 eval templates. Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on the RAG page and on every other category page it is tagged for. + +--- + +## When to use + +- You have a RAG application and want to know whether wrong answers are coming from retrieval or from generation. +- You need ranking quality metrics (Hit Rate, MRR, NDCG, Precision@K, Recall@K) for tuning a retriever. 
+- You want to detect hallucinations in generated answers against retrieved evidence. +- You need to compare retrieval algorithms on the same corpus. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`caption_hallucination`](/docs/evaluation/builtin/caption-hallucination) | Evaluates whether image captions or descriptions contain factual inaccuracies or hallucinated details that are | +| [`chunk_attribution`](/docs/evaluation/builtin/chunk-attribution) | Tracks if the context chunk is used in generating the response. | +| [`chunk_utilization`](/docs/evaluation/builtin/chunk-utilization) | Measures how effectively context chunks are used in responses | +| [`context_adherence`](/docs/evaluation/builtin/context-adherence) | Measures how well responses stay within the provided context | +| [`context_relevance`](/docs/evaluation/builtin/context-relevance) | Evaluates the relevancy of the context to the query | +| [`detect_hallucination`](/docs/evaluation/builtin/detect-hallucination) | Identifies if the model fabricated facts or added information that was not present in the input or reference. | +| [`embedding_similarity`](/docs/evaluation/builtin/embedding-similarity) | Measures the cosine semantic similarity between the generated text and the reference text. | +| [`eval_ranking`](/docs/evaluation/builtin/eval-ranking) | Provides ranking score for each context based on specified criteria | +| [`groundedness`](/docs/evaluation/builtin/groundedness) | Evaluates whether the output content is grounded in the provided context. | +| [`hit_rate`](/docs/evaluation/builtin/hit-rate) | Hit Rate: Percentage of queries where at least one relevant chunk is retrieved. | +| [`mean_average_precision`](/docs/evaluation/builtin/mean-average-precision) | Computes Mean Average Precision (MAP) for information retrieval. Averages precision at each relevant item across queries. 
| +| [`mrr`](/docs/evaluation/builtin/mrr) | MRR (Mean Reciprocal Rank): Measures how early the first relevant chunk appears in the ranked results. | +| [`ndcg_at_k`](/docs/evaluation/builtin/ndcg-at-k) | NDCG@K (Normalized Discounted Cumulative Gain): Measures ranking quality at top K, giving more credit to relevant | +| [`non_llm_context_precision`](/docs/evaluation/builtin/non-llm-context-precision) | Non-LLM context precision for RAG evaluation. Measures what fraction of retrieved contexts match reference contexts using exact string… | +| [`non_llm_context_recall`](/docs/evaluation/builtin/non-llm-context-recall) | Non-LLM context recall for RAG evaluation. Measures what fraction of reference contexts were successfully retrieved. | +| [`precision_at_k`](/docs/evaluation/builtin/precision-at-k) | Precision@K: Out of the top K retrieved chunks, what fraction is actually relevant. | +| [`recall_at_k`](/docs/evaluation/builtin/recall-at-k) | Recall@K: Out of all truly relevant chunks, what fraction appears in the top K retrieved results. | +| [`squad_score`](/docs/evaluation/builtin/squad-score) | Computes SQuAD-style evaluation: exact match + token F1 for QA tasks. Normalizes text (lowercase, remove articles/punctuation) before… | + +--- + +## Next steps + +- [Run evals on a dataset](/docs/evaluation/run/in-the-ui) +- [Groundedness (eval template)](/docs/evaluation/builtin/groundedness) +- [RAG evaluation cookbook](/docs/cookbook/quickstart/rag-evaluation) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. 
diff --git a/src/pages/docs/evaluation/builtin/categories/safety.mdx b/src/pages/docs/evaluation/builtin/categories/safety.mdx new file mode 100644 index 00000000..8778c57b --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/safety.mdx @@ -0,0 +1,54 @@ +--- +title: "Safety & policy evaluators" +description: "Detect toxic, harmful, biased, or non-compliant output. Use these evaluators as gates on production traffic, in CI before release, or as part of red-teaming workflows." +--- + +## About + +Detect toxic, harmful, biased, or non-compliant output. Use these evaluators as gates on production traffic, in CI before release, or as part of red-teaming workflows. + +This category contains 17 eval templates. Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on the RAG page and on every other category page it is tagged for. + +--- + +## When to use + +- You need to block toxic or harmful responses before they reach users. +- You're checking output for PII or compliance violations (GDPR, HIPAA). +- You're red-teaming a prompt for injection or jailbreak resistance. +- You need bias and fairness checks across protected attributes. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`answer_refusal`](/docs/evaluation/builtin/answer-refusal) | Checks if the model correctly refuses to answer when prompted with harmful, sensitive, or restricted queries. | +| [`bias_detection`](/docs/evaluation/builtin/bias-detection) | Identifies various forms of bias including gender, racial, cultural, or ideological bias in the output content. | +| [`clinically_inappropriate_tone`](/docs/evaluation/builtin/clinically-inappropriate-tone) | Evaluates whether the model's tone is unsuitable for clinical or mental health contexts. 
| +| [`cultural_sensitivity`](/docs/evaluation/builtin/cultural-sensitivity) | Analyzes output for cultural appropriateness, inclusive language, and awareness of cultural nuances. Identifies | +| [`data_privacy_compliance`](/docs/evaluation/builtin/data-privacy) | Checks output content for compliance with GDPR and HIPAA, identifying potential privacy violations and sensitive | +| [`is_harmful_advice`](/docs/evaluation/builtin/is-harmful-advice) | Detects whether the model gives advice that could be physically, emotionally, legally, or financially harmful. | +| [`is_refusal`](/docs/evaluation/builtin/is-refusal) | Detects if LLM output is a refusal to answer using common refusal pattern matching. Returns True if refusal detected. | +| [`no_age_bias`](/docs/evaluation/builtin/no-age-bias) | Evaluates if the content is free from stereotypes, discrimination, or assumptions based on age. | +| [`no_gender_bias`](/docs/evaluation/builtin/no-gender-bias) | Checks that the output content does not reinforce gender stereotypes or exhibit bias based on gender identity. | +| [`no_harmful_therapeutic_guidance`](/docs/evaluation/builtin/no-harmful-therapeutic-guidance) | Ensures that the model does not provide potentially harmful psychological or therapeutic advice. | +| [`no_llm_reference`](/docs/evaluation/builtin/no-llm-reference) | Ensures that the model response does not mention being an OpenAI model or reference its training data or providers. | +| [`no_racial_bias`](/docs/evaluation/builtin/no-racial-bias) | Ensures that the output content does not contain or imply racial bias, stereotypes, or preferential treatment. 
| +| [`pii`](/docs/evaluation/builtin/pii) | Detects personally identifiable information (PII) in content | +| [`prompt_injection`](/docs/evaluation/builtin/prompt-injection) | Evaluates content for potential prompt injection attempts | +| [`regex_pii_detection`](/docs/evaluation/builtin/regex-pii-detection) | Detects Personally Identifiable Information (PII) using regex patterns. Scans for SSN, credit card numbers, phone | +| [`sexist`](/docs/evaluation/builtin/sexist) | Detects sexist content and gender bias in the output | +| [`toxicity`](/docs/evaluation/builtin/toxicity) | Evaluates content for toxic or harmful language | + +--- + +## Next steps + +- [Run evals on traces](/docs/observe/features/evals) +- [Toxicity (eval template)](/docs/evaluation/builtin/toxicity) +- [PII detection (eval template)](/docs/evaluation/builtin/pii) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. diff --git a/src/pages/docs/evaluation/builtin/categories/text.mdx b/src/pages/docs/evaluation/builtin/categories/text.mdx new file mode 100644 index 00000000..66517fbf --- /dev/null +++ b/src/pages/docs/evaluation/builtin/categories/text.mdx @@ -0,0 +1,105 @@ +--- +title: "Text quality & NLP metrics evaluators" +description: "Compare generated text against reference text or score it against quality criteria. Includes classical NLP metrics (BLEU, ROUGE, METEOR), similarity scores, statistical measures, and readability checks." +--- + +## About + +Compare generated text against reference text or score it against quality criteria. Includes classical NLP metrics (BLEU, ROUGE, METEOR), similarity scores, statistical measures, and readability checks. + +This category contains 68 eval templates. 
Several templates also appear in other categories; for example, an evaluator tagged for both RAG and hallucination detection appears on both of those category pages. + +--- + +## When to use + +- You have ground-truth references and want to measure overlap (BLEU, ROUGE, METEOR, CHRF). +- You need similarity metrics for paraphrase or near-duplicate detection. +- You want statistical accuracy/precision/recall metrics for classification tasks. +- You need numeric or string similarity for structured outputs. + +--- + +## Evaluators + +| Template | What it checks | +|---|---| +| [`accuracy`](/docs/evaluation/builtin/accuracy) | Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or | +| [`ASR/STT_accuracy`](/docs/evaluation/builtin/audio-transcription) | Analyzes the accuracy of transcriptions generated from audio inputs by Automatic Speech Recognition (ASR) or | +| [`audio_quality`](/docs/evaluation/builtin/audio-quality) | Evaluates the overall quality of the given audio, like MOS (Mean Opinion Score) evaluation | +| [`balanced_accuracy`](/docs/evaluation/builtin/balanced-accuracy) | Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy. | +| [`bleu_score`](/docs/evaluation/builtin/bleu) | Computes a BLEU score between the expected gold answer and the model output. | +| [`character_error_rate`](/docs/evaluation/builtin/character-error-rate) | Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between | +| [`chrf_score`](/docs/evaluation/builtin/chrf-score) | Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short | +| [`clip_score`](/docs/evaluation/builtin/clip-score) | Computes the CLIP Score between images and text prompts. 
CLIP Score measures how well images match their text | +| [`code_bleu`](/docs/evaluation/builtin/code-bleu) | Computes CodeBLEU - a code-aware BLEU variant that combines standard n-gram BLEU with keyword matching for code-specific | +| [`code_complexity`](/docs/evaluation/builtin/code-complexity) | Computes cyclomatic complexity of Python code using AST analysis. Counts decision points (if, for, while, except, | +| [`cohen_kappa`](/docs/evaluation/builtin/cohen-kappa) | Computes Cohen's Kappa coefficient for inter-rater agreement. Accounts for agreement occurring by chance. Range | +| [`completeness`](/docs/evaluation/builtin/completeness) | Evaluates if the response completely answers the query | +| [`distinct_n`](/docs/evaluation/builtin/distinct-n) | Computes Distinct-N: ratio of unique n-grams to total n-grams. Measures vocabulary diversity in generated text. Higher = more diverse. | +| [`embedding_similarity`](/docs/evaluation/builtin/embedding-similarity) | Measures the cosine semantic similarity between the generated text and the reference text. | +| [`f1_score`](/docs/evaluation/builtin/f1-score) | Computes token-level F1 score between output and expected text. Treats both texts as bags of tokens and calculates | +| [`f_beta_score`](/docs/evaluation/builtin/f-beta-score) | Computes F-beta score with configurable beta for precision/recall weighting. `beta < 1` favors precision, `beta > 1` favors recall. | +| [`fid_score`](/docs/evaluation/builtin/fid-score) | Computes the Frechet Inception Distance (FID) between two sets of images. FID measures the similarity between | +| [`fleiss_kappa`](/docs/evaluation/builtin/fleiss-kappa) | Computes Fleiss' Kappa for multi-rater agreement. Extends Cohen's Kappa to N raters. Input: matrix where rows=subjects,… | +| [`fuzzy_match`](/docs/evaluation/builtin/fuzzy-match) | Computes fuzzy string matching score using SequenceMatcher (difflib). 
Returns a similarity ratio between 0 and 1 | +| [`gleu_score`](/docs/evaluation/builtin/gleu-score) | Computes Google BLEU (GLEU) score. A sentence-level BLEU variant that takes the minimum of precision and recall | +| [`ground_truth_match`](/docs/evaluation/builtin/ground-truth-match) | Evaluates whether the model-generated output matches the provided ground-truth expected output. | +| [`hamming_similarity`](/docs/evaluation/builtin/hamming-similarity) | Computes Hamming similarity between two strings. Counts matching character positions normalized by the longer string | +| [`is_concise`](/docs/evaluation/builtin/is-concise) | Measures whether the answer is brief and to the point, avoiding redundancy. | +| [`is_good_summary`](/docs/evaluation/builtin/is-good-summary) | Evaluates if a summary is clear, well-structured, and includes the most important points from the source material. | +| [`is_helpful`](/docs/evaluation/builtin/is-helpful) | Evaluates whether the response answers the user's question effectively. | +| [`is_informal_tone`](/docs/evaluation/builtin/is-informal-tone) | Detects whether the tone is informal or casual (e.g., use of slang, contractions, emoji). | +| [`is_polite`](/docs/evaluation/builtin/is-polite) | Ensures that the output maintains a respectful, kind, and non-aggressive tone. | +| [`jaccard_similarity`](/docs/evaluation/builtin/jaccard-similarity) | Computes Jaccard similarity (intersection over union) between token sets of two texts. Useful for measuring set-level | +| [`jaro_winkler_similarity`](/docs/evaluation/builtin/jaro-winkler-similarity) | Computes Jaro-Winkler similarity between two strings. 
Particularly effective for short strings like names, labels, | +| [`levenshtein_similarity`](/docs/evaluation/builtin/lavenshtein-similarity) | Measures the number of edits (insertions, deletions, or substitutions) to transform generated text to reference | +| [`log_loss`](/docs/evaluation/builtin/log-loss) | Computes log loss (cross-entropy) for probability predictions. Returns 1/(1+loss) as score. Lower loss = higher score. | +| [`match_error_rate`](/docs/evaluation/builtin/match-error-rate) | Computes Match Error Rate (MER) for speech recognition. MER = edits / (hits + edits). Returns 1-MER as score. | +| [`matthews_correlation`](/docs/evaluation/builtin/matthews-correlation) | Computes Matthews Correlation Coefficient (MCC). A balanced metric for binary and multiclass classification that | +| [`mean_average_precision`](/docs/evaluation/builtin/mean-average-precision) | Computes Mean Average Precision (MAP) for information retrieval. Averages precision at each relevant item across queries. | +| [`meteor_score`](/docs/evaluation/builtin/meteor-score) | Computes METEOR score between reference and hypothesis. Uses unigram matching with exact and stem matching, penalizing | +| [`mrr`](/docs/evaluation/builtin/mrr) | 'MRR (Mean Reciprocal Rank): Measures how early the first relevant chunk appears in the ranked results.' | +| [`ndcg_at_k`](/docs/evaluation/builtin/ndcg-at-k) | 'NDCG@K (Normalized Discounted Cumulative Gain): Measures ranking quality at top K, giving more credit to relevant | +| [`no_apologies`](/docs/evaluation/builtin/no-apologies) | Checks if the model unnecessarily apologizes, e.g., 'I'm sorry, but…' | +| [`numeric_similarity`](/docs/evaluation/builtin/numeric-similarity) | Extracts numeric values from generated text and computes the normalized difference from the reference number. 
| +| [`ocr_evaluation`](/docs/evaluation/builtin/ocr-evaluation) | Evaluates the quality of the given OCR output | +| [`pearson_correlation`](/docs/evaluation/builtin/pearson-correlation) | Computes Pearson correlation coefficient between two sets of numeric values. Measures linear relationship strength (-1 to 1, normalized… | +| [`precision_at_k`](/docs/evaluation/builtin/precision-at-k) | Precision@K: Out of the top K retrieved chunks, what fraction is actually relevant. | +| [`precision_score`](/docs/evaluation/builtin/precision-score) | Computes classification precision (TP / (TP + FP)) for binary or multiclass tasks. Measures how many positive predictions | +| [`prompt_instruction_adherence`](/docs/evaluation/builtin/prompt-adherence) | Evaluates whether the output follows the prompt’s instructions and required format. | +| [`psnr`](/docs/evaluation/builtin/psnr) | Computes Peak Signal-to-Noise Ratio (PSNR) between two images. Higher PSNR indicates more similar images. Returns | +| [`r2_score`](/docs/evaluation/builtin/r2-score) | Computes R-squared (coefficient of determination). Measures proportion of variance explained by predictions. | +| [`readability_score`](/docs/evaluation/builtin/readability-score) | Computes Flesch-Kincaid readability metrics. Returns a normalized score (0-1) based on Flesch Reading Ease. Higher | +| [`recall_at_k`](/docs/evaluation/builtin/recall-at-k) | Recall@K: Out of all truly relevant chunks, what fraction appears in the top K retrieved results. | +| [`repetition_rate`](/docs/evaluation/builtin/repetition-rate) | Measures repeated n-gram rate in text. Returns 1-rate as score (higher = less repetitive = better). Useful for detecting… | +| [`rmse`](/docs/evaluation/builtin/rmse) | Computes Root Mean Squared Error between predicted and actual values. Returns 1/(1+RMSE) as score (higher=better). | +| [`rouge_score`](/docs/evaluation/builtin/rouge) | Computes a ROUGE score between the expected gold answer and the model output. 
| +| [`semantic_list_contains`](/docs/evaluation/builtin/semantic-list-contains) | Checks if the generated response semantically contains one or more phrases from a reference list. | +| [`sentence_count`](/docs/evaluation/builtin/sentence-count) | Counts sentences in text and optionally validates against a min/max range. Useful for enforcing structural constraints | +| [`spearman_correlation`](/docs/evaluation/builtin/spearman-correlation) | Computes Spearman rank correlation coefficient. Measures monotonic relationship between two sets of values (-1 to 1, normalized to 0-1). | +| [`squad_score`](/docs/evaluation/builtin/squad-score) | Computes SQuAD-style evaluation: exact match + token F1 for QA tasks. Normalizes text (lowercase, remove articles/punctuation) before… | +| [`ssim`](/docs/evaluation/builtin/ssim) | Computes Structural Similarity Index (SSIM) between two images. Measures perceptual similarity based on luminance, | +| [`summary_quality`](/docs/evaluation/builtin/summary-quality) | Evaluates if a summary effectively captures the main points, maintains factual accuracy, and achieves appropriate | +| [`task_completion`](/docs/evaluation/builtin/task-completion) | Measures whether the model fulfilled the user's request accurately and completely. | +| [`text_to_sql`](/docs/evaluation/builtin/text-to-sql) | Evaluates if the model correctly converts natural language text into valid and accurate SQL queries. | +| [`tone`](/docs/evaluation/builtin/tone) | Analyzes the tone and sentiment of content | +| [`translation_accuracy`](/docs/evaluation/builtin/translation-accuracy) | Evaluates the quality of translation by checking semantic accuracy, cultural appropriateness, and preservation | +| [`translation_edit_rate`](/docs/evaluation/builtin/translation-edit-rate) | Computes Translation Edit Rate (TER). 
TER measures the minimum number of edits (insertions, deletions, substitutions) | +| [`TTS_accuracy`](/docs/evaluation/builtin/tts-accuracy) | Analyzes if the text-to-speech output accurately reflects the intended message, including pronunciation, emphasis, | +| [`type_token_ratio`](/docs/evaluation/builtin/type-token-ratio) | Computes Type-Token Ratio (TTR): unique tokens divided by total tokens. Measures lexical diversity. | +| [`word_count_in_range`](/docs/evaluation/builtin/word-count-in-range) | Checks if the word count of text falls within a specified range. Useful for enforcing length constraints on generated | +| [`word_error_rate`](/docs/evaluation/builtin/word-error-rate) | Computes Word Error Rate (WER) for ASR/STT evaluation. WER measures the edit distance at the word level between | +| [`word_info_lost`](/docs/evaluation/builtin/word-info-lost) | Computes Word Information Lost (WIL) for speech. WIL = 1 - (hits/ref * hits/hyp). Returns 1-WIL as score. | +| [`word_info_preserved`](/docs/evaluation/builtin/word-info-preserved) | Computes Word Information Preserved (WIP) for speech. WIP = (hits/ref) * (hits/hyp). Higher = better. | + +--- + +## Next steps + +- [Eval templates](/docs/evaluation/concepts/eval-templates) +- [BLEU (eval template)](/docs/evaluation/builtin/bleu) +- [ROUGE (eval template)](/docs/evaluation/builtin/rouge) +- [All evaluators](/docs/evaluation/builtin): full catalog index. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. 
diff --git a/src/pages/docs/evaluation/builtin/contain-evals.mdx b/src/pages/docs/evaluation/builtin/contain-evals.mdx deleted file mode 100644 index 2dce3713..00000000 --- a/src/pages/docs/evaluation/builtin/contain-evals.mdx +++ /dev/null @@ -1,444 +0,0 @@ ---- -title: "Contain: Keyword Presence and Pattern Validation Metric" -description: "Validates whether generated content contains specific keywords or patterns, checking presence or absence against desired criteria in text output." ---- - -Following evals help in assessing whether the text aligns with specific requirements, such as containing necessary information, adhering to expected formats, or avoiding unwanted terms: - -- [Contains](/docs/evaluation/builtin/contain-evals#1-contains) -- [Contains Any](/docs/evaluation/builtin/contain-evals#2-contains-any) -- [Contains All](/docs/evaluation/builtin/contain-evals#3-contains-all) -- [Contains None](/docs/evaluation/builtin/contain-evals#4-contains-none) -- [Starts With](/docs/evaluation/builtin/contain-evals#5-starts-with) -- [Ends With](/docs/evaluation/builtin/contain-evals#6-ends-with) -- [Equals](/docs/evaluation/builtin/contain-evals#7-equals) - ---- - -### **1. Contains** - -**Definition**: Evaluates whether the input text contains a specific keyword. This is useful for ensuring that essential terms are present in the text. - -**Evaluation Using Interface** - -**Input:** - -- **Required Inputs:** - - **text**: The content column to search within. -- **Configuration Parameters:** - - **keyword**: String - The text to search for in the input `text`. - - **case_sensitive**: Boolean (optional) - Whether the search should match case (defaults to `False` if omitted). - -**Output:** - -- **Score**: Passed or Failed - -**Interpretation:** - -- **Passed:** The specified `keyword` is present in the `text`. -- **Failed:** The specified `keyword` is not present in the `text`. 
- -**Evaluation using Python SDK** - -> Click [here](https://docs.futureagi.com/future-agi/get-started/evaluation/running-your-first-eval#using-python-sdk-sync) to learn how to setup evaluation using the Python SDK. -> - -| Input Type | Parameter | Type | Description | UI Component | -| --- | --- | --- | --- | --- | -| Required Inputs | `text` | `string` | The content column to search within. | Column Select | -| Configuration Parameters | `keyword` | `string` | The keyword to search for in the `text`. | Text Input | -| | `case_sensitive` | `bool` | Optional: Whether the keyword search should be case-sensitive. | Checkbox | - -```python -from fi.evals import Evaluator -from fi.testcases import TestCase -from fi.evals.templates import Contains - -evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - fi_base_url="" -) - -contains_eval = Contains(config={ - "keyword": "Hello", - "case_sensitive": True - } -) - -test_case = TestCase( - text="Hello world! How are you?" -) - -result = evaluator.evaluate(eval_templates=[contains_eval], inputs=[test_case]) -contains_text = result.eval_results[0].metrics[0].value - -``` - -**What to Do When Contains Evaluation Fails**: If the evaluation fails, consider revising the text to include the necessary keyword. Providing clearer instructions regarding required terms can help prevent this issue in future evaluations. - ---- - -### **2. Contains Any** - -**Definition**: Checks if the input text contains any of the specified keywords. This evaluation is useful for scenarios where the presence of at least one keyword is required. - -**Evaluation Using InterfaceInput:** - -- **Required Inputs:** - - **text**: The content column to search within. -- **Configuration Parameters:** - - **keywords**: List[String] - A list of possible strings to search for. (Enter as a comma-separated string in UI). 
- - **case_sensitive**: Boolean (optional) - Whether the search should match case (defaults to `False` if omitted). - -**Output:** - -- **Score**: Passed or Failed - -**Interpretation:** - -- **Passed:** At least one of the specified `keywords` is present in the `text`. -- **Failed:** None of the specified `keywords` are present in the `text`. - -**Evaluation using Python SDK** - -> Click [here](https://docs.futureagi.com/future-agi/get-started/evaluation/running-your-first-eval#using-python-sdk-sync) to learn how to setup evaluation using the Python SDK. -> - -| Input Type | Parameter | Type | Description | UI Component | -| --- | --- | --- | --- | --- | -| Required Inputs | `text` | `string` | The content column to search within. | Column Select | -| Configuration Parameters | `keywords` | `list[string]` | List of keywords to search for in the `text`. (Enter as a comma-separated string in UI). | Text Input | -| | `case_sensitive` | `bool` | Optional: Whether the keyword search should be case-sensitive. | Checkbox | - -```python -from fi.evals import Evaluator -from fi.testcases import TestCase -from fi.evals.templates import ContainsAny - -evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - fi_base_url="" -) - -contains_eval = ContainsAny(config={ - "keywords": ["Hello", "world"], - "case_sensitive": True - } -) - -test_case = TestCase( - text="Hello world! How are you?" -) - -result = evaluator.evaluate(eval_templates=[contains_eval], inputs=[test_case]) -contains_text = result.eval_results[0].metrics[0].value # 1.0 or 0.0 - -``` - -**What to Do When Contains Any Evaluation Fails**: If the evaluation fails, ensure that at least one of the required keywords is included in the text. Adjusting the content to meet this requirement can improve compliance in future evaluations. - ---- - -### **3. Contains All** - -**Definition**: Verifies that the input text contains all specified keywords. 
This evaluation is critical for ensuring comprehensive coverage of necessary terms. - -**Evaluation Using InterfaceInput:** - -- **Required Inputs:** - - **text**: The content column to search within. -- **Configuration Parameters:** - - **keywords**: List[String] - The list of keywords that must all be present. (Enter as a comma-separated string in UI). - - **case_sensitive**: Boolean (optional) - Whether the search should match case (defaults to `False` if omitted). - -**Output:** - -- **Score**: Passed or Failed - -**Interpretation:** - -- **Passed:** All of the specified `keywords` are present in the `text`. -- **Failed:** At least one of the specified `keywords` is missing from the `text`. - -**Evaluation using Python SDK** - -> Click [here](https://docs.futureagi.com/future-agi/get-started/evaluation/running-your-first-eval#using-python-sdk-sync) to learn how to setup evaluation using the Python SDK. -> - -| Input Type | Parameter | Type | Description | UI Component | -| --- | --- | --- | --- | --- | -| Required Inputs | `text` | `string` | The content column to search within. | Column Select | -| Configuration Parameters | `keywords` | `list[string]` | List of keywords that must all be present in the `text`. (Enter as a comma-separated string in UI). | Text Input | -| | `case_sensitive` | `bool` | Optional: Whether the keyword search should be case-sensitive. | Checkbox | - -```python -from fi.evals import Evaluator -from fi.testcases import TestCase -from fi.evals.templates import ContainsAll - -evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - fi_base_url="" -) - -contains_all_eval = ContainsAll(config={ - "keywords": ["hello", "world"], - "case_sensitive": False}) - -test_case = TestCase( - text="Hello world! How are you?" 
-) - -result = evaluator.evaluate(eval_templates=[contains_all_eval], inputs=[test_case]) -contains_all = result.eval_results[0].metrics[0].value - -``` - -**What to Do When Contains All Evaluation Fails**: If the evaluation fails, review the text to identify which keywords are missing. Revise the content to include all required keywords to meet the evaluation criteria. - ---- - -### **4. Contains None** - -**Definition**: Verifies that the input text contains none of the specified terms. This evaluation is important for filtering out unwanted or prohibited content. - -**Evaluation Using InterfaceInput:** - -- **Required Inputs:** - - **text**: The content column to search within. -- **Configuration Parameters:** - - **keywords**: List[String] - The list of keywords that should *not* be present. (Enter as a comma-separated string in UI). - - **case_sensitive**: Boolean (optional) - Whether the search should match case (defaults to `False` if omitted). - -**Output:** - -- **Score**: Passed or Failed - -**Interpretation:** - -- **Passed:** None of the specified forbidden `keywords` are present in the `text`. -- **Failed:** At least one of the specified forbidden `keywords` is present in the `text`. - -**Evaluation using Python SDK** - -> Click [here](https://docs.futureagi.com/future-agi/get-started/evaluation/running-your-first-eval#using-python-sdk-sync) to learn how to setup evaluation using the Python SDK. -> - -| Input Type | Parameter | Type | Description | UI Component | -| --- | --- | --- | --- | --- | -| Required Inputs | `text` | `string` | The content column to search within. | Column Select | -| Configuration Parameters | `keywords` | `list[string]` | List of keywords that should *not* be present in the `text`. (Enter as a comma-separated string in UI). | Text Input | -| | `case_sensitive` | `bool` | Optional: Whether the keyword search should be case-sensitive. 
| Checkbox | - -```python -from fi.evals import Evaluator -from fi.testcases import TestCase -from fi.evals.templates import ContainsNone - -evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - fi_base_url="" -) - -contains_none_eval = ContainsNone(config={ - "keywords": ["hello", "world"], - "case_sensitive": False}) - -test_case = TestCase( - text="This is a good and clean text" -) - -result = evaluator.evaluate(eval_templates=[contains_none_eval], inputs=[test_case]) -contains_none = result.eval_results[0].metrics[0].value - -``` - -**What to Do When Contains None Evaluation Fails**: If the evaluation fails, identify which unwanted terms are present in the text. Revise the content to remove these terms to ensure compliance with the evaluation criteria. - ---- - -### **5. Starts With** - -**Definition**: Checks if the input text begins with a specific substring. This evaluation is useful for ensuring that text adheres to expected formats or structures. - -**Evaluation Using InterfaceInput:** - -- **Required Inputs:** - - **text**: The content column to check. -- **Configuration Parameters:** - - **substring**: String - The required starting text (prefix). - - **case_sensitive**: Boolean (optional) - Whether the comparison should match case (defaults to `False` if omitted). - -**Output:** - -- **Score**: Passed or Failed - -**Interpretation:** - -- **Passed:** The `text` begins with the specified `substring`. -- **Failed:** The `text` does not begin with the specified `substring`. - -**Evaluation using Python SDK** - -> Click [here](https://docs.futureagi.com/future-agi/get-started/evaluation/running-your-first-eval#using-python-sdk-sync) to learn how to setup evaluation using the Python SDK. -> - -| Input Type | Parameter | Type | Description | UI Component | -| --- | --- | --- | --- | --- | -| Required Inputs | `text` | `string` | The content column to check. 
| Column Select | -| Configuration Parameters | `substring` | `string` | The substring to check for at the start of the `text`. | Text Input | -| | `case_sensitive` | `bool` | Optional: Whether the comparison should be case-sensitive. | Checkbox | - -```python -from fi.evals import Evaluator -from fi.testcases import TestCase -from fi.evals.templates import StartsWith - -evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - fi_base_url="" -) - -starts_with_eval = StartsWith(config={ - "substring": "Dear", - "case_sensitive": True}) - -test_case = TestCase( - text="Dear Sir/Madam," -) - -result = evaluator.evaluate(eval_templates=[starts_with_eval], inputs=[test_case]) -starts_with = result.eval_results[0].metrics[0].value # 1.0 or 0.0 - -``` - -**What to Do When Starts With Evaluation Fails**: If the evaluation fails, consider revising the text to ensure it begins with the required substring. Providing clearer formatting guidelines can help prevent this issue in future evaluations. - ---- - -### **6. Ends With** - -**Definition**: Checks if the input text ends with a specific substring. This evaluation is important for validating the conclusion of the text. - -**Evaluation Using InterfaceInput:** - -- **Required Inputs:** - - **text**: The content column to check. -- **Configuration Parameters:** - - **substring**: String - The required ending text (suffix). - - **case_sensitive**: Boolean (optional) - Whether the comparison should match case (defaults to `False` if omitted). - -**Output:** - -- **Score**: Passed or Failed - -**Interpretation:** - -- **Passed:** The `text` ends with the specified `substring`. -- **Failed:** The `text` does not end with the specified `substring`. - -**Evaluation using Python SDK** - -> Click [here](https://docs.futureagi.com/future-agi/get-started/evaluation/running-your-first-eval#using-python-sdk-sync) to learn how to setup evaluation using the Python SDK. 
-> - -| Input Type | Parameter | Type | Description | UI Component | -| --- | --- | --- | --- | --- | -| Required Inputs | `text` | `string` | The content column to check. | Column Select | -| Configuration Parameters | `substring` | `string` | The substring to check for at the end of the `text`. | Text Input | -| | `case_sensitive` | `bool` | Optional: Whether the comparison should be case-sensitive. | Checkbox | - -```python -from fi.evals import Evaluator -from fi.testcases import TestCase -from fi.evals.templates import EndsWith - -evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - fi_base_url="" -) - -starts_with_eval = EndsWith(config={ - "substring": "you", - "case_sensitive": True}) - -test_case = TestCase( - text="thank you" -) - -result = evaluator.evaluate(eval_templates=[starts_with_eval], inputs=[test_case]) -ends_with = result.eval_results[0].metrics[0].value - -``` - -**What to Do When Ends With Evaluation Fails**: If the evaluation fails, revise the text to ensure it concludes with the required substring. Clear guidelines on expected endings can help improve compliance in future evaluations. - ---- - -### **7. Equals** - -**Definition**: Compares if the input text is exactly equal to a specified expected text. This evaluation is crucial for scenarios where precise matching is required. - -**Evaluation Using Interface** - -**Input:** - -- **Required Inputs:** - - **text**: The content column to check. - - **expected_text**: The column containing the exact string to match against. -- **Configuration Parameters:** - - **case_sensitive**: Boolean (optional) - Whether the comparison should match case (defaults to `False` if omitted). - -**Output:** - -- **Score**: Passed or Failed - -**Interpretation:** - -- **Passed:** The `text` is identical to the `expected_text` (considering case sensitivity). -- **Failed:** The `text` differs from the `expected_text` (considering case sensitivity). 
- -**Evaluation using Python SDK** - -> Click [here](https://docs.futureagi.com/future-agi/get-started/evaluation/running-your-first-eval#using-python-sdk-sync) to learn how to setup evaluation using the Python SDK. -> - -| Input Type | Parameter | Type | Description | UI Component | -| --- | --- | --- | --- | --- | -| Required Inputs | `text` | `string` | The content column to check. | Column Select | -| | `expected_text` | `string` | The column containing the exact string to match against. | Column Select | -| Configuration Parameters | `case_sensitive` | `bool` | Optional: Whether the comparison should be case-sensitive. | Checkbox | - -```python -from fi.evals import Evaluator -from fi.testcases import TestCase -from fi.evals.templates import Equals - -evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - fi_base_url="" -) - -equals_eval = Equals(config={"case_sensitive": False}) - -test_case = TestCase( - text="Hello, World!", - expected_text="Hello" -) - -result = evaluator.evaluate(eval_templates=[equals_eval], inputs=[test_case]) -is_equal = result.eval_results[0].metrics[0].value - -``` - -**What to Do When Equals Evaluation Fails**: -If the evaluation fails, review the text for discrepancies. Adjusting the content to match the expected text precisely can help meet the evaluation criteria. - ---- \ No newline at end of file diff --git a/src/pages/docs/evaluation/builtin/contains-all.mdx b/src/pages/docs/evaluation/builtin/contains-all.mdx deleted file mode 100644 index 9fee1d53..00000000 --- a/src/pages/docs/evaluation/builtin/contains-all.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Contains All: Built-in Evaluation" -description: "Verifies text contains all specified keywords" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="contains_all", - inputs={ - "text": "Hello, this is a sample text." 
- }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "contains_all", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/contains-any.mdx b/src/pages/docs/evaluation/builtin/contains-any.mdx deleted file mode 100644 index ecd4e178..00000000 --- a/src/pages/docs/evaluation/builtin/contains-any.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Contains Any: Built-in Evaluation" -description: "Checks if the text contains any of the specified keywords" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="contains_any", - inputs={ - "text": "Hello, this is a sample text." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "contains_any", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. 
| - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/contains-none.mdx b/src/pages/docs/evaluation/builtin/contains-none.mdx deleted file mode 100644 index af84a649..00000000 --- a/src/pages/docs/evaluation/builtin/contains-none.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Contains None: Built-in Evaluation" -description: "Verifies text contains none of specified terms" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="contains_none", - inputs={ - "text": "Hello, this is a sample text." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "contains_none", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/content-moderation.mdx b/src/pages/docs/evaluation/builtin/content-moderation.mdx deleted file mode 100644 index 3a12b1ac..00000000 --- a/src/pages/docs/evaluation/builtin/content-moderation.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Content Moderation: Built-in Evaluation" -description: "Uses content moderation to evaluate content safety" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="content_moderation", - inputs={ - "output": "The capital of France is Paris." - }, - model_name="turing_flash" -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "content_moderation", - { - output: "The capital of France is Paris." - }, - { - modelName: "turing_flash", - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `output` | `string` | The output to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Red Teaming`, `Safety`, `Harmful Objects` diff --git a/src/pages/docs/evaluation/builtin/content-safety-violation.mdx b/src/pages/docs/evaluation/builtin/content-safety-violation.mdx deleted file mode 100644 index 5333a31a..00000000 --- a/src/pages/docs/evaluation/builtin/content-safety-violation.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Content Safety Violation: Built-in Evaluation" -description: "A broad check for content that violates safety or usage policies—this includes toxicity, hate speech, explicit content, violence, etc" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="content_safety_violation", - inputs={ - "output": "The capital of France is Paris." - }, - model_name="turing_flash" -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "content_safety_violation", - { - output: "The capital of France is Paris." - }, - { - modelName: "turing_flash", - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `output` | `string` | The output to be evaluated for content moderation. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Red Teaming`, `Safety`, `Harmful Objects` diff --git a/src/pages/docs/evaluation/builtin/context-adherence.mdx b/src/pages/docs/evaluation/builtin/context-adherence.mdx index 52b07e04..c662d723 100644 --- a/src/pages/docs/evaluation/builtin/context-adherence.mdx +++ b/src/pages/docs/evaluation/builtin/context-adherence.mdx @@ -64,6 +64,6 @@ To improve adherence, implement stricter context binding, integrate fact-checkin ### Comparing Context Adherence with Similar Evals 1. [Context Relevance](/docs/evaluation/builtin/context-relevance): While Context Adherence focuses on staying within context bounds, Context Relevance evaluates if the provided context is sufficient and appropriate for the query. -2. [Prompt/Instruction Adherence](/docs/evaluation/builtin/instruction-adherence): Context Adherence measures factual consistency with context, while Prompt Adherence evaluates following instructions and format requirements. +2. [Prompt/Instruction Adherence](/docs/evaluation/builtin/prompt-adherence): Context Adherence measures factual consistency with context, while Prompt Adherence evaluates following instructions and format requirements. 
--- \ No newline at end of file diff --git a/src/pages/docs/evaluation/builtin/custom-code-evaluation.mdx b/src/pages/docs/evaluation/builtin/custom-code-evaluation.mdx deleted file mode 100644 index 1ab9b86c..00000000 --- a/src/pages/docs/evaluation/builtin/custom-code-evaluation.mdx +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: "Custom Code Evaluation: Built-in Evaluation" -description: "Executes custom Python code for evaluation" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="custom_code_evaluation", - inputs={ - - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "custom_code_evaluation", - { - - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Code`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/customer-agent-prompt-conformance.mdx b/src/pages/docs/evaluation/builtin/customer-agent-prompt-conformance.mdx index 838991dd..c93cc39f 100644 --- a/src/pages/docs/evaluation/builtin/customer-agent-prompt-conformance.mdx +++ b/src/pages/docs/evaluation/builtin/customer-agent-prompt-conformance.mdx @@ -61,5 +61,5 @@ console.log(result); ### Comparing Prompt Conformance with Similar Evals -- [**Instruction Adherence**](/docs/evaluation/builtin/instruction-adherence): Prompt Conformance evaluates alignment with a system-level persona and constraints across a conversation, while Instruction Adherence evaluates whether a single response follows the user's input instructions. +- [**Instruction Adherence**](/docs/evaluation/builtin/prompt-adherence): Prompt Conformance evaluates alignment with a system-level persona and constraints across a conversation, while Instruction Adherence evaluates whether a single response follows the user's input instructions. - [**Customer Agent: Conversation Quality**](/docs/evaluation/builtin/customer-agent-conversation-quality): Prompt Conformance checks rule compliance, while Conversation Quality evaluates the overall user experience of the interaction. diff --git a/src/pages/docs/evaluation/builtin/detect-hallucination.mdx b/src/pages/docs/evaluation/builtin/detect-hallucination.mdx index 38e53bef..5909d66a 100644 --- a/src/pages/docs/evaluation/builtin/detect-hallucination.mdx +++ b/src/pages/docs/evaluation/builtin/detect-hallucination.mdx @@ -67,6 +67,6 @@ If the content is evaluated as containing hallucinations (Failed) and you want t ### Comparing Detect Hallucination with Similar Evals -- [**Instruction Adherence**](/docs/evaluation/builtin/instruction-adherence): Detect Hallucination checks for fabricated information not present in the source, while Instruction Adherence evaluates whether the output follows the instructions provided. 
+- [**Instruction Adherence**](/docs/evaluation/builtin/prompt-adherence): Detect Hallucination checks for fabricated information not present in the source, while Instruction Adherence evaluates whether the output follows the instructions provided. - [**Groundedness**](/docs/evaluation/builtin/groundedness): Detect Hallucination focuses on absence of fabricated content, while Groundedness measures how well the output is supported by the source material. - [**Context Adherence**](/docs/evaluation/builtin/context-adherence): Detect Hallucination identifies made-up information, while Context Adherence evaluates how well the output adheres to the given context. \ No newline at end of file diff --git a/src/pages/docs/evaluation/builtin/deterministic-evals.mdx b/src/pages/docs/evaluation/builtin/deterministic-evals.mdx deleted file mode 100644 index 4a508cd0..00000000 --- a/src/pages/docs/evaluation/builtin/deterministic-evals.mdx +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Deterministic Evals: Built-in Evaluation" -description: "Evaluates if the output is deterministic or not" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="deterministic_evals", - inputs={ - - }, - model_name="turing_flash" -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "deterministic_evals", - { - - }, - { - modelName: "turing_flash", - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns one of the predefined categorical labels per row, plus a reason. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/ends-with.mdx b/src/pages/docs/evaluation/builtin/ends-with.mdx deleted file mode 100644 index fadfa4ad..00000000 --- a/src/pages/docs/evaluation/builtin/ends-with.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Ends With: Built-in Evaluation" -description: "Checks if text ends with specific substring" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="ends_with", - inputs={ - "text": "Hello, this is a sample text." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "ends_with", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/equals.mdx b/src/pages/docs/evaluation/builtin/equals.mdx deleted file mode 100644 index d5165238..00000000 --- a/src/pages/docs/evaluation/builtin/equals.mdx +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Equals: Built-in Evaluation" -description: "Compares if two texts are exactly equal" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="equals", - inputs={ - "text": "Hello, this is a sample text.", - "expected_text": "Paris" - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "equals", - { - text: "Hello, this is a sample text.", - expected_text: "Paris" - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be compared. | -| | `expected_text` | `string` | The text to compare against. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/factual-accuracy.mdx b/src/pages/docs/evaluation/builtin/factual-accuracy.mdx deleted file mode 100644 index 729e8989..00000000 --- a/src/pages/docs/evaluation/builtin/factual-accuracy.mdx +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Factual Accuracy: Built-in Evaluation" -description: "Verifies if the provided output is factually correct or not" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="factual_accuracy", - inputs={ - "input": "What is the capital of France?", - "output": "The capital of France is Paris.", - "context": "Paris is the capital and most populous city of France." - }, - model_name="turing_flash" -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "factual_accuracy", - { - input: "What is the capital of France?", - output: "The capital of France is Paris.", - context: "Paris is the capital and most populous city of France." - }, - { - modelName: "turing_flash", - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `input` | `string` | The input to be evaluated. | -| | `output` | `string` | The output to be evaluated. | -| | `context` | `string` | The context provided for the response. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Hallucination`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/index.mdx b/src/pages/docs/evaluation/builtin/index.mdx index 0c338c44..c256739f 100644 --- a/src/pages/docs/evaluation/builtin/index.mdx +++ b/src/pages/docs/evaluation/builtin/index.mdx @@ -1,163 +1,201 @@ --- -title: "Future AGI Built-in Eval Templates Reference" -description: "Complete reference for all built-in evaluation templates available on the Future AGI platform, with quick access to metrics by name." +title: "Evaluator catalog" +description: "Browse FutureAGI's 130+ built-in eval templates by category, or search the full A-Z list. Pick a template, map your data, run." --- -**Built-in evals** are pre-configured evaluation templates you can attach to dataset runs, prompt runs, and simulations. Pick the evals you need, add them to your run, and the platform scores results automatically. +## About + +Built-in evals are pre-configured eval templates you can attach to a dataset, a trace project, or a simulation, or call from the SDK. Pick the evals you need, map your data to their required inputs, and FutureAGI scores results automatically. + +For everything in this catalog, the verdict format depends on the [output type](/docs/evaluation/concepts/output-types) (Pass/fail, Scoring, or Choices), and the run-time settings (judge model, context, threshold) are overridable per-application. See [Eval templates](/docs/evaluation/concepts/eval-templates) for the full template model. + +--- + +## Browse by category + + + + Groundedness, context adherence, chunk attribution, hit rate, MRR, NDCG, Precision@K, and other retrieval metrics. + + + Multi-turn agent quality, tool call accuracy, conversation coherence, customer support evaluators. + + + Toxicity, PII detection, prompt injection, bias detection, compliance, refusal handling. + + + BLEU, ROUGE, METEOR, F1, similarity scores, readability, statistical metrics. 
+ + + JSON/XML/HTML/SQL validation, regex, contains, length, exact match, structural checks. + + + Code generation quality (CodeBLEU), syntax validation, tool call accuracy, text-to-SQL. + + + Image captioning, CLIP score, FID, SSIM, OCR evaluation, AI-generated image detection. + + + ASR/STT accuracy, TTS quality, word error rate, audio quality, voice agent evaluators. + + --- +## Built-in vs custom + +| | Built-in | Custom | +|---|---|---| +| Authored by | FutureAGI | You or your team | +| Edit the template | No (you can duplicate and edit the copy) | Yes | +| Available in | Every workspace | Your workspace | +| Coverage | 130+ templates across the categories above | Anything you can express as instructions, code, or an agent rubric | + +To author your own template, see [Create custom evals](/docs/evaluation/build/custom). + +--- + +## Built-in evaluators + +Every built-in template in one searchable table, with its required inputs and evaluation method. Use Cmd+F / Ctrl+F to find one by name. + | Eval | Description | Required Inputs | Use Cases | Evaluation Method | |------|-------------|-----------------|-----------|-------------------| -| [**Conversation Coherence**](/docs/evaluation/builtin/conversation-coherence) | Evaluates if a conversation flows logically and maintains context throughout. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Conversation Resolution**](/docs/evaluation/builtin/conversation-resolution) | Checks if the conversation reaches a satisfactory conclusion. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Context Adherence**](/docs/evaluation/builtin/context-adherence) | Measures how well responses stay within the provided context. | `output`, `context` | Text, Audio, Image, Chat, RAG & Retrieval, Hallucination | LLM as Judge | -| [**Context Relevance**](/docs/evaluation/builtin/context-relevance) | Evaluates the relevancy of the context to the user query. 
| `input`, `context` | Text, Audio, Image, Chat, RAG & Retrieval | LLM as Judge | -| [**Completeness**](/docs/evaluation/builtin/completeness) | Evaluates if the response completely answers the query. | `input`, `output` | Text, Audio, Chat, RAG & Retrieval | LLM as Judge | +| [**Accuracy**](/docs/evaluation/builtin/accuracy) | Computes classification accuracy by comparing predicted labels against expected labels. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Answer Refusal**](/docs/evaluation/builtin/answer-refusal) | Checks if the model correctly refuses harmful or restricted queries. | `input`, `output` | Text, Audio, Chat, Safety | LLM as Judge | +| [**Audio Quality**](/docs/evaluation/builtin/audio-quality) | Evaluates the quality of audio (clarity, noise, distortion). | `audio` | Audio | LLM as Judge | +| [**Audio Transcription (ASR/STT)**](/docs/evaluation/builtin/audio-transcription) | Checks accuracy of a speech-to-text transcription against the audio source. | `audio`, `transcription` | Audio | LLM as Judge | +| [**Balanced Accuracy**](/docs/evaluation/builtin/balanced-accuracy) | Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Bias Detection**](/docs/evaluation/builtin/bias-detection) | Identifies gender, racial, cultural, or ideological bias in output. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**BLEU Score**](/docs/evaluation/builtin/bleu) | Computes BLEU score between expected answer and model output. | `output`, `expected_response` | Text | Statistical Metric | +| [**Caption Hallucination**](/docs/evaluation/builtin/caption-hallucination) | Detects hallucinated or fabricated details in image captions. 
| `instruction`, `output` | Image, RAG & Retrieval, Hallucination | LLM as Judge | +| [**Character Error Rate**](/docs/evaluation/builtin/character-error-rate) | Computes Character Error Rate (CER) for ASR/OCR evaluation. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**ChrF Score**](/docs/evaluation/builtin/chrf-score) | Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short texts. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | | [**Chunk Attribution**](/docs/evaluation/builtin/chunk-attribution) | Tracks if the context chunk is used in generating the response. | `output`, `context` | RAG & Retrieval | LLM as Judge | | [**Chunk Utilization**](/docs/evaluation/builtin/chunk-utilization) | Measures how effectively context chunks are used in responses. | `output`, `context` | RAG & Retrieval | LLM as Judge | -| [**PII Detection**](/docs/evaluation/builtin/pii) | Detects personally identifiable information (PII) in text. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**Toxicity**](/docs/evaluation/builtin/toxicity) | Evaluates content for toxic or harmful language. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**Tone**](/docs/evaluation/builtin/tone) | Analyzes the tone and sentiment of content. | `output` | Text, Audio, Chat, Safety | LLM as Judge | -| [**Sexist**](/docs/evaluation/builtin/sexist) | Detects sexist content and gender bias. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**Prompt Injection**](/docs/evaluation/builtin/prompt-injection) | Evaluates text for potential prompt injection attempts. | `input`, `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**Instruction Adherence**](/docs/evaluation/builtin/instruction-adherence) | Assesses how closely the output follows prompt instructions. 
| `input`, `output` | Text, Audio, Chat, Hallucination | LLM as Judge | -| [**Data Privacy Compliance**](/docs/evaluation/builtin/data-privacy) | Checks output for GDPR, HIPAA, and other privacy regulation compliance. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**Groundedness**](/docs/evaluation/builtin/groundedness) | Ensures response strictly adheres to the provided context without external information. | `output`, `context` | Text, Audio, Chat, RAG & Retrieval, Hallucination | LLM as Judge | -| [**Summary Quality**](/docs/evaluation/builtin/summary-quality) | Evaluates if a summary captures main points and achieves appropriate length. | `input`, `output` | Text, Audio, Image, RAG & Retrieval | LLM as Judge | -| [**Translation Accuracy**](/docs/evaluation/builtin/translation-accuracy) | Evaluates translation quality, accuracy, and cultural appropriateness. | `output`, `expected_response` | Text, Audio, RAG & Retrieval | LLM as Judge | -| [**Cultural Sensitivity**](/docs/evaluation/builtin/cultural-sensitivity) | Analyzes output for cultural appropriateness and inclusive language. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**Bias Detection**](/docs/evaluation/builtin/bias-detection) | Identifies gender, racial, cultural, or ideological bias in output. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**Audio Transcription (ASR/STT)**](/docs/evaluation/builtin/audio-transcription) | Checks accuracy of a speech-to-text transcription against the audio source. | `audio`, `transcription` | Audio | LLM as Judge | -| [**Audio Quality**](/docs/evaluation/builtin/audio-quality) | Evaluates the quality of audio (clarity, noise, distortion). | `audio` | Audio | LLM as Judge | -| [**No Racial Bias**](/docs/evaluation/builtin/no-racial-bias) | Ensures output does not contain or imply racial bias. 
| `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**No Gender Bias**](/docs/evaluation/builtin/no-gender-bias) | Checks the response does not reinforce gender stereotypes. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**No Age Bias**](/docs/evaluation/builtin/no-age-bias) | Evaluates if content is free from age-based stereotypes. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | -| [**No LLM Reference**](/docs/evaluation/builtin/no-llm-reference) | Ensures output does not reference being an LLM or OpenAI model. | `output` | Text, Audio, Chat, Safety | LLM as Judge | -| [**No Apologies**](/docs/evaluation/builtin/no-apologies) | Checks if the model unnecessarily apologizes. | `output` | Text, Audio, Chat | LLM as Judge | -| [**Is Polite**](/docs/evaluation/builtin/is-polite) | Ensures output maintains a respectful and non-aggressive tone. | `output` | Text, Audio, Chat | LLM as Judge | -| [**Is Concise**](/docs/evaluation/builtin/is-concise) | Measures whether the answer is brief and avoids redundancy. | `output` | Text, Audio, Chat | LLM as Judge | -| [**Is Helpful**](/docs/evaluation/builtin/is-helpful) | Evaluates whether the response answers the user's question effectively. | `input`, `output` | Text, Audio, Chat | LLM as Judge | -| [**Contains Code**](/docs/evaluation/builtin/is-code) | Checks whether the output is valid code or contains expected code snippets. | `output` | Text | LLM as Judge | -| [**Fuzzy Match**](/docs/evaluation/builtin/fuzzy-match) | Compares output with expected answer using approximate matching. | `output`, `expected` | Text, Audio, RAG & Retrieval | Statistical Metric | -| [**Answer Refusal**](/docs/evaluation/builtin/answer-refusal) | Checks if the model correctly refuses harmful or restricted queries. 
| `input`, `output` | Text, Audio, Chat, Safety | LLM as Judge | -| [**Detect Hallucination**](/docs/evaluation/builtin/detect-hallucination) | Identifies fabricated facts not present in the input or reference. | `input`, `output` | Text, Audio, Image, Chat, RAG & Retrieval, Hallucination | LLM as Judge | -| [**No Harmful Therapeutic Guidance**](/docs/evaluation/builtin/no-harmful-therapeutic-guidance) | Ensures the model does not provide potentially harmful psychological advice. | `output` | Text, Audio, Chat, Safety | LLM as Judge | | [**Clinically Inappropriate Tone**](/docs/evaluation/builtin/clinically-inappropriate-tone) | Evaluates whether tone is unsuitable for clinical or mental health contexts. | `output` | Text, Audio, Chat, Safety | LLM as Judge | -| [**Is Harmful Advice**](/docs/evaluation/builtin/is-harmful-advice) | Detects advice that could be physically, emotionally, legally, or financially harmful. | `output` | Text, Audio, Chat, Safety | LLM as Judge | -| [**Is Good Summary**](/docs/evaluation/builtin/is-good-summary) | Evaluates if a summary is clear, well-structured, and captures key points. | `input`, `output` | Text, Audio, RAG & Retrieval | LLM as Judge | -| [**Is Informal Tone**](/docs/evaluation/builtin/is-informal-tone) | Detects whether the tone is casual (slang, contractions, emoji). | `output` | Text, Audio, Chat | LLM as Judge | -| [**Evaluate Function Calling**](/docs/evaluation/builtin/llm-function-calling) | Assesses accuracy and effectiveness of LLM function calls. | `output` | Text | LLM as Judge | -| [**Task Completion**](/docs/evaluation/builtin/task-completion) | Measures whether the model fulfilled the user's request accurately. | `input`, `output` | Text, Audio, Chat | LLM as Judge | -| [**Caption Hallucination**](/docs/evaluation/builtin/caption-hallucination) | Detects hallucinated or fabricated details in image captions. 
| `instruction`, `output` | Image, RAG & Retrieval, Hallucination | LLM as Judge | -| [**Text to SQL**](/docs/evaluation/builtin/text-to-sql) | Evaluates the quality and correctness of text-to-SQL generation. | `input`, `output` | Text | LLM as Judge | -| [**Synthetic Image Evaluator**](/docs/evaluation/builtin/synthetic-image-evaluator) | Evaluates synthetic or AI-generated images against criteria. | `image`, `instruction` | Image | LLM as Judge | -| [**OCR Evaluation**](/docs/evaluation/builtin/ocr-evaluation) | Evaluates the accuracy of optical character recognition (OCR) output. | `input_pdf`, `json_content` | Text, PDF / Document | LLM as Judge | -| [**Eval Ranking**](/docs/evaluation/builtin/eval-ranking) | Provides a ranking score for each context based on specified criteria. | `input`, `context` | RAG & Retrieval, Custom | LLM as Ranker | -| [**Is JSON**](/docs/evaluation/builtin/is-json) | Validates if content is proper JSON format. | `output` | Text | Deterministic / Rule-based | -| [**One Line**](/docs/evaluation/builtin/contain-evals) | Checks if the text is a single line. | `output` | Text | Deterministic / Rule-based | +| [**CLIP Score**](/docs/evaluation/builtin/clip-score) | Measures how well images match their text descriptions; higher scores indicate better image-text alignment (range: 0–100). | `images`, `text` | Image | Statistical Metric | +| [**CodeBLEU**](/docs/evaluation/builtin/code-bleu) | Computes CodeBLEU, a code-aware BLEU variant that combines standard n-gram BLEU with keyword matching for code-specific tokens. | `reference`, `hypothesis` | Code, NLP Metrics | Statistical Metric | +| [**Code Complexity**](/docs/evaluation/builtin/code-complexity) | Computes cyclomatic complexity of Python code using AST analysis. | `text` | Code, NLP Metrics | Statistical Metric | +| [**Cohen's Kappa**](/docs/evaluation/builtin/cohen-kappa) | Computes Cohen's Kappa coefficient for inter-rater agreement. Accounts for agreement occurring by chance. 
| `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Completeness**](/docs/evaluation/builtin/completeness) | Evaluates if the response completely answers the query. | `input`, `output` | Text, Audio, Chat, RAG & Retrieval | LLM as Judge | +| [**Contains Code**](/docs/evaluation/builtin/is-code) | Checks whether the output is valid code or contains expected code snippets. | `output` | Text | LLM as Judge | | [**Contains Valid Link**](/docs/evaluation/builtin/contains-valid-link) | Checks for presence of valid URLs in the output. | `output` | Text | Deterministic / Rule-based | -| [**Is Email**](/docs/evaluation/builtin/is-email) | Validates email address format. | `output` | Text | Deterministic / Rule-based | -| [**No Invalid Links**](/docs/evaluation/builtin/no-invalid-links) | Checks if the text contains no invalid URLs. | `output` | Text | Deterministic / Rule-based | -| [**BLEU Score**](/docs/evaluation/builtin/bleu) | Computes BLEU score between expected answer and model output. | `output`, `expected_response` | Text | Statistical Metric | -| [**ROUGE Score**](/docs/evaluation/builtin/rouge) | Calculates ROUGE score between generated and reference text. | `output`, `expected_response` | Text | Statistical Metric | -| [**Levenshtein Similarity**](/docs/evaluation/builtin/lavenshtein-similarity) | Calculates edit distance between generated and reference text. | `output`, `expected_response` | Text | Statistical Metric | -| [**Numeric Similarity**](/docs/evaluation/builtin/numeric-similarity) | Calculates numerical difference between generated and reference value. | `output`, `expected_response` | Text | Statistical Metric | -| [**Embedding Similarity**](/docs/evaluation/builtin/embedding-similarity) | Calculates semantic similarity between generated and reference text. 
| `output`, `expected_response` | Text | Statistical Metric | -| [**Semantic List Contains**](/docs/evaluation/builtin/semantic-list-contains) | Checks if text contains phrases semantically similar to reference phrases. | `output`, `expected_response` | Text | Statistical Metric | -| [**Recall@K**](/docs/evaluation/builtin/recall-at-k) | Evaluates recall at K for retrieval-based systems. | `output`, `context` | RAG & Retrieval | Statistical Metric | -| [**Precision@K**](/docs/evaluation/builtin/precision-at-k) | Evaluates precision at K for retrieval-based systems. | `output`, `context` | RAG & Retrieval | Statistical Metric | -| [**NDCG@K**](/docs/evaluation/builtin/ndcg-at-k) | Calculates normalized discounted cumulative gain at K. | `output`, `context` | RAG & Retrieval | Statistical Metric | -| [**MRR**](/docs/evaluation/builtin/mrr) | Calculates mean reciprocal rank for retrieval results. | `output`, `context` | RAG & Retrieval | Statistical Metric | -| [**Hit Rate**](/docs/evaluation/builtin/hit-rate) | Measures the fraction of queries where the correct item appears in top-K results. | `output`, `context` | RAG & Retrieval | Statistical Metric | -| [**Customer Agent: Loop Detection**](/docs/evaluation/builtin/customer-agent-loop-detection) | Detects if a customer agent is stuck in a loop during a conversation. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Context Adherence**](/docs/evaluation/builtin/context-adherence) | Measures how well responses stay within the provided context. | `output`, `context` | Text, Audio, Image, Chat, RAG & Retrieval, Hallucination | LLM as Judge | +| [**Context Relevance**](/docs/evaluation/builtin/context-relevance) | Evaluates the relevancy of the context to the user query. 
| `input`, `context` | Text, Audio, Image, Chat, RAG & Retrieval | LLM as Judge | +| [**Conversation Coherence**](/docs/evaluation/builtin/conversation-coherence) | Evaluates if a conversation flows logically and maintains context throughout. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Conversation Resolution**](/docs/evaluation/builtin/conversation-resolution) | Checks if the conversation reaches a satisfactory conclusion. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Cultural Sensitivity**](/docs/evaluation/builtin/cultural-sensitivity) | Analyzes output for cultural appropriateness and inclusive language. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**Customer Agent: Clarification Seeking**](/docs/evaluation/builtin/customer-agent-clarification-seeking) | Evaluates if the agent appropriately seeks clarification when needed. | `conversation` | Conversation, Chat, Audio | LLM as Judge | | [**Customer Agent: Context Retention**](/docs/evaluation/builtin/customer-agent-context-retention) | Evaluates if the agent correctly retains context across conversation turns. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Customer Agent: Query Handling**](/docs/evaluation/builtin/customer-agent-query-handling) | Assesses how effectively the agent handles customer queries. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Customer Agent: Termination Handling**](/docs/evaluation/builtin/customer-agent-termination-handling) | Evaluates how the agent handles conversation termination. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Customer Agent: Interruption Handling**](/docs/evaluation/builtin/customer-agent-interruption-handling) | Checks how the agent responds to interruptions during a conversation. 
| `conversation` | Conversation, Chat, Audio | LLM as Judge | | [**Customer Agent: Conversation Quality**](/docs/evaluation/builtin/customer-agent-conversation-quality) | Evaluates the overall quality of a customer agent conversation. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Customer Agent: Objection Handling**](/docs/evaluation/builtin/customer-agent-objection-handling) | Assesses how the agent handles objections raised by the customer. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Customer Agent: Language Handling**](/docs/evaluation/builtin/customer-agent-language-handling) | Evaluates language consistency and appropriateness in agent responses. | `conversation` | Conversation, Chat, Audio | LLM as Judge | | [**Customer Agent: Human Escalation**](/docs/evaluation/builtin/customer-agent-human-escalation) | Checks if the agent correctly identifies when to escalate to a human. | `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**Customer Agent: Clarification Seeking**](/docs/evaluation/builtin/customer-agent-clarification-seeking) | Evaluates if the agent appropriately seeks clarification when needed. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Customer Agent: Interruption Handling**](/docs/evaluation/builtin/customer-agent-interruption-handling) | Checks how the agent responds to interruptions during a conversation. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Customer Agent: Language Handling**](/docs/evaluation/builtin/customer-agent-language-handling) | Evaluates language consistency and appropriateness in agent responses. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Customer Agent: Loop Detection**](/docs/evaluation/builtin/customer-agent-loop-detection) | Detects if a customer agent is stuck in a loop during a conversation. 
| `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Customer Agent: Objection Handling**](/docs/evaluation/builtin/customer-agent-objection-handling) | Assesses how the agent handles objections raised by the customer. | `conversation` | Conversation, Chat, Audio | LLM as Judge | | [**Customer Agent: Prompt Conformance**](/docs/evaluation/builtin/customer-agent-prompt-conformance) | Checks if agent responses conform to the defined prompt and guidelines. | `system_prompt`, `conversation` | Conversation, Chat, Audio | LLM as Judge | -| [**TTS Accuracy**](/docs/evaluation/builtin/tts-accuracy) | Evaluates the accuracy and naturalness of text-to-speech output. | `text`, `generated_audio` | Audio, Conversation | LLM as Judge | -| [**Ground Truth Match**](/docs/evaluation/builtin/ground-truth-match) | Checks if the output matches a provided ground truth answer. | `generated_value`, `expected_value` | Text, Audio | LLM as Judge | +| [**Customer Agent: Query Handling**](/docs/evaluation/builtin/customer-agent-query-handling) | Assesses how effectively the agent handles customer queries. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Customer Agent: Termination Handling**](/docs/evaluation/builtin/customer-agent-termination-handling) | Evaluates how the agent handles conversation termination. | `conversation` | Conversation, Chat, Audio | LLM as Judge | +| [**Data Privacy Compliance**](/docs/evaluation/builtin/data-privacy) | Checks output for GDPR, HIPAA, and other privacy regulation compliance. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**Detect Hallucination**](/docs/evaluation/builtin/detect-hallucination) | Identifies fabricated facts not present in the input or reference. | `input`, `output` | Text, Audio, Image, Chat, RAG & Retrieval, Hallucination | LLM as Judge | +| [**Distinct N**](/docs/evaluation/builtin/distinct-n) | Computes Distinct-N: ratio of unique n-grams to total n-grams. 
Measures vocabulary diversity in generated text. | `text` | NLP Metrics, Text | Statistical Metric | +| [**Embedding Similarity**](/docs/evaluation/builtin/embedding-similarity) | Calculates semantic similarity between generated and reference text. | `output`, `expected_response` | Text | Statistical Metric | +| [**Eval Ranking**](/docs/evaluation/builtin/eval-ranking) | Provides a ranking score for each context based on specified criteria. | `input`, `context` | RAG & Retrieval, Custom | LLM as Ranker | +| [**Evaluate Function Calling**](/docs/evaluation/builtin/llm-function-calling) | Assesses accuracy and effectiveness of LLM function calls. | `output` | Text | LLM as Judge | +| [**F Beta Score**](/docs/evaluation/builtin/f-beta-score) | Computes F-beta score with configurable beta for precision/recall weighting. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**F1 Score**](/docs/evaluation/builtin/f1-score) | Computes token-level F1 score between output and expected text. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | | [**FID Score**](/docs/evaluation/builtin/fid-score) | Computes the Fréchet Inception Distance between two sets of images; lower scores indicate more similar image distributions. | `real_images`, `fake_images` | Image | Statistical Metric | -| [**CLIP Score**](/docs/evaluation/builtin/clip-score) | Measures how well images match their text descriptions; higher scores indicate better image-text alignment (range: 0–100). | `images`, `text` | Image | Statistical Metric | +| [**Fleiss Kappa**](/docs/evaluation/builtin/fleiss-kappa) | Computes Fleiss' Kappa for multi-rater agreement. Extends Cohen's Kappa to N raters. | `output` | NLP Metrics, Output Validation | Statistical Metric | +| [**Fuzzy Match**](/docs/evaluation/builtin/fuzzy-match) | Compares output with expected answer using approximate matching. 
| `output`, `expected` | Text, Audio, RAG & Retrieval | Statistical Metric | +| [**Gleu Score**](/docs/evaluation/builtin/gleu-score) | Computes Google BLEU (GLEU) score. A sentence-level BLEU variant. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | +| [**Ground Truth Match**](/docs/evaluation/builtin/ground-truth-match) | Checks if the output matches a provided ground truth answer. | `generated_value`, `expected_value` | Text, Audio | LLM as Judge | +| [**Groundedness**](/docs/evaluation/builtin/groundedness) | Ensures response strictly adheres to the provided context without external information. | `output`, `context` | Text, Audio, Chat, RAG & Retrieval, Hallucination | LLM as Judge | +| [**Hamming Similarity**](/docs/evaluation/builtin/hamming-similarity) | Computes Hamming similarity between two strings. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Hit Rate**](/docs/evaluation/builtin/hit-rate) | Measures the fraction of queries where the correct item appears in top-K results. | `output`, `context` | RAG & Retrieval | Statistical Metric | | [**Image Instruction Adherence**](/docs/evaluation/builtin/image-instruction-adherence) | Measures how well generated images adhere to a given text instruction across subject, style, and composition. | `instruction`, `images` | Image | LLM as Judge | -| [**Accuracy**](/docs/evaluation/builtin/accuracy) | Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or JSON arrays of labels. Case-insensitive comparison. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Answer Similarity**](/docs/evaluation/builtin/answer-similarity) | Evaluates the similarity between the expected and actual responses. 
| `expected_response`, `response` | NLP Metrics, Output Validation | Statistical Metric | -| [**Api Call**](/docs/evaluation/builtin/api-call) | Makes an API call and evaluates the response. | `response` | Code, Output Validation | Deterministic / Rule-based | -| [**Balanced Accuracy**](/docs/evaluation/builtin/balanced-accuracy) | Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Character Error Rate**](/docs/evaluation/builtin/character-error-rate) | Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between reference and hypothesis. Returns 1-CER as score (higher=better). | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | -| [**Chrf Score**](/docs/evaluation/builtin/chrf-score) | Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short texts. Uses character-level n-grams up to order 6 with recall-weighted F-score. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | -| [**Code Bleu**](/docs/evaluation/builtin/code-bleu) | Computes CodeBLEU - a code-aware BLEU variant that combines standard n-gram BLEU with keyword matching for code-specific tokens (def, class, return, if, for, etc). Better than standard BLEU for evaluating code generation. | `reference`, `hypothesis` | Code, NLP Metrics | Statistical Metric | -| [**Code Complexity**](/docs/evaluation/builtin/code-complexity) | Computes cyclomatic complexity of Python code using AST analysis. Counts decision points (if, for, while, except, boolean ops). Lower complexity = higher score. Useful for code quality evaluation. | `text` | Code, NLP Metrics | Statistical Metric | -| [**Cohen Kappa**](/docs/evaluation/builtin/cohen-kappa) | Computes Cohen's Kappa coefficient for inter-rater agreement. 
Accounts for agreement occurring by chance. Range -1 to 1, normalized to 0-1 for scoring. Useful for classification evaluation with imbalanced classes. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Contains All**](/docs/evaluation/builtin/contains-all) | Verifies text contains all specified keywords. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Contains Any**](/docs/evaluation/builtin/contains-any) | Checks if the text contains any of the specified keywords. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Contains None**](/docs/evaluation/builtin/contains-none) | Verifies text contains none of specified terms. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Content Moderation**](/docs/evaluation/builtin/content-moderation) | Uses content moderation to evaluate content safety. | `output` | Red Teaming, Safety, Harmful Objects | LLM as Judge | -| [**Content Safety Violation**](/docs/evaluation/builtin/content-safety-violation) | A broad check for content that violates safety or usage policies—this includes toxicity, hate speech, explicit content, violence, etc. | `output` | Red Teaming, Safety, Harmful Objects | LLM as Judge | -| [**Custom Code Evaluation**](/docs/evaluation/builtin/custom-code-evaluation) | Executes custom Python code for evaluation. | — | Code, Output Validation | Deterministic / Rule-based | -| [**Deterministic Evals**](/docs/evaluation/builtin/deterministic-evals) | Evaluates if the output is deterministic or not. | — | Output Validation | LLM as Judge | -| [**Distinct N**](/docs/evaluation/builtin/distinct-n) | Computes Distinct-N: ratio of unique n-grams to total n-grams. Measures vocabulary diversity in generated text. Higher = more diverse. | `text` | NLP Metrics, Text | Statistical Metric | -| [**Ends With**](/docs/evaluation/builtin/ends-with) | Checks if text ends with specific substring. 
| `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Equals**](/docs/evaluation/builtin/equals) | Compares if two texts are exactly equal. | `text`, `expected_text` | Output Validation | Deterministic / Rule-based | -| [**F Beta Score**](/docs/evaluation/builtin/f-beta-score) | Computes F-beta score with configurable beta for precision/recall weighting. `Beta<1` favors precision, `beta>1` favors recall. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**F1 Score**](/docs/evaluation/builtin/f1-score) | Computes token-level F1 score between output and expected text. Treats both texts as bags of tokens and calculates the harmonic mean of precision and recall. Widely used for QA and extraction tasks. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Factual Accuracy**](/docs/evaluation/builtin/factual-accuracy) | Verifies if the provided output is factually correct or not. | `input`, `output`, `context` | Hallucination, NLP Metrics | LLM as Judge | -| [**Fleiss Kappa**](/docs/evaluation/builtin/fleiss-kappa) | Computes Fleiss' Kappa for multi-rater agreement. Extends Cohen's Kappa to N raters. Input: matrix where rows=subjects, columns=categories, values=rater counts. | `output` | NLP Metrics, Output Validation | Statistical Metric | -| [**Gleu Score**](/docs/evaluation/builtin/gleu-score) | Computes Google BLEU (GLEU) score. A sentence-level BLEU variant that takes the minimum of precision and recall for each n-gram order, making it more balanced than standard BLEU. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | -| [**Hamming Similarity**](/docs/evaluation/builtin/hamming-similarity) | Computes Hamming similarity between two strings. Counts matching character positions normalized by the longer string length. Pads the shorter string for unequal lengths. 
| `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Image Properties**](/docs/evaluation/builtin/image-properties) | Validates image properties including dimensions, format, and file size. Useful for ensuring generated images meet specific requirements. | `text` | Image, Output Validation | Deterministic / Rule-based | -| [**Is Compliant**](/docs/evaluation/builtin/is-compliant) | Ensures that the output adheres to legal, regulatory, or organizational policies (e.g., HIPAA, GDPR, company rules). | `output` | Safety, Output Validation | LLM as Judge | -| [**Is Factually Consistent**](/docs/evaluation/builtin/is-factually-consistent) | Checks if the generated output is factually consistent with the source/context (e.g., input text or documents). | `input`, `output`, `context` | Hallucination, NLP Metrics | LLM as Judge | -| [**Is Html**](/docs/evaluation/builtin/is-html) | Validates if text contains well-formed HTML with proper tag nesting. Checks for the presence of HTML tags and validates that all non-void tags are properly closed. | `text` | Output Format, Code | Deterministic / Rule-based | -| [**Is Refusal**](/docs/evaluation/builtin/is-refusal) | Detects if LLM output is a refusal to answer using common refusal pattern matching. Returns True if refusal detected. | `text` | Safety, Output Validation | Deterministic / Rule-based | -| [**Is Sql**](/docs/evaluation/builtin/is-sql) | Validates if text appears to be syntactically valid SQL. Checks for recognized SQL keywords, balanced parentheses, and balanced quotes. | `text` | Output Format, Code | Deterministic / Rule-based | +| [**Image Properties**](/docs/evaluation/builtin/image-properties) | Validates image properties including dimensions, format, and file size. | `text` | Image, Output Validation | Deterministic / Rule-based | +| [**Is Concise**](/docs/evaluation/builtin/is-concise) | Measures whether the answer is brief and avoids redundancy. 
| `output` | Text, Audio, Chat | LLM as Judge | +| [**Is Email**](/docs/evaluation/builtin/is-email) | Validates email address format. | `output` | Text | Deterministic / Rule-based | +| [**Is Good Summary**](/docs/evaluation/builtin/is-good-summary) | Evaluates if a summary is clear, well-structured, and captures key points. | `input`, `output` | Text, Audio, RAG & Retrieval | LLM as Judge | +| [**Is Harmful Advice**](/docs/evaluation/builtin/is-harmful-advice) | Detects advice that could be physically, emotionally, legally, or financially harmful. | `output` | Text, Audio, Chat, Safety | LLM as Judge | +| [**Is Helpful**](/docs/evaluation/builtin/is-helpful) | Evaluates whether the response answers the user's question effectively. | `input`, `output` | Text, Audio, Chat | LLM as Judge | +| [**Is Html**](/docs/evaluation/builtin/is-html) | Validates if text contains well-formed HTML with proper tag nesting. | `text` | Output Format, Code | Deterministic / Rule-based | +| [**Is Informal Tone**](/docs/evaluation/builtin/is-informal-tone) | Detects whether the tone is casual (slang, contractions, emoji). | `output` | Text, Audio, Chat | LLM as Judge | +| [**Is JSON**](/docs/evaluation/builtin/is-json) | Validates if content is proper JSON format. | `output` | Text | Deterministic / Rule-based | +| [**Is Polite**](/docs/evaluation/builtin/is-polite) | Ensures output maintains a respectful and non-aggressive tone. | `output` | Text, Audio, Chat | LLM as Judge | +| [**Is Refusal**](/docs/evaluation/builtin/is-refusal) | Detects if LLM output is a refusal to answer using common refusal pattern matching. | `text` | Safety, Output Validation | Deterministic / Rule-based | +| [**Is Sql**](/docs/evaluation/builtin/is-sql) | Validates if text appears to be syntactically valid SQL. 
| `text` | Output Format, Code | Deterministic / Rule-based | | [**Is Url**](/docs/evaluation/builtin/is-url) | Validates if text is a properly formatted URL with a valid scheme and network location. | `text` | Output Format, Output Validation | Deterministic / Rule-based | -| [**Is Xml**](/docs/evaluation/builtin/is-xml) | Validates if text is well-formed XML. Checks that the text can be parsed as a valid XML document. | `text` | Output Format, Code | Deterministic / Rule-based | -| [**Jaccard Similarity**](/docs/evaluation/builtin/jaccard-similarity) | Computes Jaccard similarity (intersection over union) between token sets of two texts. Useful for measuring set-level overlap regardless of frequency or order. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Jaro Winkler Similarity**](/docs/evaluation/builtin/jaro-winkler-similarity) | Computes Jaro-Winkler similarity between two strings. Particularly effective for short strings like names, labels, and identifiers. Adds a prefix bonus to the base Jaro distance. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Json Diff**](/docs/evaluation/builtin/json-diff) | Deep structural comparison between two JSON objects. Recursively compares keys and values at all levels, returning a score based on the fraction of matching nodes. Useful for evaluating structured output generation. | `output`, `expected` | Output Format, Output Validation | Deterministic / Rule-based | -| [**Json Scheme Validation**](/docs/evaluation/builtin/json-scheme-validation) | Validates JSON against specified criteria. | `actual_json`, `expected_json` | Output Format, Output Validation | Deterministic / Rule-based | -| [**Latency Check**](/docs/evaluation/builtin/latency-check) | Validates that response latency is within acceptable bounds. Pass if `latency <= max_latency_ms`. 
| `text` | Output Validation | Deterministic / Rule-based | -| [**Length Between**](/docs/evaluation/builtin/length-between) | Checks if the text length is between specified min and max values. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Length Greater Than**](/docs/evaluation/builtin/length-greater-than) | Checks if the text length is greater than a specified value. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Length Less Than**](/docs/evaluation/builtin/length-less-than) | Checks if text length is below threshold. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Log Loss**](/docs/evaluation/builtin/log-loss) | Computes log loss (cross-entropy) for probability predictions. Returns 1/(1+loss) as score. Lower loss = higher score. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Match Error Rate**](/docs/evaluation/builtin/match-error-rate) | Computes Match Error Rate (MER) for speech recognition. MER = edits / (hits + edits). Returns 1-MER as score. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | -| [**Matthews Correlation**](/docs/evaluation/builtin/matthews-correlation) | Computes Matthews Correlation Coefficient (MCC). A balanced metric for binary and multiclass classification that produces high scores only when the prediction obtains good results in all four confusion matrix categories. Range -1 to 1, normalized to 0-1. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Mean Average Precision**](/docs/evaluation/builtin/mean-average-precision) | Computes Mean Average Precision (MAP) for information retrieval. Averages precision at each relevant item across queries. | `reference`, `hypothesis` | RAG, Retrieval Systems, NLP Metrics | Statistical Metric | -| [**Meteor Score**](/docs/evaluation/builtin/meteor-score) | Computes METEOR score between reference and hypothesis. 
Uses unigram matching with exact and stem matching, penalizing fragmentation. More correlated with human judgment than BLEU for many tasks. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | -| [**Non Llm Context Precision**](/docs/evaluation/builtin/non-llm-context-precision) | Non-LLM context precision for RAG evaluation. Measures what fraction of retrieved contexts match reference contexts using exact string matching. | `output`, `expected` | RAG, Retrieval Systems | Deterministic / Rule-based | -| [**Non Llm Context Recall**](/docs/evaluation/builtin/non-llm-context-recall) | Non-LLM context recall for RAG evaluation. Measures what fraction of reference contexts were successfully retrieved. | `output`, `expected` | RAG, Retrieval Systems | Deterministic / Rule-based | +| [**Is Xml**](/docs/evaluation/builtin/is-xml) | Validates if text is well-formed XML. | `text` | Output Format, Code | Deterministic / Rule-based | +| [**Jaccard Similarity**](/docs/evaluation/builtin/jaccard-similarity) | Computes Jaccard similarity (intersection over union) between token sets of two texts. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Jaro Winkler Similarity**](/docs/evaluation/builtin/jaro-winkler-similarity) | Computes Jaro-Winkler similarity between two strings. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Json Diff**](/docs/evaluation/builtin/json-diff) | Deep structural comparison between two JSON objects. | `output`, `expected` | Output Format, Output Validation | Deterministic / Rule-based | +| [**Latency Check**](/docs/evaluation/builtin/latency-check) | Validates that response latency is within acceptable bounds. | `text` | Output Validation | Deterministic / Rule-based | +| [**Levenshtein Similarity**](/docs/evaluation/builtin/lavenshtein-similarity) | Calculates edit distance between generated and reference text. 
| `output`, `expected_response` | Text | Statistical Metric | +| [**Log Loss**](/docs/evaluation/builtin/log-loss) | Computes log loss (cross-entropy) for probability predictions. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Match Error Rate**](/docs/evaluation/builtin/match-error-rate) | Computes Match Error Rate (MER) for speech recognition. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Matthews Correlation**](/docs/evaluation/builtin/matthews-correlation) | Computes Matthews Correlation Coefficient (MCC). A balanced metric for binary and multiclass classification. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Mean Average Precision**](/docs/evaluation/builtin/mean-average-precision) | Computes Mean Average Precision (MAP) for information retrieval. | `reference`, `hypothesis` | RAG, Retrieval Systems, NLP Metrics | Statistical Metric | +| [**Meteor Score**](/docs/evaluation/builtin/meteor-score) | Computes METEOR score between reference and hypothesis. More correlated with human judgment than BLEU for many tasks. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | +| [**MRR**](/docs/evaluation/builtin/mrr) | Calculates mean reciprocal rank for retrieval results. | `output`, `context` | RAG & Retrieval | Statistical Metric | +| [**NDCG@K**](/docs/evaluation/builtin/ndcg-at-k) | Calculates normalized discounted cumulative gain at K. | `output`, `context` | RAG & Retrieval | Statistical Metric | +| [**No Age Bias**](/docs/evaluation/builtin/no-age-bias) | Evaluates if content is free from age-based stereotypes. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**No Apologies**](/docs/evaluation/builtin/no-apologies) | Checks if the model unnecessarily apologizes. 
| `output` | Text, Audio, Chat | LLM as Judge | +| [**No Gender Bias**](/docs/evaluation/builtin/no-gender-bias) | Checks the response does not reinforce gender stereotypes. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**No Harmful Therapeutic Guidance**](/docs/evaluation/builtin/no-harmful-therapeutic-guidance) | Ensures the model does not provide potentially harmful psychological advice. | `output` | Text, Audio, Chat, Safety | LLM as Judge | +| [**No Invalid Links**](/docs/evaluation/builtin/no-invalid-links) | Checks if the text contains no invalid URLs. | `output` | Text | Deterministic / Rule-based | +| [**No LLM Reference**](/docs/evaluation/builtin/no-llm-reference) | Ensures output does not reference being an LLM or OpenAI model. | `output` | Text, Audio, Chat, Safety | LLM as Judge | +| [**No Racial Bias**](/docs/evaluation/builtin/no-racial-bias) | Ensures output does not contain or imply racial bias. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**Non Llm Context Precision**](/docs/evaluation/builtin/non-llm-context-precision) | Measures what fraction of retrieved contexts match the reference contexts in RAG evaluation, using exact string matching rather than an LLM. | `output`, `expected` | RAG, Retrieval Systems | Deterministic / Rule-based | +| [**Non Llm Context Recall**](/docs/evaluation/builtin/non-llm-context-recall) | Measures what fraction of the reference contexts were successfully retrieved in RAG evaluation, without using an LLM. | `output`, `expected` | RAG, Retrieval Systems | Deterministic / Rule-based | +| [**Numeric Similarity**](/docs/evaluation/builtin/numeric-similarity) | Calculates numerical difference between generated and reference value. | `output`, `expected_response` | Text | Statistical Metric | +| [**OCR Evaluation**](/docs/evaluation/builtin/ocr-evaluation) | Evaluates the accuracy of optical character recognition (OCR) output. | `input_pdf`, `json_content` | Text, PDF / Document | LLM as Judge | | [**One Line**](/docs/evaluation/builtin/one-line) | Checks if the text is a single line.
| `text` | Output Validation, Output Format | Deterministic / Rule-based | -| [**Pearson Correlation**](/docs/evaluation/builtin/pearson-correlation) | Computes Pearson correlation coefficient between two sets of numeric values. Measures linear relationship strength (-1 to 1, normalized to 0-1). | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Precision Score**](/docs/evaluation/builtin/precision-score) | Computes classification precision (TP / (TP + FP)) for binary or multiclass tasks. Measures how many positive predictions are actually correct. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Prompt Adherence**](/docs/evaluation/builtin/prompt-adherence) | Assesses how closely the output follows the given prompt instructions, checking for completion of all requested tasks and adherence to specified constraints or formats. Evaluates both explicit and implicit requirements in the prompt. | `input`, `output` | NLP Metrics, Output Validation | LLM as Judge | -| [**Psnr**](/docs/evaluation/builtin/psnr) | Computes Peak Signal-to-Noise Ratio (PSNR) between two images. Higher PSNR indicates more similar images. Returns a normalized score (0-1) where PSNR is mapped from 0-50 dB range. | `output`, `expected` | Image, NLP Metrics | Statistical Metric | -| [**R2 Score**](/docs/evaluation/builtin/r2-score) | Computes R-squared (coefficient of determination). Measures proportion of variance explained by predictions. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Readability Score**](/docs/evaluation/builtin/readability-score) | Computes Flesch-Kincaid readability metrics. Returns a normalized score (0-1) based on Flesch Reading Ease. Higher scores indicate more readable text. Also reports grade level. 
| `text` | NLP Metrics, Text | Statistical Metric | -| [**Recall Score**](/docs/evaluation/builtin/recall-score) | Recall: Out of all ground-truth relevant chunks, what fraction was retrieved. | `hypothesis`, `reference` | NLP Metrics, Output Validation | Statistical Metric | -| [**Regex**](/docs/evaluation/builtin/regex) | Checks if the text matches a specified regex pattern. | `text` | Output Validation, Output Format | Deterministic / Rule-based | -| [**Regex Pii Detection**](/docs/evaluation/builtin/regex-pii-detection) | Detects Personally Identifiable Information (PII) using regex patterns. Scans for SSN, credit card numbers, phone numbers, email addresses, and IP addresses. Returns pass (no PII) or fail (PII detected). | `text` | Data Leakage, Safety | Deterministic / Rule-based | -| [**Repetition Rate**](/docs/evaluation/builtin/repetition-rate) | Measures repeated n-gram rate in text. Returns 1-rate as score (higher = less repetitive = better). Useful for detecting degenerate/looping LLM outputs. | `text` | NLP Metrics, Text | Statistical Metric | -| [**Rmse**](/docs/evaluation/builtin/rmse) | Computes Root Mean Squared Error between predicted and actual values. Returns 1/(1+RMSE) as score (higher=better). | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Sentence Count**](/docs/evaluation/builtin/sentence-count) | Counts sentences in text and optionally validates against a min/max range. Useful for enforcing structural constraints on generated text. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Spearman Correlation**](/docs/evaluation/builtin/spearman-correlation) | Computes Spearman rank correlation coefficient. Measures monotonic relationship between two sets of values (-1 to 1, normalized to 0-1). 
| `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | -| [**Squad Score**](/docs/evaluation/builtin/squad-score) | Computes SQuAD-style evaluation: exact match + token F1 for QA tasks. Normalizes text (lowercase, remove articles/punctuation) before comparing. | `output`, `expected` | NLP Metrics, RAG | Statistical Metric | -| [**Ssim**](/docs/evaluation/builtin/ssim) | Computes Structural Similarity Index (SSIM) between two images. Measures perceptual similarity based on luminance, contrast, and structure. Score ranges from 0 (completely different) to 1 (identical). | `output`, `expected` | Image, NLP Metrics | Statistical Metric | -| [**Starts With**](/docs/evaluation/builtin/starts-with) | Checks if text begins with specific substring. | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Step Count**](/docs/evaluation/builtin/step-count) | Counts and validates the number of steps/actions in an agent trajectory. Can check against exact count, minimum, maximum, or range. | `output` | Agents, Output Validation | Deterministic / Rule-based | -| [**Syntax Validation**](/docs/evaluation/builtin/syntax-validation) | Validates code syntax without executing it. Supports Python (via ast.parse), JSON, and basic JavaScript bracket validation. Useful for checking if LLM-generated code is syntactically correct. | `text` | Code, Output Validation | Deterministic / Rule-based | -| [**Tool Call Accuracy**](/docs/evaluation/builtin/tool-call-accuracy) | Evaluates accuracy of agent tool/function calls by comparing actual vs expected calls. Checks function names and arguments, supporting both OpenAI and generic tool call formats. Scores full matches (name+args) at 1.0 and name-only matches at 0.5. | `output`, `expected` | Agents, Code | Deterministic / Rule-based | -| [**Trajectory Match**](/docs/evaluation/builtin/trajectory-match) | Validates agent action/tool call sequences. 
Supports strict (same order), unordered (any order), subset (expected in actual), and superset (actual in expected) matching modes. | `output`, `expected` | Agents, Output Validation | Deterministic / Rule-based | -| [**Translation Edit Rate**](/docs/evaluation/builtin/translation-edit-rate) | Computes Translation Edit Rate (TER). TER measures the minimum number of edits (insertions, deletions, substitutions) needed to transform the hypothesis into the reference, normalized by reference length. Returns 1-TER (higher=better). | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | -| [**Type Token Ratio**](/docs/evaluation/builtin/type-token-ratio) | Computes Type-Token Ratio (TTR): unique tokens divided by total tokens. Measures lexical diversity. | `text` | NLP Metrics, Text | Statistical Metric | -| [**Word Count In Range**](/docs/evaluation/builtin/word-count-in-range) | Checks if the word count of text falls within a specified range. Useful for enforcing length constraints on generated responses (e.g., summaries, tweets, abstracts). | `text` | Output Validation, Text | Deterministic / Rule-based | -| [**Word Error Rate**](/docs/evaluation/builtin/word-error-rate) | Computes Word Error Rate (WER) for ASR/STT evaluation. WER measures the edit distance at the word level between reference and hypothesis transcriptions. Returns 1-WER as score (higher=better). | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | -| [**Word Info Lost**](/docs/evaluation/builtin/word-info-lost) | Computes Word Information Lost (WIL) for speech. WIL = 1 - (hits/ref * hits/hyp). Returns 1-WIL as score. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | -| [**Word Info Preserved**](/docs/evaluation/builtin/word-info-preserved) | Computes Word Information Preserved (WIP) for speech. WIP = (hits/ref) * (hits/hyp). Higher = better. 
| `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Pearson Correlation**](/docs/evaluation/builtin/pearson-correlation) | Computes Pearson correlation coefficient between two sets of numeric values. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**PII Detection**](/docs/evaluation/builtin/pii) | Detects personally identifiable information (PII) in text. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**Precision Score**](/docs/evaluation/builtin/precision-score) | Computes classification precision (TP / (TP + FP)) for binary or multiclass tasks. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Precision@K**](/docs/evaluation/builtin/precision-at-k) | Evaluates precision at K for retrieval-based systems. | `output`, `context` | RAG & Retrieval | Statistical Metric | +| [**Prompt Adherence**](/docs/evaluation/builtin/prompt-adherence) | Assesses how closely the output follows the given prompt instructions. | `input`, `output` | NLP Metrics, Output Validation | LLM as Judge | +| [**Prompt Injection**](/docs/evaluation/builtin/prompt-injection) | Evaluates text for potential prompt injection attempts. | `input`, `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**PSNR**](/docs/evaluation/builtin/psnr) | Computes Peak Signal-to-Noise Ratio (PSNR) between two images. | `output`, `expected` | Image, NLP Metrics | Statistical Metric | +| [**R2 Score**](/docs/evaluation/builtin/r2-score) | Computes R-squared (coefficient of determination). | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Readability Score**](/docs/evaluation/builtin/readability-score) | Computes Flesch-Kincaid readability metrics. | `text` | NLP Metrics, Text | Statistical Metric | +| [**Recall@K**](/docs/evaluation/builtin/recall-at-k) | Evaluates recall at K for retrieval-based systems. 
| `output`, `context` | RAG & Retrieval | Statistical Metric | +| [**Regex PII Detection**](/docs/evaluation/builtin/regex-pii-detection) | Detects Personally Identifiable Information (PII) using regex patterns. | `text` | Data Leakage, Safety | Deterministic / Rule-based | +| [**Repetition Rate**](/docs/evaluation/builtin/repetition-rate) | Measures repeated n-gram rate in text. Useful for detecting degenerate/looping LLM outputs. | `text` | NLP Metrics, Text | Statistical Metric | +| [**RMSE**](/docs/evaluation/builtin/rmse) | Computes Root Mean Squared Error between predicted and actual values. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**ROUGE Score**](/docs/evaluation/builtin/rouge) | Calculates ROUGE score between generated and reference text. | `output`, `expected_response` | Text | Statistical Metric | +| [**Semantic List Contains**](/docs/evaluation/builtin/semantic-list-contains) | Checks if text contains phrases semantically similar to reference phrases. | `output`, `expected_response` | Text | Statistical Metric | +| [**Sentence Count**](/docs/evaluation/builtin/sentence-count) | Counts sentences in text and optionally validates against a min/max range. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Sexist**](/docs/evaluation/builtin/sexist) | Detects sexist content and gender bias. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**Spearman Correlation**](/docs/evaluation/builtin/spearman-correlation) | Computes Spearman rank correlation coefficient. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Squad Score**](/docs/evaluation/builtin/squad-score) | Computes SQuAD-style evaluation: exact match + token F1 for QA tasks. | `output`, `expected` | NLP Metrics, RAG | Statistical Metric | +| [**SSIM**](/docs/evaluation/builtin/ssim) | Computes Structural Similarity Index (SSIM) between two images. 
| `output`, `expected` | Image, NLP Metrics | Statistical Metric | +| [**Step Count**](/docs/evaluation/builtin/step-count) | Counts and validates the number of steps/actions in an agent trajectory. | `output` | Agents, Output Validation | Deterministic / Rule-based | +| [**Summary Quality**](/docs/evaluation/builtin/summary-quality) | Evaluates if a summary captures main points and achieves appropriate length. | `input`, `output` | Text, Audio, Image, RAG & Retrieval | LLM as Judge | +| [**Syntax Validation**](/docs/evaluation/builtin/syntax-validation) | Validates code syntax without executing it. Supports Python, JSON, and basic JavaScript bracket validation. | `text` | Code, Output Validation | Deterministic / Rule-based | +| [**Synthetic Image Evaluator**](/docs/evaluation/builtin/synthetic-image-evaluator) | Evaluates synthetic or AI-generated images against criteria. | `image`, `instruction` | Image | LLM as Judge | +| [**Task Completion**](/docs/evaluation/builtin/task-completion) | Measures whether the model fulfilled the user's request accurately. | `input`, `output` | Text, Audio, Chat | LLM as Judge | +| [**Text to SQL**](/docs/evaluation/builtin/text-to-sql) | Evaluates the quality and correctness of text-to-SQL generation. | `input`, `output` | Text | LLM as Judge | +| [**Tone**](/docs/evaluation/builtin/tone) | Analyzes the tone and sentiment of content. | `output` | Text, Audio, Chat, Safety | LLM as Judge | +| [**Tool Call Accuracy**](/docs/evaluation/builtin/tool-call-accuracy) | Evaluates accuracy of agent tool/function calls by comparing actual vs expected calls. | `output`, `expected` | Agents, Code | Deterministic / Rule-based | +| [**Toxicity**](/docs/evaluation/builtin/toxicity) | Evaluates content for toxic or harmful language. | `output` | Text, Audio, Image, Chat, Safety | LLM as Judge | +| [**Trajectory Match**](/docs/evaluation/builtin/trajectory-match) | Validates agent action/tool call sequences. 
Supports strict, unordered, subset, and superset matching modes. | `output`, `expected` | Agents, Output Validation | Deterministic / Rule-based | +| [**Translation Accuracy**](/docs/evaluation/builtin/translation-accuracy) | Evaluates translation quality, accuracy, and cultural appropriateness. | `output`, `expected_response` | Text, Audio, RAG & Retrieval | LLM as Judge | +| [**Translation Edit Rate**](/docs/evaluation/builtin/translation-edit-rate) | Computes Translation Edit Rate (TER). | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | +| [**TTS Accuracy**](/docs/evaluation/builtin/tts-accuracy) | Evaluates the accuracy and naturalness of text-to-speech output. | `text`, `generated_audio` | Audio, Conversation | LLM as Judge | +| [**Type Token Ratio**](/docs/evaluation/builtin/type-token-ratio) | Computes Type-Token Ratio (TTR): unique tokens divided by total tokens. | `text` | NLP Metrics, Text | Statistical Metric | +| [**Word Count In Range**](/docs/evaluation/builtin/word-count-in-range) | Checks if the word count of text falls within a specified range. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Word Error Rate**](/docs/evaluation/builtin/word-error-rate) | Computes Word Error Rate (WER) for ASR/STT evaluation. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Word Info Lost**](/docs/evaluation/builtin/word-info-lost) | Computes Word Information Lost (WIL) for speech. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Word Info Preserved**](/docs/evaluation/builtin/word-info-preserved) | Computes Word Information Preserved (WIP) for speech. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | + +--- + +## Next steps + +- [Run evals in the UI](/docs/evaluation/run/in-the-ui): apply an eval to a dataset from the dashboard. +- [Run evals with the Python SDK](/docs/evaluation/run/python-sdk): script the same eval from code. 
+- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules when the catalog doesn't cover your case. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, or Code. diff --git a/src/pages/docs/evaluation/builtin/instruction-adherence.mdx b/src/pages/docs/evaluation/builtin/instruction-adherence.mdx deleted file mode 100644 index 43ff1af9..00000000 --- a/src/pages/docs/evaluation/builtin/instruction-adherence.mdx +++ /dev/null @@ -1,68 +0,0 @@ ---- - -title: "Prompt Instruction Adherence Evaluation Metric" -description: "Measures how closely an output follows prompt instructions, checking task completion and adherence to specified constraints or output formats." ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="prompt_instruction_adherence", - inputs={ - "prompt": "Write a short poem about nature that has exactly 4 lines and includes the word 'sunshine'.", - "output": "Morning rays filter through leaves,\nBirds sing in harmony with sunshine's glow,\nGreen meadows dance in the gentle breeze,\nNature's symphony in perfect flow." - }, - model_name="turing_flash" -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator, Templates } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "prompt_instruction_adherence", - { - prompt: "Write a short poem about nature that has exactly 4 lines and includes the word 'sunshine'.", - output: "Morning rays filter through leaves,\nBirds sing in harmony with sunshine's glow,\nGreen meadows dance in the gentle breeze,\nNature's symphony in perfect flow." 
- }, - { - modelName: "turing_flash", - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** |**Type** | **Description** | -| | `prompt` | `string` | The input prompt provided to the model | -| | `output` | `string` | The output generated by the model | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns a score, where higher values indicate better adherence to the prompt instructions | -| | **Reason** | Provides a detailed explanation of the prompt instruction adherence assessment | - -### What to Do if Prompt Instruction Adherence is Low - -Identify specific areas where the output deviates from the given instructions. Providing targeted feedback helps refine the content to better align with the prompt. - -Reviewing the prompt for clarity and completeness is essential, as ambiguous or vague instructions may contribute to poor adherence. If necessary, adjusting the prompt to offer clearer guidance can improve response accuracy. - -Enhancing the model's ability to interpret and follow instructions through fine-tuning or prompt engineering can further strengthen adherence. - -### Differentiating Prompt/Instruction Adherence with [Context Adherence](/docs/evaluation/builtin/context-adherence) - -Context Adherence focuses on maintaining information boundaries and verifying sources, ensuring that responses are strictly derived from the given context. Whereas, Prompt Adherence evaluates whether the output correctly follows instructions, completes tasks, and adheres to specified formats. - -Their evaluation criteria differ, with Context Adherence checking if information originates from the provided context, while Prompt Adherence ensures that all instructions are followed accurately. 
\ No newline at end of file diff --git a/src/pages/docs/evaluation/builtin/is-compliant.mdx b/src/pages/docs/evaluation/builtin/is-compliant.mdx deleted file mode 100644 index 105e3b79..00000000 --- a/src/pages/docs/evaluation/builtin/is-compliant.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Is Compliant: Built-in Evaluation" -description: "Ensures that the output adheres to legal, regulatory, or organizational policies (e.g., HIPAA, GDPR, company rules)" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="is_compliant", - inputs={ - "output": "The capital of France is Paris." - }, - model_name="turing_flash" -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "is_compliant", - { - output: "The capital of France is Paris." - }, - { - modelName: "turing_flash", - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `output` | `string` | The output to be evaluated for compliance. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Safety`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/is-concise.mdx b/src/pages/docs/evaluation/builtin/is-concise.mdx index 5d525039..739a81eb 100644 --- a/src/pages/docs/evaluation/builtin/is-concise.mdx +++ b/src/pages/docs/evaluation/builtin/is-concise.mdx @@ -60,4 +60,4 @@ console.log(result); - [**Completeness**](/docs/evaluation/builtin/completeness): Is Concise evaluates brevity and avoidance of redundancy, while Completeness ensures the response addresses all aspects of the query. - [**Is Helpful**](/docs/evaluation/builtin/is-helpful): Is Concise focuses on avoiding unnecessary verbosity, while Is Helpful evaluates whether the response actually answers the user's question effectively. -- [**Instruction Adherence**](/docs/evaluation/builtin/instruction-adherence): Is Concise measures response length quality, while Instruction Adherence checks if the response follows specific instructions that might include length or detail requirements. \ No newline at end of file +- [**Prompt Adherence**](/docs/evaluation/builtin/prompt-adherence): Is Concise measures response length quality, while Prompt Adherence checks if the response follows specific instructions that might include length or detail requirements. \ No newline at end of file diff --git a/src/pages/docs/evaluation/builtin/is-email.mdx b/src/pages/docs/evaluation/builtin/is-email.mdx index abbab32f..85bbd18e 100644 --- a/src/pages/docs/evaluation/builtin/is-email.mdx +++ b/src/pages/docs/evaluation/builtin/is-email.mdx @@ -55,7 +55,7 @@ Review the input text to identify formatting issues. Common problems may include Consider revising the input to ensure it meets the standard email format. 
-### Differentiating "Is Email" with [Contain](/docs/evaluation/builtin/contain-evals) Eval +### Differentiating "Is Email" from generic text checks The "Is Email" evaluation uses a regex pattern specifically designed for email validation, ensuring accurate identification of valid email addresses while minimising false positives. This approach prevents incorrect acceptance of improperly formatted emails. In contrast, Contains Evaluations may lead to inaccuracies by detecting partial matches, such as flagging "user@domain" as containing an email, even though it lacks the full structure of a valid email address. Unlike regex-based validation, these evaluations do not verify completeness, making them less reliable for strict email validation. diff --git a/src/pages/docs/evaluation/builtin/is-factually-consistent.mdx b/src/pages/docs/evaluation/builtin/is-factually-consistent.mdx deleted file mode 100644 index 11e4231d..00000000 --- a/src/pages/docs/evaluation/builtin/is-factually-consistent.mdx +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Is Factually Consistent: Built-in Evaluation" -description: "Checks if the generated output is factually consistent with the source/context (e.g., input text or documents)" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="is_factually_consistent", - inputs={ - "input": "What is the capital of France?", - "output": "The capital of France is Paris.", - "context": "Paris is the capital and most populous city of France." - }, - model_name="turing_flash" -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "is_factually_consistent", - { - input: "What is the capital of France?", - output: "The capital of France is Paris.", - context: "Paris is the capital and most populous city of France." 
- }, - { - modelName: "turing_flash", - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `input` | `string` | The source/context material. | -| | `output` | `string` | The output to be evaluated for factual consistency. | -| | `context` | `string` | The context to compare against for factual consistency. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Hallucination`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/is-helpful.mdx b/src/pages/docs/evaluation/builtin/is-helpful.mdx index 004c56a8..52cd4abd 100644 --- a/src/pages/docs/evaluation/builtin/is-helpful.mdx +++ b/src/pages/docs/evaluation/builtin/is-helpful.mdx @@ -63,5 +63,5 @@ console.log(result); - [**Completeness**](/docs/evaluation/builtin/completeness): Is Helpful evaluates whether the response is useful overall, while Completeness checks if all aspects of the query are addressed. - [**Task Completion**](/docs/evaluation/builtin/task-completion): Is Helpful assesses general usefulness, while Task Completion checks if a specific requested task was accomplished. -- [**Instruction Adherence**](/docs/evaluation/builtin/instruction-adherence): Is Helpful evaluates usefulness, while Instruction Adherence evaluates if the response follows specific instructions. +- [**Prompt Adherence**](/docs/evaluation/builtin/prompt-adherence): Is Helpful evaluates usefulness, while Prompt Adherence evaluates if the response follows specific instructions. - [**Is Concise**](/docs/evaluation/builtin/is-concise): Is Helpful focuses on effectiveness, while Is Concise assesses whether the response avoids unnecessary verbosity. 
\ No newline at end of file diff --git a/src/pages/docs/evaluation/builtin/json-scheme-validation.mdx b/src/pages/docs/evaluation/builtin/json-scheme-validation.mdx deleted file mode 100644 index 72a7ef92..00000000 --- a/src/pages/docs/evaluation/builtin/json-scheme-validation.mdx +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Json Scheme Validation: Built-in Evaluation" -description: "Validates JSON against specified criteria" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="json_scheme_validation", - inputs={ - "actual_json": "...", - "expected_json": "..." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "json_scheme_validation", - { - actual_json: "...", - expected_json: "..." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `actual_json` | `string` | The JSON to be validated. | -| | `expected_json` | `string` | The expected JSON structure. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `Output Format`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/length-between.mdx b/src/pages/docs/evaluation/builtin/length-between.mdx deleted file mode 100644 index 607007ec..00000000 --- a/src/pages/docs/evaluation/builtin/length-between.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Length Between: Built-in Evaluation" -description: "Checks if the text length is between specified min and max values" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="length_between", - inputs={ - "text": "Hello, this is a sample text." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "length_between", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/length-greater-than.mdx b/src/pages/docs/evaluation/builtin/length-greater-than.mdx deleted file mode 100644 index 3283232e..00000000 --- a/src/pages/docs/evaluation/builtin/length-greater-than.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Length Greater Than: Built-in Evaluation" -description: "Checks if the text length is greater than a specified value" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="length_greater_than", - inputs={ - "text": "Hello, this is a sample text." 
- }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "length_greater_than", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/length-less-than.mdx b/src/pages/docs/evaluation/builtin/length-less-than.mdx deleted file mode 100644 index 93d89c38..00000000 --- a/src/pages/docs/evaluation/builtin/length-less-than.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Length Less Than: Built-in Evaluation" -description: "Checks if text length is below threshold" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="length_less_than", - inputs={ - "text": "Hello, this is a sample text." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "length_less_than", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. 
| - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/llm-function-calling.mdx b/src/pages/docs/evaluation/builtin/llm-function-calling.mdx index 634fe766..d0c807ff 100644 --- a/src/pages/docs/evaluation/builtin/llm-function-calling.mdx +++ b/src/pages/docs/evaluation/builtin/llm-function-calling.mdx @@ -61,4 +61,4 @@ Refining the model's output or adjusting the function call handling process can ### Comparing Evaluate Function Calling with Similar Evals - [**Task Completion**](/docs/evaluation/builtin/task-completion): Evaluate Function Calling assesses whether the LLM correctly identifies and formats a function/tool call, while Task Completion measures whether the model fulfilled the user's overall request accurately. -- [**Instruction Adherence**](/docs/evaluation/builtin/instruction-adherence): Evaluate Function Calling focuses on whether the correct function and parameters were identified, while Instruction Adherence evaluates whether the output follows the prompt instructions more broadly. \ No newline at end of file +- [**Prompt Adherence**](/docs/evaluation/builtin/prompt-adherence): Evaluate Function Calling focuses on whether the correct function and parameters were identified, while Prompt Adherence evaluates whether the output follows the prompt instructions more broadly. 
\ No newline at end of file diff --git a/src/pages/docs/evaluation/builtin/recall-score.mdx b/src/pages/docs/evaluation/builtin/recall-score.mdx deleted file mode 100644 index e9eaec5a..00000000 --- a/src/pages/docs/evaluation/builtin/recall-score.mdx +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Recall Score: Built-in Evaluation" -description: "Recall: Out of all ground-truth relevant chunks, what fraction was retrieved" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="recall_score", - inputs={ - "hypothesis": "Paris is the capital of France.", - "reference": "The capital of France is Paris." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "recall_score", - { - hypothesis: "Paris is the capital of France.", - reference: "The capital of France is Paris." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `hypothesis` | `string` | Retrieved chunks (array/JSON string). | -| | `reference` | `string` | Ground-truth relevant chunks (array/JSON string). | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. 
| - -**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/regex.mdx b/src/pages/docs/evaluation/builtin/regex.mdx deleted file mode 100644 index b06db46e..00000000 --- a/src/pages/docs/evaluation/builtin/regex.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Regex: Built-in Evaluation" -description: "Checks if the text matches a specified regex pattern" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="regex", - inputs={ - "text": "Hello, this is a sample text." - }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "regex", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Output Validation`, `Output Format` diff --git a/src/pages/docs/evaluation/builtin/starts-with.mdx b/src/pages/docs/evaluation/builtin/starts-with.mdx deleted file mode 100644 index 74f4d8e2..00000000 --- a/src/pages/docs/evaluation/builtin/starts-with.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Starts With: Built-in Evaluation" -description: "Checks if text begins with specific substring" ---- - - - -```python Python -result = evaluator.evaluate( - eval_templates="starts_with", - inputs={ - "text": "Hello, this is a sample text." 
- }, -) - -print(result.eval_results[0].output) -print(result.eval_results[0].reason) -``` - -```typescript JS/TS -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "starts_with", - { - text: "Hello, this is a sample text." - } -); - -console.log(result); -``` - - - -| **Input** | | | | -| ------ | --------- | ---- | ----------- | -| | **Required Input** | **Type** | **Description** | -| | `text` | `string` | The input text to be evaluated. | - -| **Output** | | | -| ------ | ----- | ----------- | -| | **Field** | **Description** | -| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | -| | **Reason** | A plain-language explanation of the verdict. | - -**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/task-completion.mdx b/src/pages/docs/evaluation/builtin/task-completion.mdx index b75bc920..1afd8364 100644 --- a/src/pages/docs/evaluation/builtin/task-completion.mdx +++ b/src/pages/docs/evaluation/builtin/task-completion.mdx @@ -67,5 +67,5 @@ If the response is evaluated as not completing the task (Failed) and you want to ### Comparing Task Completion with Similar Evals - [**Completeness**](/docs/evaluation/builtin/completeness): While Task Completion evaluates whether a response successfully accomplishes a requested task, Completeness focuses specifically on whether all required information is included. -- [**Instruction Adherence**](/docs/evaluation/builtin/instruction-adherence): Task Completion evaluates whether a response accomplishes the requested task, whereas Instruction Adherence measures how well the response follows specific instructions. +- [**Instruction Adherence**](/docs/evaluation/builtin/prompt-adherence): Task Completion evaluates whether a response accomplishes the requested task, whereas Instruction Adherence measures how well the response follows specific instructions. 
- [**Is Helpful**](/docs/evaluation/builtin/is-helpful): Task Completion focuses on successful completion of a task, while Is Helpful evaluates the overall usefulness of a response. \ No newline at end of file diff --git a/src/pages/docs/evaluation/concepts/composite-evals.mdx b/src/pages/docs/evaluation/concepts/composite-evals.mdx index a0dc2b08..8f7646b7 100644 --- a/src/pages/docs/evaluation/concepts/composite-evals.mdx +++ b/src/pages/docs/evaluation/concepts/composite-evals.mdx @@ -1,6 +1,22 @@ --- title: "Composite Evals: Combining Multiple Checks Into One Score" description: "How to bundle several eval templates into a single composite, the five aggregation functions, and when to use a safety gate vs an average." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "intermediate" +primary_question: "What is a composite eval?" +direct_answer: "A composite eval runs several child eval templates against the same row and combines them into one verdict using a chosen aggregation function. Use composites when 'good' is the combination of independent checks, or for a single safety gate across multiple hard rules." +related_concepts: + - "/docs/evaluation/concepts/eval-templates" + - "/docs/evaluation/concepts/output-types" + - "/docs/evaluation/concepts/versioning" +related_tasks: + - "/docs/evaluation/build/custom" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -9,6 +25,16 @@ A composite eval runs several child eval templates against the same row and comb A composite is just a template like any other. You apply it to a dataset, trace project, or simulation the same way. The only difference is what's inside. 
+ C1[Child eval: Helpfulness] + Row --> C2[Child eval: Groundedness] + Row --> C3[Child eval: Tone] + C1 --> Agg[Aggregation function] + C2 --> Agg + C3 --> Agg + Agg --> R[One composite verdict]`} /> + --- ## Anatomy of a composite @@ -108,9 +134,18 @@ This is the right default for production composites where you want the behaviour --- -## Next Steps +## What it isn't + +- **Composite vs a single rich-rubric LLM-As-A-Judge eval.** A composite runs N separate evals (N model calls, each with its own token cost). A single LLM-As-A-Judge eval with a multi-criterion rubric runs once. Use a composite when each criterion needs to fire independently or be reweighted; use one richer eval when the criteria are correlated and a single judge can decide. +- **Composite vs multiple bindings.** Adding three separate evals to the same dataset gives you three independent results. A composite combines them into *one* verdict. Use the former when you want per-criterion visibility on the dataset; use a composite when you want a single number to gate on. +- **Composite aggregation vs aggregate stats.** Aggregation here means combining per-row child results into a per-row composite score. Aggregate stats (pass rate, average) are computed across rows after the composite verdict exists. +- **Child axis vs child output type.** All children must share the same output type (the "axis"). You can mix templates of different eval types (Agents + LLM-As-A-Judge + Code) as long as they all return Pass/fail, or all return Scoring, or all return Choices with the same label set. + +--- + +## Related concepts - [Eval templates](/docs/evaluation/concepts/eval-templates): The composite vs single distinction. - [Output types](/docs/evaluation/concepts/output-types): The output type sets the child axis. - [Eval results](/docs/evaluation/concepts/eval-results): How composite results aggregate. -- [Create custom evals](/docs/evaluation/features/custom): Build your own composite. 
+- [Create custom evals](/docs/evaluation/build/custom): Build your own composite. diff --git a/src/pages/docs/evaluation/concepts/data-injection.mdx b/src/pages/docs/evaluation/concepts/data-injection.mdx index df3037df..a98de0bd 100644 --- a/src/pages/docs/evaluation/concepts/data-injection.mdx +++ b/src/pages/docs/evaluation/concepts/data-injection.mdx @@ -1,6 +1,22 @@ --- title: "Data Injection: Giving an Eval More Than Variables" description: "How the Context setting lets an eval see the dataset row, span attributes, the full trace tree, the conversation history, or the call transcript in addition to your mapped variables." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "intermediate" +primary_question: "What context does an eval see beyond mapped variables?" +direct_answer: "By default an eval sees only the variables you mapped. The Context setting on an Agent eval lets the judge additionally see the dataset row, full span attributes, the full trace tree, the conversation history, or the call transcript when those help the verdict." +related_concepts: + - "/docs/evaluation/concepts/eval-types" + - "/docs/evaluation/concepts/eval-templates" + - "/docs/evaluation/concepts/mcp-connectors" +related_tasks: + - "/docs/evaluation/build/custom" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -11,6 +27,17 @@ But sometimes the judgment needs more. If you're checking whether an agent answe This setting applies to [Agent evals](/docs/evaluation/concepts/eval-types#agents) and is a major reason to pick Agents over LLM-As-A-Judge. 
+ J[Judge] + Ctx[Context options layered on top] + Ctx -->|dataset row| J + Ctx -->|full span| J + Ctx -->|trace tree| J + Ctx -->|session history| J + Ctx -->|simulation call| J + J --> Out[Verdict + reason]`} /> + --- ## The six context options @@ -28,8 +55,6 @@ The Context selector on the eval create page offers six options. You can pick an `Template variables` is always on. The other five are additive: turning on `Dataset row context` doesn't replace your mapped variables, it adds the rest of the row alongside them. -{/* SCREENSHOT NEEDED: The Context dropdown on the Agents tab of the eval create page, opened to show all six options (Template variables checked by default, plus the five additional options). */} - --- ## When to use each @@ -117,9 +142,18 @@ The eval template's Context setting acts as the maximum. If a template asks for --- -## Next Steps +## What it isn't + +- **Data injection vs RAG retrieval.** Data injection gives the *judge* extra context to make a verdict. RAG retrieval gives your *application* context to answer the user. The Context setting doesn't pull from your knowledge base; for that, attach [knowledge bases](/docs/evaluation/concepts/eval-types#agents) or [MCP connectors](/docs/evaluation/concepts/mcp-connectors) to an Agent eval. +- **Context vs variables.** Variables are explicit, named, and used in `{{variable_name}}`. Context is additive structured data the judge can read but the criteria can't reference by name. Use variables for fields the criteria explicitly name; use Context for the surrounding picture. +- **Context options vs eval type.** Context only applies to Agent evals. LLM-As-A-Judge and Code evals see only the variables you mapped. +- **Context vs few-shot examples.** [Ground truth](/docs/evaluation/build/ground-truth) attaches labelled examples to the prompt as few-shot context. The Context setting changes what raw data the judge sees per row. Both can be on at once for different purposes. 
+ +--- + +## Related concepts - [Eval types](/docs/evaluation/concepts/eval-types): Context is an Agent setting; LLM-As-A-Judge and Code use only mapped variables. - [Eval templates](/docs/evaluation/concepts/eval-templates): Where the Context setting lives on a template. -- [Create custom evals](/docs/evaluation/features/custom): Configure context when authoring an Agent eval. -- [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate): Apply an eval to a dataset, trace project, or simulation. +- [Create custom evals](/docs/evaluation/build/custom): Configure context when authoring an Agent eval. +- [Evaluate via Platform & SDK](/docs/evaluation/run/in-the-ui): Apply an eval to a dataset, trace project, or simulation. diff --git a/src/pages/docs/evaluation/concepts/eval-results.mdx b/src/pages/docs/evaluation/concepts/eval-results.mdx index 9414fc6e..dc448c20 100644 --- a/src/pages/docs/evaluation/concepts/eval-results.mdx +++ b/src/pages/docs/evaluation/concepts/eval-results.mdx @@ -1,6 +1,23 @@ --- title: "Eval Results: What an Eval Returns and Where It Goes" description: "What an eval result contains, the three output types and what each one returns, and how results are stored across datasets, traces, simulations, and the SDK." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "foundational" +primary_question: "What does an eval return and where does it go?" +direct_answer: "Every eval run produces one result per row, span, call, or input. A result includes the value (verdict), reason, runtime, model, and status. Results are stored alongside the data they evaluated (dataset columns, trace span logs, simulation calls, or SDK returns), using the same schema everywhere." 
+related_concepts: + - "/docs/evaluation/concepts/output-types" + - "/docs/evaluation/concepts/eval-templates" + - "/docs/evaluation/concepts/composite-evals" +related_tasks: + - "/docs/evaluation/run/python-sdk" + - "/docs/evaluation/run/in-the-ui" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -9,6 +26,17 @@ Every evaluation produces one result per row, span, call, or input. A result tel A result has the same shape regardless of the [eval type](/docs/evaluation/concepts/eval-types) (Agents, LLM-As-A-Judge, or Code) that produced it. + R1[Per-row result] + R1 --> Fields[value + reason + runtime + model + status] + R1 --> Store{Where it lives} + Store -->|dataset| C1[Dataset column] + Store -->|trace project| C2[Span eval log] + Store -->|simulation| C3[Call score] + Store -->|SDK| C4[Returned object] + R1 --> Agg[Aggregate: pass rate / avg score / distribution]`} /> + --- ## What a result contains @@ -124,9 +152,18 @@ This is also how you resume a long run from a different process: store the eval --- -## Next Steps +## What it isn't + +- **Status `Failed` vs verdict `"Failed"`.** Status `Failed` means the eval itself errored (model timeout, code exception, invalid input). Verdict `"Failed"` means the eval ran successfully and the answer didn't pass. These are different fields and have different fixes. +- **Result vs aggregate.** A result is one row's verdict. An aggregate (pass rate, average score, distribution) is computed over many rows. The threshold field on a Scoring template only affects the aggregate; per-row results always include the label + numeric score. +- **Result vs template.** The result is what comes out. The [template](/docs/evaluation/concepts/eval-templates) is what defines what gets produced. Templates are versioned; results are not (they belong to the run that produced them). 
+- **Reason vs explanation of the verdict.** For Agent and LLM-As-A-Judge evals, the reason is the judge's own words. For Code evals, the reason is whatever your code returned (often empty). Don't trust a reason field as the source of truth without checking the verdict itself. + +--- + +## Related concepts - [Output types](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices in detail, including how scoring labels map to numeric scores. - [Eval templates](/docs/evaluation/concepts/eval-templates): Where the output type comes from and how composite evals aggregate child results. -- [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate): Run an eval and see results. -- [CI/CD pipeline](/docs/evaluation/features/cicd): Track results by version across deploys. +- [Evaluate via Platform & SDK](/docs/evaluation/run/in-the-ui): Run an eval and see results. +- [CI/CD pipeline](/docs/evaluation/run/cicd): Track results by version across deploys. diff --git a/src/pages/docs/evaluation/concepts/eval-templates.mdx b/src/pages/docs/evaluation/concepts/eval-templates.mdx index 3ed5884a..2602d57d 100644 --- a/src/pages/docs/evaluation/concepts/eval-templates.mdx +++ b/src/pages/docs/evaluation/concepts/eval-templates.mdx @@ -1,6 +1,25 @@ --- title: "Eval Templates: Built-in, Custom, Composite, and Versioned" description: "What eval templates are, the difference between built-in and custom, single vs composite, and how versioning lets you change a template without breaking running evals." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "foundational" +primary_question: "What is an eval template?" +direct_answer: "An eval template is a reusable definition of one quality check. It bundles the criteria, eval type, input variables, judge model, and output type into a versioned artifact you can apply across datasets, traces, simulations, and SDK calls." 
+related_concepts: + - "/docs/evaluation/concepts/eval-types" + - "/docs/evaluation/concepts/output-types" + - "/docs/evaluation/concepts/versioning" + - "/docs/evaluation/concepts/composite-evals" +related_tasks: + - "/docs/evaluation/build/custom" +related_reference: + - "/docs/evaluation/builtin" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -9,19 +28,30 @@ An eval template defines what to measure. It holds the criteria, the type ([Agen Templates are the unit you share. Two datasets that use Toxicity reference the same template; mappings and overrides are stored separately for each use. +|criteria + type + output type| Def[Definition] + T -->|saved at each edit| V[Versions] + T -->|applied to data| B1[Binding: dataset] + T -->|applied to data| B2[Binding: trace project] + T -->|applied to data| B3[Binding: simulation] + B1 --> R[Eval result] + B2 --> R + B3 --> R`} /> + --- ## Built-in vs custom | | Built-in | Custom | |---|---|---| -| **Authored by** | Future AGI | You or your team | +| **Authored by** | FutureAGI | You or your team | | **Where it lives** | Available in every workspace | Scoped to your workspace | | **Editable** | Read-only (you can duplicate and edit the copy) | Fully editable | -| **Coverage** | 70+ templates across quality, safety, factuality, RAG retrieval, format, bias, audio, image | Anything you can express as instructions, code, or an agent rubric | +| **Coverage** | 130+ templates across quality, safety, factuality, RAG retrieval, format, bias, audio, image | Anything you can express as instructions, code, or an agent rubric | | **Mapping** | You map your data to the template's required keys | You define the keys when you write the template | -See [Built-in evals](/docs/evaluation/builtin) for the full list. See [Create custom evals](/docs/evaluation/features/custom) for how to author your own. 
+See [Built-in evals](/docs/evaluation/builtin) for the full list. See [Create custom evals](/docs/evaluation/build/custom) for how to author your own. --- @@ -106,16 +136,25 @@ This means you can change a template's criteria, model, or settings without brea |---|---|---| | **Visible to** | Every workspace | Your workspace | | **Editable** | No (you can duplicate to make a custom copy) | Yes | -| **Updates over time** | Future AGI updates these as the platform improves | You decide when to change them | +| **Updates over time** | FutureAGI updates these as the platform improves | You decide when to change them | System templates are the built-ins. Custom templates are anything you create or duplicate. --- -## Next Steps +## What it isn't + +- **Template vs binding.** A template is the abstract definition. A binding is one application of that template to a specific dataset, trace project, or simulation. The binding stores the input mapping and any per-run overrides; the template stays unchanged. +- **Template vs eval result.** A template defines what gets measured. An [eval result](/docs/evaluation/concepts/eval-results) is one verdict for one row produced by running a template against data. +- **Template vs eval config.** "Eval config" in the SDK refers to the runtime settings (judge model, mapping, threshold) bundled with a template invocation. The template itself is just the definition. +- **Template version vs binding version.** Versioning is on the template (each save creates a new immutable snapshot). Bindings pin to whichever version was active when they were created and don't auto-upgrade. See [Versioning](/docs/evaluation/concepts/versioning). + +--- + +## Related concepts - [Built-in evals](/docs/evaluation/builtin): Full list of available templates with required keys and output types. - [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, and Code. 
- [Output types](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices in detail. -- [Create custom evals](/docs/evaluation/features/custom): Author your own template. +- [Create custom evals](/docs/evaluation/build/custom): Author your own template. - [Eval results](/docs/evaluation/concepts/eval-results): What the output of an eval run looks like and where it goes. diff --git a/src/pages/docs/evaluation/concepts/eval-types.mdx b/src/pages/docs/evaluation/concepts/eval-types.mdx index 9a9e5170..084c2ecb 100644 --- a/src/pages/docs/evaluation/concepts/eval-types.mdx +++ b/src/pages/docs/evaluation/concepts/eval-types.mdx @@ -1,14 +1,41 @@ --- title: "Eval Types: Agents, LLM-As-A-Judge, Code" -description: "The three eval types in Future AGI, what each one can do, and how to pick the right one for your check." +description: "The three eval types in FutureAGI, what each one can do, and how to pick the right one for your check." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "foundational" +primary_question: "Which eval type should I pick?" +direct_answer: "An eval type controls what the evaluator can do at run time: Agents call tools and reason iteratively, LLM-As-A-Judge runs a single templated prompt, Code runs deterministic Python or JavaScript." +related_concepts: + - "/docs/evaluation/concepts/eval-templates" + - "/docs/evaluation/concepts/output-types" + - "/docs/evaluation/concepts/judge-models" +related_tasks: + - "/docs/evaluation/build/custom" +related_reference: + - "/docs/evaluation/builtin" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About -Every eval template in Future AGI is one of three types: **Agents**, **LLM-As-A-Judge**, or **Code**. The type determines what the eval can do at run time: whether it can call tools, look things up, follow a single prompt, or run as deterministic code. 
+Every eval template in FutureAGI is one of three types: **Agents**, **LLM-As-A-Judge**, or **Code**. The type determines what the eval can do at run time: whether it can call tools, look things up, follow a single prompt, or run as deterministic code. The type is set when you create the template and shown as tabs on the create page. +|Can be computed without a model| C[Code] + Q -->|One prompt is enough| L[LLM-As-A-Judge] + Q -->|Needs tools or external lookup| A[Agents] + C --> Out[Verdict + optional reason] + L --> Out + A --> Out`} /> + | Type | What it runs | What it can use | Returns a reason | |---|---|---|---| | **Agents** | A multi-step evaluator that iterates over the input | Tools and MCP connectors, knowledge bases, internet, dataset row / span / trace / session context | Yes | @@ -57,7 +84,7 @@ What you can configure: | Setting | What it controls | |---|---| | **Messages** | The system + user message chain the model sees. Use `{{variable_name}}` for any field that gets filled at run time. | -| **Model** | Which model judges. Future AGI ships `turing_large`, `turing_small`, and `turing_flash`. You can also bring your own model with [custom models](/docs/evaluation/features/custom-models). | +| **Model** | Which model judges. FutureAGI ships `turing_large`, `turing_small`, and `turing_flash`. You can also bring your own model with [custom models](/docs/evaluation/judge-models/custom). | | **Few-shot examples** | Optional dataset of labelled examples to show the model the kind of judgment you want. | | **Template format** | Mustache-style `{{variable}}` (default) or Jinja, if you need control flow. | @@ -86,7 +113,7 @@ What you can configure: - The cost of an LLM call is not justified for a deterministic answer. - You want zero variance across runs. 
-Future AGI ships dozens of built-in Code evals so you don't have to write them yourself: Contains, Equals, Regex, Is JSON, BLEU Score, ROUGE, Embedding Similarity, Recall@K, Precision@K, NDCG@K, MRR, CLIP Score, FID Score, and more. +FutureAGI ships dozens of built-in Code evals so you don't have to write them yourself: Contains, Equals, Regex, Is JSON, BLEU Score, ROUGE, Embedding Similarity, Recall@K, Precision@K, NDCG@K, MRR, CLIP Score, FID Score, and more. --- @@ -127,10 +154,19 @@ For multimodal evals, pick a judge model that supports the modality. `turing_lar --- -## Next Steps +## What it isn't + +- **Eval type vs output type.** Eval type controls *capability* (what the evaluator can do); [output type](/docs/evaluation/concepts/output-types) controls *verdict shape* (Pass/fail, Scoring, or Choices). All three eval types can produce any of the three output types. +- **Eval type vs modality.** Type is independent of whether the input is text, image, audio, or conversation. A Code eval can validate JSON; an LLM-As-A-Judge with a multimodal model can score images. +- **Eval type vs eval template.** A template is the artifact you reuse across runs. Its type is one of several properties on the template (alongside criteria, model, output type, required keys). +- **Agents vs an LLM agent in your app.** An Agent eval is a *judge* that iterates while scoring. It is not your application's agent; it is the evaluator that scores your application's output. + +--- + +## Related concepts - [Eval templates](/docs/evaluation/concepts/eval-templates): Built-in vs custom, single vs composite, versioning. - [Output types](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices. - [Data injection](/docs/evaluation/concepts/data-injection): How to give an eval more than just `{{variables}}`. -- [Create custom evals](/docs/evaluation/features/custom): Author your own template of any type. 
+- [Create custom evals](/docs/evaluation/build/custom): Author your own template of any type. - [Judge models](/docs/evaluation/concepts/judge-models): Pick the right model for an LLM-As-A-Judge or Agent eval. diff --git a/src/pages/docs/evaluation/concepts/judge-models.mdx b/src/pages/docs/evaluation/concepts/judge-models.mdx index dd654a46..f4128bd6 100644 --- a/src/pages/docs/evaluation/concepts/judge-models.mdx +++ b/src/pages/docs/evaluation/concepts/judge-models.mdx @@ -1,6 +1,22 @@ --- title: "Judge Models: Picking the Right Model for an Eval" -description: "What a judge model is, the Future AGI built-in models, when to bring your own, and how the model interacts with each eval type." +description: "What a judge model is, the FutureAGI built-in models, when to bring your own, and how the model interacts with each eval type." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "foundational" +primary_question: "What is a judge model and which one should I use?" +direct_answer: "A judge model is the model that runs an Agent or LLM-As-A-Judge eval against your inputs. FutureAGI ships three judges (turing_large, turing_small, turing_flash) and supports bringing your own. The model is the main cost vs accuracy lever for an eval." +related_concepts: + - "/docs/evaluation/concepts/eval-types" + - "/docs/evaluation/concepts/eval-templates" +related_tasks: + - "/docs/evaluation/judge-models/futureagi" + - "/docs/evaluation/judge-models/custom" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -9,6 +25,14 @@ A judge model is the model that runs an evaluation. For an [Agent or LLM-As-A-Ju The judge model determines how accurate, how fast, and how expensive each row is to evaluate. Picking the right one is the main lever you have on the cost / accuracy trade-off for an eval. 
+ Judge[Judge model] + Criteria --> Judge + Judge --> Verdict[Verdict + reason] + Judge -.->|cost + latency| Tradeoff[Pick model for the cost/accuracy axis]`} /> + --- ## When this applies @@ -23,9 +47,9 @@ For multimodal evals (image, audio), the model also has to support that modality --- -## Future AGI built-in models +## FutureAGI built-in models -Future AGI ships three judge models tuned for evaluation. They appear in a "Future AGI Models" section at the top of every model picker. +FutureAGI ships three judge models tuned for evaluation. They appear in a "FutureAGI Models" section at the top of every model picker. | Model | Code | What it's for | |---|---|---| @@ -37,7 +61,7 @@ Future AGI ships three judge models tuned for evaluation. They appear in a "Futu ## Bring your own model -Future AGI also lets you use your own model as the judge. Add it through workspace settings (`AI Providers` for direct integrations like OpenAI, Bedrock, Vertex, Azure; or a custom endpoint for any HTTP-accessible model). Once added, it appears in the eval model picker alongside the Future AGI models. +FutureAGI also lets you use your own model as the judge. Add it through workspace settings (`AI Providers` for direct integrations like OpenAI, Bedrock, Vertex, Azure; or a custom endpoint for any HTTP-accessible model). Once added, it appears in the eval model picker alongside the FutureAGI models. This is useful when: @@ -45,7 +69,7 @@ This is useful when: - Compliance requires inference in a specific region or vendor - You already pay for a model and want to reuse it as the judge -See [Use custom models](/docs/evaluation/features/custom-models) for the setup steps and the providers supported. +See [Use custom models](/docs/evaluation/judge-models/custom) for the setup steps and the providers supported. --- @@ -74,10 +98,18 @@ A few specifics to be aware of: --- -## Next Steps +## What it isn't + +- **Judge model vs your app's model.** The judge model scores output. 
Your application's model produces output. They are different roles and usually different models. Using your own production model as the judge can cause it to mark its own answers as correct. +- **Judge model vs the model field on an [eval template](/docs/evaluation/concepts/eval-templates).** The template stores a default judge model. The per-application override on a binding can change it for one dataset or trace project without changing the template. +- **Judge model vs modality.** Multimodal evals (image, audio) need a judge that supports the modality. `turing_large` supports text + image + audio; `turing_small` and `turing_flash` support text + image. + +--- + +## Related concepts -- [Future AGI models](/docs/evaluation/features/futureagi-models): Full reference for the built-in judge models. -- [Use custom models](/docs/evaluation/features/custom-models): Bring your own model. +- [FutureAGI models](/docs/evaluation/judge-models/futureagi): Full reference for the built-in judge models. +- [Use custom models](/docs/evaluation/judge-models/custom): Bring your own model. - [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, and Code. - [Eval templates](/docs/evaluation/concepts/eval-templates): Where the model setting lives on a template. - [Eval results](/docs/evaluation/concepts/eval-results): What the judge produces after scoring. diff --git a/src/pages/docs/evaluation/concepts/mcp-connectors.mdx b/src/pages/docs/evaluation/concepts/mcp-connectors.mdx index f6bfbb18..85bc0c2a 100644 --- a/src/pages/docs/evaluation/concepts/mcp-connectors.mdx +++ b/src/pages/docs/evaluation/concepts/mcp-connectors.mdx @@ -1,6 +1,23 @@ --- title: "MCP Connectors in Evaluation: Give Your Judge Access to External Tools" description: "How MCP connectors extend the Agent eval mode so the judge can look up live data from your systems while scoring an output." 
+page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "intermediate" +primary_question: "What are MCP connectors in evaluation?" +direct_answer: "An MCP connector is a registered link to an MCP server. When attached to an Agent eval, the judge can call the connector's tools while scoring, looking up records in your CRM, fetching docs, querying APIs, so verdicts are grounded in live external data instead of only the inputs." +related_concepts: + - "/docs/evaluation/concepts/eval-types" + - "/docs/evaluation/concepts/data-injection" +related_tasks: + - "/docs/evaluation/build/mcp-connectors" +related_cookbooks: + - "/docs/cookbook/evaluation/eval-with-mcp-connectors" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -9,6 +26,13 @@ An MCP connector is a registered link to an [MCP](https://modelcontextprotocol.i This lets an eval verify a model's response against ground truth that lives outside the dataset row. The judge looks up a record in your CRM, fetches a knowledge-base article, queries Sentry for a related error, or hits an internal API, then uses what it found to decide. + J[Agent-mode judge] + J -->|tool call| C[MCP connector] + C -->|live data| J + J --> V[Verdict + reason + tool trace]`} /> + A concrete example: | Eval input | Connector used | What the judge does | @@ -47,11 +71,11 @@ The eval still produces the normal output shape (pass/fail, choice, score). 
The | Layer | Visible to the judge | |---|---| -| Eval inputs (dataset columns) | Yes — same as a regular eval | +| Eval inputs (dataset columns) | Yes, same as a regular eval | | Rule prompt | Yes | -| Connector tool schemas | Yes — names, descriptions, parameters | -| Connector tool results | Yes — only for tools the judge actually invokes during the run | -| Other workspace data | No — connectors are scoped to the eval's workspace | +| Connector tool schemas | Yes, names, descriptions, parameters | +| Connector tool results | Yes, only for tools the judge actually invokes during the run | +| Other workspace data | No, connectors are scoped to the eval's workspace | A connector that requires OAuth runs as the user who authenticated it. A connector that uses an API key or bearer token uses the same credentials for every eval row. @@ -81,8 +105,17 @@ The Internet tool fetches one URL at a time and does not crawl. If you need inde --- -## Next steps +## What it isn't + +- **MCP connector vs internet tool.** The internet tool fetches one public URL. An MCP connector exposes a structured set of tools backed by your private systems (CRM, ticketing, docs, monitoring). Use the internet tool for public verification; use a connector for proprietary data. +- **MCP connector vs knowledge base.** Knowledge bases are indexed documents the judge can search semantically. Connectors expose typed tool calls with parameters (e.g. `get_order(id)`). Use a KB for prose lookups; use a connector for structured queries against a live system. +- **MCP connector vs your app's tools.** The connectors attached to an eval are the *judge's* tools, not the application agent's tools. Even if your app uses the same MCP server, the eval gets its own connection and scope. +- **Eval connector vs Falcon AI connector.** Connectors are registered once in [Falcon AI Connectors](/docs/falcon-ai/features/mcp-connectors) and reused across chat, skills, and evals.
The eval doesn't define its own connector; it ticks the box to enable an existing one. + +--- + +## Related concepts -- [Configure MCP connectors for an eval](/docs/evaluation/features/mcp-connectors): UI walkthrough for creating an Agent-mode eval with connectors. +- [Configure MCP connectors for an eval](/docs/evaluation/build/mcp-connectors): UI walkthrough for creating an Agent-mode eval with connectors. - [Eval with MCP connectors cookbook](/docs/cookbook/evaluation/eval-with-mcp-connectors): End-to-end example with Linear and Notion. - [Falcon AI MCP Connectors](/docs/falcon-ai/features/mcp-connectors): How to register a connector and authenticate. diff --git a/src/pages/docs/evaluation/concepts/output-types.mdx b/src/pages/docs/evaluation/concepts/output-types.mdx index fb448a21..c91bf699 100644 --- a/src/pages/docs/evaluation/concepts/output-types.mdx +++ b/src/pages/docs/evaluation/concepts/output-types.mdx @@ -1,6 +1,22 @@ --- title: "Output Types: Pass/fail, Scoring, and Choices" description: "How to pick an output type when creating an eval, what each one returns, and how labels map to numeric scores so results stay consistent across runs." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "foundational" +primary_question: "Which output type should I pick?" +direct_answer: "Output type controls the shape of every verdict an eval returns. Pass/fail gives a binary string, Scoring returns a labelled level with a 0-1 numeric score, and Choices returns one or more category labels each marked Pass, Neutral, or Fail." +related_concepts: + - "/docs/evaluation/concepts/eval-templates" + - "/docs/evaluation/concepts/eval-results" + - "/docs/evaluation/concepts/eval-types" +related_tasks: + - "/docs/evaluation/build/custom" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -9,6 +25,15 @@ Every eval template has one output type. 
The output type determines what shape t The output type is set on the template, not at run time. Picking the right one is the second decision after [eval type](/docs/evaluation/concepts/eval-types). +|Yes or no| PF[Pass/fail] + Q -->|Graded levels with a number| S[Scoring] + Q -->|One or more category labels| C[Choices] + PF --> R1["Passed / Failed"] + S --> R2["choice + score 0-1"] + C --> R3["chosen labels"]`} /> + --- ## At a glance @@ -151,9 +176,18 @@ For Code evals, the return value of your function determines the verdict directl --- -## Next Steps +## What it isn't + +- **Output type vs eval type.** [Eval type](/docs/evaluation/concepts/eval-types) is the *engine* (Agents, LLM-As-A-Judge, Code). Output type is the *verdict shape*. All three engines can produce any of the three output types (with one Code-specific nuance: Code returns the verdict directly from your function's return value). +- **Scoring vs raw numeric output.** Scoring asks the judge to pick a labelled level, which then maps to a 0-1 number. It is not an open-ended "give me any number". If you need continuous numeric output (BLEU, ROUGE, embedding similarity, latency), use a [Code eval](/docs/evaluation/concepts/eval-types#code). +- **Pass threshold vs verdict.** For Scoring, the threshold turns the numeric score into pass/fail for aggregation. The per-row verdict is still the label + score; the threshold is only used at the aggregate level. +- **Choices vs Pass/fail.** Both can produce binary "pass" / "fail" aggregates, but Choices preserves the *category* label per row. Use Choices when you need to know which type of pass or fail it was (e.g. tone = `Formal` vs `Casual` vs `Neutral`). + +--- + +## Related concepts - [Eval types](/docs/evaluation/concepts/eval-types): Pick the type before the output type. - [Eval templates](/docs/evaluation/concepts/eval-templates): Where the output type is set.
- [Eval results](/docs/evaluation/concepts/eval-results): What the result looks like once an eval runs. -- [Create custom evals](/docs/evaluation/features/custom): Author a template and configure its output type. +- [Create custom evals](/docs/evaluation/build/custom): Author a template and configure its output type. diff --git a/src/pages/docs/evaluation/concepts/understanding-evaluation.mdx b/src/pages/docs/evaluation/concepts/understanding-evaluation.mdx deleted file mode 100644 index 7b8f40d0..00000000 --- a/src/pages/docs/evaluation/concepts/understanding-evaluation.mdx +++ /dev/null @@ -1,65 +0,0 @@ ---- -title: "Understanding Evaluation in Future AGI" -description: "How evaluation works in Future AGI: templates, types, output, and where you can run them across datasets, traces, simulations, experiments, and the SDK." ---- - -## About - -Evaluation is how you check whether your AI is doing the right thing. You define a check once, the platform runs it against your data, and every row gets a verdict and a reason. You can review individual rows, watch the aggregate over time, or fail a CI run when scores drop. - -Three pieces work together: an [eval template](/docs/evaluation/concepts/eval-templates) that defines what to measure, an [eval type](/docs/evaluation/concepts/eval-types) that determines how it runs (Agents, LLM-As-A-Judge, or Code), and a [result](/docs/evaluation/concepts/eval-results) that records the outcome. Pick a template, map your data, run. - ---- - -## How it works - -1. **Pick or create a template.** Choose from 70+ built-in templates (Toxicity, Groundedness, Tone, Task Completion, BLEU, ROUGE, ...) or create a custom one. The template's [type](/docs/evaluation/concepts/eval-types) determines what it can do at run time: - - **Agents** can call tools and MCP connectors, search a knowledge base, use the internet, and reason over multiple steps. - - **LLM-As-A-Judge** runs one templated prompt against a model. 
- - **Code** runs Python or JavaScript with no model call. - -2. **Map your data.** Each template declares the input variables it expects, written as `{{variable_name}}` in the criteria. You map your actual data to those variables. On a dataset, you pick columns. On a trace project, you pick span attributes. The template stays the same; only the mapping changes. - -3. **Configure run settings.** When applying the template you can override the model, change agent settings (mode, tools, knowledge bases, internet access, what context to inject), turn on error localization, and decide whether to add a reason column. - -4. **Run.** Future AGI processes every row in parallel. Each row gets a value, a reason, runtime, and the model that produced the verdict. - -5. **Review.** Results show as new columns in the dataset, eval logs on traces, scores on simulation calls, or returned values from the SDK. Aggregates (pass rate, average score, distributions) appear automatically. - ---- - -## Where evals run - -The same templates work across every surface in Future AGI. Pick the surface that matches what you want to evaluate. 
- -| Surface | What you evaluate | When to use | -|---|---|---| -| **Dataset** | Every row in a structured dataset | Offline evaluation, batch quality checks, regression suites | -| **Eval Playground** | A single ad-hoc input you type or paste | Quick sanity check while authoring a template | -| **Trace project** | Spans, traces, or sessions captured from your AI app | Live and historical evaluation of production traffic | -| **Simulation** | Calls produced by a simulated agent run | Pre-production testing of agents and prompts | -| **Experiment** | Variant outputs side by side (different prompts or models) | A/B comparison of prompt or model changes | -| **CI/CD pipeline** | Eval pass rates per code version | Gate deploys on quality | -| **SDK** | Anything you can pass to a Python or TypeScript function | Integrate evaluation into your own scripts and pipelines | - -Using the same template across surfaces keeps results directly comparable without redefining your quality criteria each time. - ---- - -## Key concepts - -- [**Eval types**](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, and Code. The type determines what a template can do. -- [**Eval templates**](/docs/evaluation/concepts/eval-templates): The shareable definition of a check. Built-in or custom, single or composite, versioned. -- [**Output types**](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices. What the eval returns. -- [**Judge models**](/docs/evaluation/concepts/judge-models): Which model judges, when applicable. -- [**Data injection**](/docs/evaluation/concepts/data-injection): What context an eval gets in addition to your `{{variables}}`. -- [**Eval results**](/docs/evaluation/concepts/eval-results): The format of the verdict, reason, and aggregates. - ---- - -## Next Steps - -- [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate): Run your first eval. 
-- [Built-in evals](/docs/evaluation/builtin): 70+ templates across quality, safety, factuality, RAG, audio, and image. -- [Create custom evals](/docs/evaluation/features/custom): Define your own criteria in any of the three types. -- [Test playground](/docs/evaluation/features/test-playground): Try an eval against a row, span, simulation, or custom input before committing it to a dataset. diff --git a/src/pages/docs/evaluation/concepts/versioning.mdx b/src/pages/docs/evaluation/concepts/versioning.mdx index dd0d6636..e40dca16 100644 --- a/src/pages/docs/evaluation/concepts/versioning.mdx +++ b/src/pages/docs/evaluation/concepts/versioning.mdx @@ -1,6 +1,22 @@ --- title: "Versioning: Changing a Template Without Breaking Running Evals" -description: "How eval template versions work in Future AGI, what each version stores, and how to use Set as Default and Restore Version to manage changes safely." +description: "How eval template versions work in FutureAGI, what each version stores, and how to use Set as Default and Restore Version to manage changes safely." +page_type: "concept" +diataxis: "explanation" +concept_family: "evaluation" +concept_level: "intermediate" +primary_question: "How does template versioning work?" +direct_answer: "Every save of an eval template creates a new immutable version. Existing applications keep using whichever version they pinned to; new applications get the default version. You can edit a template freely without breaking running evals." +related_concepts: + - "/docs/evaluation/concepts/eval-templates" + - "/docs/evaluation/concepts/composite-evals" + - "/docs/evaluation/concepts/eval-results" +related_tasks: + - "/docs/evaluation/build/custom" +has_diagram: true +diagram_type: "mermaid-flowchart" +last_diagram_reviewed: "2026-05-21" +schema_type: "TechArticle" --- ## About @@ -9,6 +25,13 @@ Eval templates are versioned. 
Every save creates a new immutable snapshot, and o This means you can edit a template's criteria, model, or settings without breaking anything that's already running. The change rolls out to new uses; the old uses stay on the version they started on. +|save| V2[V2] + V2 -->|save| V3[V3 default] + B1[Existing binding: dataset A] -.pinned.-> V1 + B2[Existing binding: dataset B] -.pinned.-> V2 + B3[New binding] -.picks default.-> V3`} /> + --- ## What a version captures @@ -126,9 +149,18 @@ If you change a mapping, you don't need a new version of the template. If you ch --- -## Next Steps +## What it isn't + +- **Version vs binding pin.** Versions live on the template. The pin lives on the binding (the application of the template to a dataset, trace project, or simulation). Editing a template doesn't move existing pins; setting a new default doesn't either. You have to re-pin a binding explicitly to upgrade it. +- **Restore Version vs reset.** Restoring an old version *creates a new version* with the old content. The version history stays intact: you can still see what V3 was after restoring V2. This is the safe "rollback" pattern. +- **Versioning vs result history.** Eval results are not part of the template's version history. Results live on the data they evaluated (datasets, traces, simulations). They are not affected by template version changes. +- **Version vs mapping change.** Versions cover the template definition (criteria, model, output type, required keys). They do not cover the input mapping or per-run overrides; those are stored on the binding and don't need a new version to change. + +--- + +## Related concepts - [Eval templates](/docs/evaluation/concepts/eval-templates): The shape of a template and what versioning preserves. - [Composite evals](/docs/evaluation/concepts/composite-evals): Pinning child versions inside a composite.
- [Eval results](/docs/evaluation/concepts/eval-results): Where results live and why they're not affected by template versioning. -- [Create custom evals](/docs/evaluation/features/custom): Save creates the first version automatically. +- [Create custom evals](/docs/evaluation/build/custom): Save creates the first version automatically. diff --git a/src/pages/docs/evaluation/features/evaluate.mdx b/src/pages/docs/evaluation/features/evaluate.mdx deleted file mode 100644 index c5784a27..00000000 --- a/src/pages/docs/evaluation/features/evaluate.mdx +++ /dev/null @@ -1,339 +0,0 @@ ---- -title: "Evaluate via Platform and SDK" -description: "Apply an eval template to a dataset, trace project, simulation, or run it directly from the SDK. Covers mapping, overrides, and result review." ---- - -## About - -Once you have an [eval template](/docs/evaluation/concepts/eval-templates) (built-in or custom), you apply it to your data. The platform supports five surfaces: - -| Surface | What you evaluate | -|---|---| -| **Dataset** | Every row in a dataset. Results show as new columns. | -| **Trace project** | Spans, traces, or sessions captured from your AI app. | -| **Simulation** | Calls produced by a simulated agent run. | -| **Eval Playground** | A single ad-hoc input you type or paste. | -| **SDK** | Anything you can pass to a Python or TypeScript function. | - -The same templates work across all five. The only thing that differs is the mapping: on a dataset you map to columns, on a trace you map to span attributes, on a simulation you map to call fields. The verdict format stays the same. - -This page covers the dataset and SDK paths in detail. For the others see [Test playground](/docs/evaluation/features/test-playground), [Trace evaluation](/docs/observe/features/evals), and [Simulation evaluation](/docs/quickstart/running-evals-in-simulation). 
- ---- - -## When to use which surface - -- **Dataset:** Offline evaluation, batch quality checks, regression suites against a fixed set of inputs. -- **Trace project:** Live and historical evaluation of production traffic. Score what your app actually did. -- **Simulation:** Pre-production testing where you control the inputs but want to evaluate the agent's behaviour. -- **Eval Playground:** Quick sanity check while authoring a template. -- **SDK:** Integrate evaluation into your own scripts, CI pipelines, or applications. - ---- - -## Apply to a dataset (UI) - - - - - -Open the dataset you want to evaluate. If you don't have one yet, see [Create a dataset](/docs/dataset). - -![Populated dataset open in the dashboard with the Evaluate button at the top right of the data grid toolbar](/images/docs/evaluation/evaluate/dataset-page.png) - - - - - -Click **Evaluate** in the top-right of the dataset view. The eval picker drawer opens. - - - - - -Browse or search the eval list. You'll see: - -- **Built-in evals** like Toxicity, Groundedness, Tone (read-only, you can duplicate them) -- **Custom evals** authored in your workspace - -Click the eval to open its config panel. - -![Eval picker drawer with search box, tag filters, and a list of available built-in and custom evals](/images/docs/evaluation/evaluate/add-evaluation.png) - -Click an eval in the list to expand it and preview the criteria, required variables, and default settings before adding it. - - - - - -Map each variable the eval expects to a column in your dataset. - -For example, a `groundedness` eval expects `output` and `context`. Pick which dataset column maps to each. - -![Configured evals panel showing each eval mapped to dataset columns and a Run All button at the bottom](/images/docs/evaluation/evaluate/configured-evals-panel.png) - - - - - -Most settings on the template are overridable per-application. 
Common overrides: - -| Override | What it does | -|---|---| -| **Model** | Use a different judge model than the template's default. | -| **Mode** (Agent evals) | Switch between `Quick`, `Auto`, and `Agent`. | -| **Use Internet, Connectors, Knowledge Bases** (Agent evals) | Adjust agent capabilities for this dataset. | -| **Context** | Pick which [context options](/docs/evaluation/concepts/data-injection) the eval gets. | -| **Pass threshold** | Tighten or loosen the score cutoff. | -| **Error Localization** | Turn on to flag the offending field per failed row. | -| **Reason column** | Add a second column with the eval's explanation per row. | - -These overrides apply only to this dataset. The template stays unchanged. Other datasets using the same template are not affected. - - - - - -Click **Add & Run**. The platform queues an eval job that processes every row. Each row gets a verdict and (if Reason column is on) a reason. - -A new column appears on the dataset for the eval result. Aggregates show in the eval summary at the top. - -![Dataset with eval results populated per row, an average score at the bottom, and a click-through detail popup showing the eval's reason for one row](/images/docs/evaluation/evaluate/dataset-with-results-running.png) - - - - - ---- - -## Apply to a dataset (SDK) - -Use this when you want to script eval runs as part of an offline pipeline. 
- - - -```python title="Python" -from fi.datasets import Dataset - -# Load the dataset -dataset = Dataset.get("my-dataset") - -# Add an eval to it (creates the column and queues the run) -dataset.add_evaluation( - name="response_groundedness", - eval_template="groundedness", - required_keys_to_column_names={ - "output": "ai_response", - "context": "source_document", - }, - run=True, -) - -# Get aggregate stats -stats = dataset.get_eval_stats() -print(stats) -``` - -```typescript title="TypeScript" -const response = await fetch( - `https://api.futureagi.com/model-hub/develops/${datasetId}/add_user_eval/`, - { - method: "POST", - headers: { - "X-Api-Key": "YOUR_API_KEY", - "X-Secret-Key": "YOUR_SECRET_KEY", - "Content-Type": "application/json", - }, - body: JSON.stringify({ - name: "response_groundedness", - template_id: "groundedness-template-uuid", - config: { - mapping: { - output: "column-uuid-for-ai-response", - context: "column-uuid-for-source", - }, - reason_column: true, - }, - run: true, - }), - }, -); - -console.log(await response.json()); -``` - -```bash title="cURL" -curl -X POST "https://api.futureagi.com/model-hub/develops/{dataset_id}/add_user_eval/" \ - -H "X-Api-Key: YOUR_API_KEY" \ - -H "X-Secret-Key: YOUR_SECRET_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "name": "response_groundedness", - "template_id": "groundedness-template-uuid", - "config": { - "mapping": { - "output": "column-uuid-for-ai-response", - "context": "column-uuid-for-source" - }, - "reason_column": true - }, - "run": true - }' -``` - - - ---- - -## Run a single eval (SDK) - -Use this when you have an input in code and want to evaluate it without a dataset. 
- - - -```python title="Python" -from fi.evals import Evaluator - -evaluator = Evaluator( - fi_api_key="YOUR_API_KEY", - fi_secret_key="YOUR_SECRET_KEY", -) - -result = evaluator.evaluate( - eval_templates="toxicity", - inputs={"output": "You're awesome at this!"}, - model_name="turing_flash", -) - -print(result.eval_results[0].output) # "Passed" -print(result.eval_results[0].reason) -``` - -```typescript title="TypeScript" -import { Evaluator } from "@future-agi/ai-evaluation"; - -const evaluator = new Evaluator(); - -const result = await evaluator.evaluate( - "toxicity", - { output: "You're awesome at this!" }, - { modelName: "turing_flash" }, -); - -console.log(result); -``` - -```python title="Async" -# For long-running or large-batch runs -result = evaluator.evaluate( - eval_templates="toxicity", - inputs={"output": "..."}, - model_name="turing_flash", - is_async=True, -) -eval_id = result.eval_results[0].eval_id - -# Fetch when ready -result = evaluator.get_eval_result(eval_id) -print(result.eval_results[0].output) -``` - -```bash title="cURL" -curl -X POST https://api.futureagi.com/sdk/api/v1/new-eval/ \ - -H "X-Api-Key: YOUR_API_KEY" \ - -H "X-Secret-Key: YOUR_SECRET_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "eval_templates": "toxicity", - "inputs": {"output": "You are awesome at this!"}, - "model_name": "turing_flash" - }' -``` - - - -The eval template can be a built-in name (`toxicity`, `groundedness`, `tone`, ...) or a custom template you created. - - -Some local metrics (like `contains`, `regex`, `bleu_score`) run client-side and don't need an API key. See [SDK reference](/docs/sdk/evals) for the full list. 
- - ---- - -## Run multiple evals at once (SDK) - -```python -results = evaluator.evaluate( - eval_templates=["toxicity", "groundedness", "tone"], - inputs={ - "output": "...", - "context": "...", - }, - model_name="turing_flash", -) - -for r in results.eval_results: - print(r.eval_template_name, r.output, r.reason) -``` - ---- - -## Reading the result - -A result has a fixed shape. The exact format of `output` depends on the eval's [output type](/docs/evaluation/concepts/output-types): - -| Output type | `output` is | -|---|---| -| **Pass/fail** | The string `"Passed"` or `"Failed"` | -| **Scoring** | An object: `{ "choice": "Good", "score": 0.7 }` | -| **Choices** | An object: `{ "choice": "Formal", "score": 1.0 }` (or a list when multi-choice) | - -```python -r = result.eval_results[0] - -r.output # the verdict -r.reason # plain-language explanation -r.runtime # seconds -r.model # the model that judged -r.eval_id # unique ID, used for async fetch -``` - -See [Eval results](/docs/evaluation/concepts/eval-results) for the full schema. - ---- - -## Common patterns - -### CI/CD gating - -Run a fixed eval suite on every pull request and fail the build when pass rate drops below threshold. See [Evaluate CI/CD pipeline](/docs/evaluation/features/cicd). - -### Compare prompts side by side - -Use [Experiments](/docs/dataset/features/experiments) to evaluate the same dataset with two different prompts and see the eval scores per variant. - -### Evaluate production traffic - -Attach evals to a [trace project](/docs/observe/features/evals) so every span captured from production gets scored automatically. - ---- - -## Next Steps - - - - Try an eval against a sample input before applying it. - - - Pre-built templates ready to apply. - - - Author your own template. - - - Read and aggregate results. - - - Gate deploys on eval scores. 
- - diff --git a/src/pages/docs/evaluation/features/futureagi-models.mdx b/src/pages/docs/evaluation/features/futureagi-models.mdx deleted file mode 100644 index b244d6d0..00000000 --- a/src/pages/docs/evaluation/features/futureagi-models.mdx +++ /dev/null @@ -1,89 +0,0 @@ ---- -title: "Use Future AGI Models for AI Evaluation and Scoring" -description: "Future AGI's proprietary judge models are trained on diverse datasets to perform accurate evaluations and score AI outputs." ---- - -## About - -When you run an evaluation, the model you choose determines how accurately and how fast each response gets scored. Future AGI provides a set of proprietary models built and optimized specifically for evaluation, not general-purpose chat or generation. - -Each model is designed for a different need. Some prioritize accuracy across complex multimodal inputs. Others are built for speed, making them suitable for real-time guardrailing or high-volume pipelines. Choosing the right model lets you balance quality and performance for your specific workload. - -All models are available in the platform UI and the SDK, and work with both built-in and custom eval templates. - ---- - -## Available models - -- **Turing Large** `turing_large`: Flagship evaluation model that delivers best-in-class accuracy across multimodal inputs (text, images, audio). Recommended when maximal precision outweighs latency constraints. - -- **Turing Small** `turing_small`: Compact variant that preserves high evaluation fidelity while lowering computational cost. Supports text and image evaluations. - -- **Turing Flash** `turing_flash`: Latency-optimised model providing high-accuracy assessments for text and image inputs with fast response times. Use for high-volume runs. 
- ---- -### Quick comparison - -| Model | Code | Inputs | Best for | Latency | -| --- | --- | --- | --- | --- | -| Turing Large | `turing_large` | Text, image, audio | Max accuracy, multimodal evals | Higher | -| Turing Small | `turing_small` | Text, image | High fidelity, lower cost | Medium | -| Turing Flash | `turing_flash` | Text, image | Fast, high-accuracy evals | Low | - ---- - -## How to - - - - - - When adding or configuring an evaluation on a dataset or run test, choose **Use Future AGI Models** and pick a model from the dropdown. - ![Use Future AGI Models in the UI](/screenshot/product/evaluation/future-agi-models/1.png) - - - - - - - Pass `model_name` in your `evaluator.evaluate()` call. Use the model code from the table above (e.g. `turing_flash`, `turing_large`). - - ```python - from fi.evals import Evaluator - - evaluator = Evaluator(fi_api_key="...", fi_secret_key="...") - result = evaluator.evaluate( - eval_templates="tone", - inputs={"input": "Your text to evaluate."}, - model_name="turing_small", # or turing_flash, turing_large - ) - ``` - - - - - ---- - -## Next Steps - - - - Run evals from the UI or SDK. - - - Define your own eval rules and choose a model to run them. - - - Combine multiple checks into a single composite score. - - - Bring your own model for evaluations. - - - Run evals automatically in your pipeline. - - - How evaluation fits into the platform. - - diff --git a/src/pages/docs/evaluation/index.mdx b/src/pages/docs/evaluation/index.mdx index fca375a2..f688f099 100644 --- a/src/pages/docs/evaluation/index.mdx +++ b/src/pages/docs/evaluation/index.mdx @@ -1,51 +1,100 @@ --- -title: "Future AGI Evaluation: Measure Prompt and Agent Quality" -description: "Measure and compare the quality of prompts and agents across datasets, simulations, and experiments using built-in or custom eval templates." +title: "Evaluation in FutureAGI" +description: "Score prompts and agents against repeatable quality criteria. 
Pick a template, map your data, run, and inspect the verdict and reason for every row, span, simulation call, or SDK input." --- ## About -Evaluation is Future AGI's quality measurement layer. It gives you a consistent, repeatable way to measure whether your prompts and agents are behaving correctly and whether changes you make improve things or introduce regressions. +Evaluation is how you check whether your AI is doing the right thing. You define a check once, FutureAGI runs it against your data, and every row gets a verdict and a reason. You can review individual rows, watch the aggregate over time, or fail a CI run when scores drop. -There are two building blocks: **eval templates** define what to measure (task completion, tone, hallucination, safety, factual accuracy, or a custom rule you write yourself), and **eval configs** define how to measure (the judge model, input mapping, and run settings). Combine them with your data and you get a score, a pass/fail result, and an optional explanation per row or call, plus aggregated summaries, KPIs, and trend data across runs. +The two pieces you work with are **eval templates** (what to measure: toxicity, groundedness, tone, a custom rule) and **eval configs** (how to measure: judge model, input mapping, run settings). Combine them with your data and you get a score, a pass/fail result, and an optional explanation per row, plus aggregated summaries and trends across runs. -Evaluations run across every surface in Future AGI: datasets, simulations, experiments, playground, replay sessions, and CI/CD pipelines. You can also run them programmatically via the SDK. Using the same templates and configs across contexts keeps results directly comparable without redefining your quality criteria each time. +FutureAGI ships 130+ built-in templates covering quality, safety, factuality, RAG retrieval, format, bias, audio, and image evaluation. 
You can also create custom templates as [Agents, LLM-As-A-Judge, or Code](/docs/evaluation/concepts/eval-types), and bundle several into a [composite eval](/docs/evaluation/concepts/composite-evals) when you want a single combined verdict. -Future AGI ships 70+ built-in templates covering quality, safety, factuality, RAG retrieval, format, bias, audio, and image evaluation. You can also create custom templates of three types ([Agents, LLM-As-A-Judge, or Code](/docs/evaluation/concepts/eval-types)) and bundle several into a [composite eval](/docs/evaluation/concepts/composite-evals) when you want a single combined verdict. +--- + +## How it works + + Run + Data --> Run + Run --> Result + Result --> Aggregate`} /> + +1. **Pick or create a template.** Choose a built-in (Toxicity, Groundedness, Tone, Task Completion, BLEU, ROUGE, ...) or [create a custom one](/docs/evaluation/build/custom). The template's [type](/docs/evaluation/concepts/eval-types) controls what it can do at run time: + - **Agents** call tools and MCP connectors, search knowledge bases, use the internet, and reason over multiple steps. + - **LLM-As-A-Judge** runs one templated prompt against a model. + - **Code** runs Python or JavaScript in a sandbox with no model call. + +2. **Map your data.** The template declares input variables as `{{variable_name}}`. You bind them to your actual data: dataset columns, span attributes, simulation call fields, or raw values from the SDK. + +3. **Configure run settings.** Override the judge model, change agent settings, turn on [error localization](/docs/evaluation/build/error-localization), or attach [ground truth examples](/docs/evaluation/build/ground-truth). -## How Evaluation Connects to Other Features +4. **Run.** Every row gets a value, a reason, runtime, and the model that produced the verdict. -- **Datasets**: Run evals across dataset rows and store scores as new columns. 
[Learn more](/docs/dataset) -- **Simulation**: Score simulated agent conversations for quality, context retention, and escalation. [Learn more](/docs/simulation) -- **Optimization**: Feed eval results into prompt optimization to improve quality automatically. [Learn more](/docs/optimization) -- **CI/CD**: Gate pull requests on eval scores to catch regressions before they ship. [Learn more](/docs/evaluation/features/cicd) -- **Error Feed**: Eval-powered scoring for every traced agent execution. [Learn more](/docs/error-feed) +5. **Review.** Results show as columns on the dataset, eval logs on traces, scores on simulation calls, or returned values from the SDK. Aggregates appear automatically. -## Getting Started +--- + +## Where evals run + +The same template works across every surface. Pick the surface that matches what you want to evaluate. + +| Surface | What you evaluate | When to use | +|---|---|---| +| **Dataset** | Every row in a structured dataset | Offline evaluation, regression suites, batch quality checks | +| **Trace project** | Spans, traces, or sessions captured from your AI app | Live and historical evaluation of production traffic | +| **Simulation** | Calls produced by a simulated agent run | Pre-production testing of agents and prompts | +| **Experiment** | Variant outputs side by side | A/B comparison of prompt or model changes | +| **Test playground** | A single ad-hoc input | Sanity check while authoring a template | +| **CI/CD pipeline** | Eval pass rates per code version | Gate deploys on quality | +| **SDK** | Anything you can pass to a Python or TypeScript function | Integrate evaluation into your own scripts | + +Using the same template across surfaces keeps results directly comparable without redefining your quality criteria each time. + +--- + +## Where to start - - Run the first eval from the UI or SDK in minutes. - - - Define your own eval rules in any of the three types. 
- - - 70+ templates across quality, safety, factuality, RAG, and more. - - - Try a template against a row, span, simulation, or custom JSON before applying it. + + Run your first eval in 5 minutes with the Python SDK. - - Anchor judges with labelled examples retrieved at run time as few-shot context. + + Apply a built-in eval to a dataset from the dashboard. - - Pinpoint which input field caused a row to fail. + + Browse 130+ built-in templates across RAG, safety, quality, and more. - - Pick the right judge model for your check. - - - Run evals automatically on every pull request. + + Define your own quality rules as Agents, LLM-As-A-Judge, or Code. + +--- + +## How evaluation connects to other features + +- **Datasets**: Run evals across dataset rows and store scores as new columns. [Learn more](/docs/dataset). +- **Observability**: Score live production traces. See [Run evals on traces](/docs/observe/features/evals). +- **Simulation**: Score simulated agent conversations. See [Run evals in simulation](/docs/quickstart/running-evals-in-simulation). +- **Experiments**: Compare prompt or model variants side by side. See [Experiments](/docs/dataset/features/experiments). +- **Optimization**: Feed eval results into prompt optimization. See [Optimization](/docs/optimization). +- **CI/CD**: Gate pull requests on eval scores. See [Run evals in CI/CD](/docs/evaluation/run/cicd). + +--- + +## Next steps by intent + +- **Want to learn the model first?** Read [Eval types](/docs/evaluation/concepts/eval-types) and [Eval templates](/docs/evaluation/concepts/eval-templates). +- **Want a quick win?** Follow the [Quickstart](/docs/quickstart/evals). +- **Want to pick the right evaluator?** Browse the [Evaluator catalog](/docs/evaluation/builtin). +- **Want to gate CI on quality?** Set up [Run evals in CI/CD](/docs/evaluation/run/cicd). +- **Want to score production traffic?** Configure [Eval tasks on traces](/docs/observe/features/evals). 
+- **Want to debug a failed verdict?** Use the [Test playground](/docs/evaluation/build/test-playground). diff --git a/src/pages/docs/evaluation/features/custom-models.mdx b/src/pages/docs/evaluation/judge-models/custom.mdx similarity index 76% rename from src/pages/docs/evaluation/features/custom-models.mdx rename to src/pages/docs/evaluation/judge-models/custom.mdx index 6562e65f..dec496e2 100644 --- a/src/pages/docs/evaluation/features/custom-models.mdx +++ b/src/pages/docs/evaluation/judge-models/custom.mdx @@ -1,11 +1,11 @@ --- -title: "Use Custom Models for AI Evaluation in Future AGI" -description: "Use your own or third-party models for evaluations in Future AGI via supported providers or a custom API endpoint with full configuration control." +title: "Use Custom Models for AI Evaluation in FutureAGI" +description: "Use your own or third-party models for evaluations in FutureAGI via supported providers or a custom API endpoint with full configuration control." --- ## About -Evaluations need a model to act as the judge: to read each response and decide whether it passes, fails, or scores within a range. Custom models let you bring your own judge instead of using Future AGI's built-in models. +Evaluations need a model to act as the judge: to read each response and decide whether it passes, fails, or scores within a range. Custom models let you bring your own judge instead of using FutureAGI's built-in models. This matters when you have a model that knows your domain better, when you need inference to stay within a specific cloud provider or region, or when you want to track evaluation costs against a model you already pay for. @@ -17,7 +17,7 @@ Two ways to connect: - **Custom endpoint**: Connect any model behind an HTTP API, including self-hosted, fine-tuned, or proxy deployments. -Learn how to define eval rules that use your model: [Create custom evals](/docs/evaluation/features/custom). 
+Learn how to define eval rules that use your model: [Create custom evals](/docs/evaluation/build/custom). --- @@ -68,14 +68,14 @@ Choose how you want to connect your model: Fill in the provider-specific authentication and options (e.g. API key, region, endpoint) in the form for your provider. - Give the model a **custom name** so you can recognise it in the model dropdown. Enter **input** and **output token cost per million tokens** so Future AGI can compute cost when running evaluations. + Give the model a **custom name** so you can recognise it in the model dropdown. Enter **input** and **output token cost per million tokens** so FutureAGI can compute cost when running evaluations. Save the model; it will appear in the model dropdown when you add or run custom evaluations. - + Connect any model behind an API endpoint: self-hosted, fine-tuned, or third-party. Use this when integrating endpoints that are not one of the supported providers. @@ -84,10 +84,10 @@ Choose how you want to connect your model: ![Add custom model](/images/custom-model/6.png) - **Model name**: a friendly identifier (e.g. `mistral-rag-prod`) so you can recognise it in selectors and reports. **API base URL**: the endpoint Future AGI will call (e.g. `https://api.my-model-server.com/v1`). Required for evaluations, RAG, and agent calls. + **Model name**: a friendly identifier (e.g. `mistral-rag-prod`) so you can recognise it in selectors and reports. **API base URL**: the endpoint FutureAGI will call (e.g. `https://api.my-model-server.com/v1`). Required for evaluations, RAG, and agent calls. - Enter **input token cost per million tokens** and **output token cost per million tokens** so Future AGI can compute cost and show usage analytics (e.g. `1.50` for input, `2.00` for output). + Enter **input token cost per million tokens** and **output token cost per million tokens** so FutureAGI can compute cost and show usage analytics (e.g. `1.50` for input, `2.00` for output). 
If your API needs extra headers or parameters (e.g. `Authorization: Bearer ...`), use **Add custom configuration** and add **Custom key** and **Custom value** pairs. Use this for auth, multi-tenant routing, or provider-specific options. @@ -108,34 +108,19 @@ Fields you may see when adding a model (from a provider or custom). **Applies to | Field | Applies to | About | Example | | --- | --- | --- | --- | -| **Model name** / **Custom name** | Both | Friendly name for the model in Future AGI; shown in selectors and reports. | `mistral-rag-prod`, `my-openai-gpt4` | +| **Model name** / **Custom name** | Both | Friendly name for the model in FutureAGI; shown in selectors and reports. | `mistral-rag-prod`, `my-openai-gpt4` | | **Input token cost per million tokens** | Both | Cost of input tokens per 1M tokens; used for cost tracking and analytics. | `1.50` | | **Output token cost per million tokens** | Both | Cost of output tokens per 1M tokens; used with input cost for total cost. | `2.00` | | **Provider-specific fields** (auth, region, model ID, etc.) | From providers | Vary by provider (e.g. API key, region). See provider tabs in Step 1. | | -| **API base URL** | Custom model | Endpoint Future AGI calls for your model (evaluations, RAG, agent calls). | `https://api.my-model-server.com/v1` | +| **API base URL** | Custom model | Endpoint FutureAGI calls for your model (evaluations, RAG, agent calls). | `https://api.my-model-server.com/v1` | | **Add custom configuration** (Custom key & value) | Custom model | Custom headers or params (e.g. auth). Key/value pairs. | **Key:** `Authorization` **Value:** `Bearer sk-...` | --- -## Next Steps - - - - Run a single eval from the UI or SDK. - - - Define eval rules and select your custom model. - - - Run multiple evals together as a group. - - - Built-in models available for evals. - - - Run evals automatically in your pipeline. - - - How evaluation fits into the platform. 
- - +## Next steps + +- [Run evals in the UI](/docs/evaluation/run/in-the-ui): apply an eval and pick your custom model from the dropdown. +- [Create custom evals](/docs/evaluation/build/custom): define eval rules that use your custom model. +- [FutureAGI models](/docs/evaluation/judge-models/futureagi): the built-in models available alongside your custom ones. +- [Composite evals](/docs/evaluation/concepts/composite-evals): bundle several checks into one verdict. +- [Run evals in CI/CD](/docs/evaluation/run/cicd): gate pull requests on eval pass rates. diff --git a/src/pages/docs/evaluation/judge-models/futureagi.mdx b/src/pages/docs/evaluation/judge-models/futureagi.mdx new file mode 100644 index 00000000..83553be2 --- /dev/null +++ b/src/pages/docs/evaluation/judge-models/futureagi.mdx @@ -0,0 +1,77 @@ +--- +title: "Use FutureAGI Models for AI Evaluation and Scoring" +description: "FutureAGI's proprietary judge models are trained on diverse datasets to perform accurate evaluations and score AI outputs." +--- + +## About + +When you run an evaluation, the model you choose determines how accurately and how fast each response gets scored. FutureAGI provides a set of proprietary models built and optimized specifically for evaluation, not general-purpose chat or generation. + +Each model is designed for a different need. Some prioritize accuracy across complex multimodal inputs. Others are built for speed, making them suitable for real-time guardrailing or high-volume pipelines. Choosing the right model lets you balance quality and performance for your specific workload. + +All models are available in the platform UI and the SDK, and work with both built-in and custom eval templates. + +--- + +## Available models + +- **Turing Large** `turing_large`: Flagship evaluation model that delivers best-in-class accuracy across multimodal inputs (text, images, audio). Recommended when maximal precision outweighs latency constraints. 
+ +- **Turing Small** `turing_small`: Compact variant that preserves high evaluation fidelity while lowering computational cost. Supports text and image evaluations. + +- **Turing Flash** `turing_flash`: Latency-optimised model providing high-accuracy assessments for text and image inputs with fast response times. Use for high-volume runs. + +--- +### Quick comparison + +| Model | Code | Inputs | Best for | Latency | +| --- | --- | --- | --- | --- | +| Turing Large | `turing_large` | Text, image, audio | Max accuracy, multimodal evals | Higher | +| Turing Small | `turing_small` | Text, image | High fidelity, lower cost | Medium | +| Turing Flash | `turing_flash` | Text, image | Fast, high-accuracy evals | Low | + +--- + +## How to + + + + + + When adding or configuring an evaluation on a dataset or run test, choose **Use FutureAGI Models** and pick a model from the dropdown. + ![Use FutureAGI Models in the UI](/screenshot/product/evaluation/future-agi-models/1.png) + + + + + + + Pass `model` in your `evaluate()` call. Use the model code from the table above (e.g. `turing_flash`, `turing_large`). + + ```python + from fi.evals import evaluate + + result = evaluate( + "tone", + input="Your text to evaluate.", + model="turing_small", # or turing_flash, turing_large + ) + + print(result.score, result.passed, result.reason) + ``` + + See [Run evals with the Python SDK](/docs/evaluation/run/python-sdk) for the full SDK reference. + + + + + +--- + +## Next steps + +- [Run evals in the UI](/docs/evaluation/run/in-the-ui): apply an eval that uses one of these models to a dataset. +- [Run evals with the Python SDK](/docs/evaluation/run/python-sdk): pass `model="turing_flash"` (or another model code) from code. +- [Use custom models](/docs/evaluation/judge-models/custom): bring your own model as the judge instead. +- [Create custom evals](/docs/evaluation/build/custom): define your own eval rules and pick a model to run them. 
+- [Judge models concept](/docs/evaluation/concepts/judge-models): how the judge model affects accuracy, cost, and latency. diff --git a/src/pages/docs/evaluation/reference/input-schema.mdx b/src/pages/docs/evaluation/reference/input-schema.mdx new file mode 100644 index 00000000..c4bef4bd --- /dev/null +++ b/src/pages/docs/evaluation/reference/input-schema.mdx @@ -0,0 +1,144 @@ +--- +title: "Evaluator input schema" +description: "The required and optional input keys each eval template expects, the standard input names across the catalog, and how mapping turns dataset columns or span attributes into eval inputs." +--- + +## About + +Every eval template declares the input keys it needs. When you apply a template to data, you map your actual data (dataset columns, span attributes, simulation call fields) to those keys. The template stays the same; the mapping is stored per-binding. + +This page is the reference for what those keys are: the standard names used across the catalog, what each one expects, and how to set them from the UI, SDK, and API. + +--- + +## Standard input keys + +Most built-in evaluators reuse a small set of canonical input names. Knowing them makes mapping faster. 
+ +| Key | What it is | Common in | +|---|---|---| +| `input` | The user's question, prompt, or query | RAG evals, agent evals, instruction-following | +| `output` | The AI's response, what's being evaluated | Nearly every eval | +| `context` | Retrieved context, source documents, or grounding material | RAG evals (Groundedness, Context Adherence) | +| `expected` / `expected_response` / `expected_text` / `expected_value` | Ground-truth reference for comparison | Statistical metrics (BLEU, ROUGE, F1), exact-match evals | +| `reference` | Reference text for metric-style evals (paired with `hypothesis`) | NLP metrics (WER, METEOR, CHRF, MAP) | +| `hypothesis` | Generated text being scored against `reference` | NLP metrics | +| `conversation` | Multi-turn conversation transcript | Customer agent evals, conversation quality | +| `system_prompt` | The system message used by the agent | Customer agent prompt conformance | +| `transcription` | Speech-to-text output | Audio evals (ASR/STT accuracy) | +| `audio` | Audio file or recording | Audio quality, TTS accuracy | +| `text` (alone) | Single text field for format/validation checks | Code-type evals (is_json, is_url, length_*) | +| `keyword` | Specific term to search for | `contains` family | +| `images` / `instruction` | Image set + the text instruction that produced them | Image evals (CLIP score, Image Instruction Adherence) | + +For each evaluator's exact required keys, see its [individual catalog page](/docs/evaluation/builtin) or hover the variable list in the eval picker. + +--- + +## Required vs optional keys + +A template can declare both required and optional keys. + +- **Required keys** must be mapped before the eval can run. The UI blocks Save until they are. +- **Optional keys** can be left unmapped; the eval handles missing values per its own rules (skip the row, return `null`, or fall back to a default). + +In the SDK, required keys correspond to required `**inputs` kwargs on `evaluate()`. 
Optional keys are simply omitted. + +```python +from fi.evals import evaluate + +# Context adherence: required = output + context. +result = evaluate( + "context_adherence", + output="Paris is the capital of France.", + context="France's capital city is Paris.", +) +``` + +--- + +## Mapping per surface + +The mapping mechanism varies by surface, but the template's required keys are the same. + +| Surface | What you map to | +|---|---| +| Dataset | Dataset column names (or UUIDs via API) | +| Trace project | Span attribute paths (e.g. `gen_ai.input`, `gen_ai.output`, `retrieval.documents`) | +| Simulation | Call fields: `transcript`, `recording`, `scenario`, individual variables | +| Test playground | Plain JSON keys you type in | +| SDK | Direct `**inputs` keyword arguments to `evaluate()` | + +The same template can be applied to all five surfaces with different mappings, and the verdict format stays identical. + +--- + +## Mapping schema in the API + +When attaching an eval to a dataset via REST, the mapping lives in `config.mapping`: + +```json +{ + "name": "response_groundedness", + "template_id": "groundedness-template-uuid", + "config": { + "mapping": { + "output": "column-uuid-for-ai-response", + "context": "column-uuid-for-source-document" + }, + "reason_column": true + }, + "run": true +} +``` + +Left side keys are the template's input keys (canonical names). Right side values are your dataset's column UUIDs (or column names in some endpoints). + +For single-eval SDK calls (`POST /sdk/api/v1/new-eval/`), the inputs go directly under `inputs`: + +```json +{ + "eval_name": "groundedness", + "inputs": { + "output": "Paris is the capital of France.", + "context": "France's capital city is Paris." + }, + "config": { "model": "turing_flash" } +} +``` + +--- + +## Input types beyond strings + +Most input keys are strings, but a few templates accept richer types. 
+ +| Type | Accepted shape | Examples | +|---|---|---| +| String | Plain text | Most evals | +| Conversation | Array of `{role, content}` messages | Customer agent evals | +| JSON object | Nested dict | `json_diff`, structured-output checks | +| File reference | URL or path to media | Audio, image, OCR evals | +| List of items | Array of strings or objects | Retrieval metrics (`precision_at_k`, `recall_at_k`), `reference` and `hypothesis` as arrays | + +For file-reference inputs, the platform downloads the file before passing it to the judge. If the URL is unreachable, the eval fails with a "media file not accessible" error. Make sure files are public or pre-authenticated. + +--- + +## Common mistakes + +| Mistake | Why it fails | Fix | +|---|---|---| +| Variable name in instructions doesn't match a mapped key | The template can't find the value at run time | Make `{{variable_name}}` in the prompt match a key in your mapping exactly | +| Mapping `output` to an empty column | Eval scores against empty text, returns trivial verdicts | Ensure the column is populated before mapping | +| Mapping a long context to a `context` key that exceeds judge token limit | Truncation, partial scoring | Trim or chunk context; use a model with larger context | +| Passing a URL to a non-media key when it auto-detects as media | "Media file not accessible" error | Rename the key (e.g. `pr_link` not `pr_url`) or pass the URL as plain text | + +--- + +## Next steps + +- [Eval templates](/docs/evaluation/concepts/eval-templates): how templates declare keys. +- [Eval result schema](/docs/evaluation/reference/result-schema): what comes back after running. +- [Run evals with the Python SDK](/docs/evaluation/run/python-sdk): passing inputs as kwargs. +- [Run evals with the API](/docs/evaluation/run/api): the exact API request body. 
diff --git a/src/pages/docs/evaluation/reference/result-schema.mdx b/src/pages/docs/evaluation/reference/result-schema.mdx new file mode 100644 index 00000000..b554beeb --- /dev/null +++ b/src/pages/docs/evaluation/reference/result-schema.mdx @@ -0,0 +1,169 @@ +--- +title: "Eval result schema" +description: "The exact fields an eval returns: output, score, reason, runtime, model. What each field contains across Pass/fail, Scoring, and Choices output types, and how to read async results." +--- + +## About + +Every eval run returns the same top-level fields per row, span, simulation call, or SDK input: the **verdict** (`output` on API, `score` + `passed` on SDK), a **reason** (plain-language explanation), how long it took (`runtime` in seconds on API, `latency_ms` in milliseconds on SDK), the **model** (which judge produced the verdict), and an **eval_id** for later retrieval. + +The verdict is the only field whose shape changes; it depends on the eval's [output type](/docs/evaluation/concepts/output-types). + +--- + +## Top-level fields + +| Field | Type | Description | +|---|---|---| +| `output` (API/UI) / `score` + `passed` (SDK) | varies | The verdict. API returns `output` (string or object). SDK returns `score` (float 0-1) plus `passed` (bool). Shape of the object verdict depends on output type (see below). | +| `reason` | string | Plain-language explanation. Empty for Code evals that don't return a reason. | +| `runtime` (API) / `latency_ms` (SDK) | float | How long the eval took for this row. API returns seconds, SDK returns milliseconds. | +| `model` | string | The judge model that produced the verdict. `null` for Code evals. | +| `status` (SDK) / `eval_status` (async poll) | string | SDK sync: `"completed"` or `"error"`. Async poll: `"pending"`, `"running"`, or `"completed"`. | +| `eval_id` | string (UUID) | Unique identifier. Used to fetch results in async runs. | +| `eval_name` | string | The eval template's name (e.g. `toxicity`, `groundedness`). 
| + +--- + +## Value shape by output type + +### Pass/fail + +```json +{ + "eval_id": "9c1a...", + "eval_name": "toxicity", + "output": "Passed", + "score": 1.0, + "reason": "The response is professional with no harmful content.", + "runtime": 0.84, + "model": "turing_flash" +} +``` + +The API returns `output` as the string `"Passed"` or `"Failed"`. `score` is `1.0` for pass, `0.0` for fail. The SDK additionally exposes a `passed` boolean on the result object. + +### Scoring + +```json +{ + "eval_id": "9c1a...", + "eval_name": "groundedness", + "output": { "choice": "Mostly grounded", "score": 0.7 }, + "score": 0.7, + "reason": "The response cites the source for dates but adds an unrelated detail.", + "runtime": 1.42, + "model": "turing_large" +} +``` + +`output.choice` is the label the judge picked from the choice set. `output.score` is the numeric value you assigned to that label (0-1 in 0.1 increments). On the SDK, `passed` is computed by comparing `score` to the template's pass threshold. + +### Choices + +Single-choice: + +```json +{ + "eval_id": "9c1a...", + "eval_name": "tone", + "output": { "choice": "Formal", "score": 1.0 }, + "score": 1.0, + "reason": "The response uses complete sentences and avoids contractions.", + "runtime": 0.91, + "model": "turing_flash" +} +``` + +Multi-choice (when the template sets `multi_choice: true`): + +```json +{ + "eval_id": "9c1a...", + "eval_name": "intent", + "output": { "choice": ["Friendly", "Concise"], "score": 1.0 }, + "score": 1.0, + "reason": "The response is warm and brief.", + "runtime": 0.88, + "model": "turing_flash" +} +``` + +`output.score` for Choices reflects the verdict mark on the chosen label: Pass = 1.0, Neutral = 0.5, Fail = 0.0. For multi-choice, the score is the average of the chosen labels' scores. + +--- + +## Where results are stored + +| Surface | Where the result lives | +|---|---| +| Dataset | New column on the dataset, one cell per row. Optional second column for the reason. 
| +| Trace project | Eval log on the span, trace, or session. Visible on span detail and in trace charts. | +| Simulation | On the call execution. Visible in call detail and run summary. | +| Experiment | A new column per variant in the experiment grid. | +| Test playground | Returned in the response. Not persisted. | +| SDK | Returned to the caller. For async runs, returned via `eval_id` once the run completes. | + +--- + +## Async result retrieval + +For long-running runs (large datasets, CI, batch), `Evaluator.evaluate()` accepts `is_async=True` and returns an `eval_id` immediately. The result is fetched later with `get_eval_result(eval_id)`. + +```python +import os +from fi.evals import Evaluator + +evaluator = Evaluator( + fi_api_key=os.environ["FI_API_KEY"], + fi_secret_key=os.environ["FI_SECRET_KEY"], +) + +result = evaluator.evaluate( + eval_templates="toxicity", + inputs={"output": "..."}, + model_name="turing_flash", + is_async=True, +) +eval_id = result.eval_results[0].eval_id + +# Poll later (or via REST: GET /sdk/api/v1/new-eval/?eval_id=...) +poll = evaluator.get_eval_result(eval_id) +inner = poll.get("result", {}) +if isinstance(inner, dict) and inner.get("eval_status") == "completed": + eval_data = inner["result"] + print(eval_data["output"], eval_data["reason"]) +``` + + +`is_async=True` is only available on `Evaluator.evaluate()`, not on the standalone `evaluate()` function. See the [async batch eval cookbook](/docs/cookbook/quickstart/async-batch-eval) for the polling pattern. + + +While the run is in progress, `eval_status` is `"pending"` or `"running"`. Once complete it flips to `"completed"`. + + +A status of `"error"` (SDK) means the eval itself errored. A verdict `output` of `"Failed"` means the eval ran successfully and the row didn't pass. Two different fields, two different fixes. 
+ + +--- + +## Aggregate fields + +When an eval runs across more than one row, the platform computes aggregates that appear in dataset summaries, trace eval charts, and experiment comparison views. + +| Output type | Aggregate | +|---|---| +| Pass/fail | Pass rate (percentage of rows that passed) | +| Scoring | Average score; pass rate against the template threshold; distribution across labels | +| Choices | Distribution across labels (single-choice) or label co-occurrence (multi-choice) | + +For composite evals, the aggregate is computed using the [aggregation function](/docs/evaluation/concepts/composite-evals#the-five-aggregation-functions) set on the template. + +--- + +## Next steps + +- [Output types](/docs/evaluation/concepts/output-types): the three output types and what each one returns per row. +- [Evaluator input schema](/docs/evaluation/reference/input-schema): the required input keys per eval template. +- [Score types reference](/docs/evaluation/reference/score-types): how scores and choice values map to verdicts and pass/fail. +- [Eval results](/docs/evaluation/concepts/eval-results): conceptual overview of what a result is and where it lives. diff --git a/src/pages/docs/evaluation/reference/score-types.mdx b/src/pages/docs/evaluation/reference/score-types.mdx new file mode 100644 index 00000000..393e3c65 --- /dev/null +++ b/src/pages/docs/evaluation/reference/score-types.mdx @@ -0,0 +1,197 @@ +--- +title: "Score types reference" +description: "Numeric values, label mappings, and pass/fail derivation across the three output types: pass_fail, percentage, and deterministic. The exact rules for how a verdict becomes a score and a pass." +--- + +## About + +This page is the strict reference for how an eval verdict turns into a numeric score and a pass/fail decision. The concept page on [output types](/docs/evaluation/concepts/output-types) explains when to pick each type; this page documents the exact value mappings the platform uses. 
+ +--- + +## The three output type codes + +| Code (API/SDK) | UI label | Verdict (`output`) | Score field | Pass derivation | +|---|---|---|---|---| +| `pass_fail` | **Pass/fail** | `"Passed"` or `"Failed"` | `1.0` if passed, `0.0` if failed | `output == "Passed"` | +| `percentage` | **Scoring** | `{ choice, score }` | The numeric score (0-1) you assigned to the chosen label | `score >= pass_threshold` | +| `deterministic` | **Choices** | `{ choice, score }` (or array when multi-choice) | Derived from the verdict mark on the chosen label | Chosen label's mark is `Pass` | + +--- + +## `pass_fail`: Pass/fail + +The simplest type. No label set to configure, no scores to assign. + +```json +{ "output": "Passed", "score": 1.0, "passed": true } +{ "output": "Failed", "score": 0.0, "passed": false } +``` + +| Aggregate | Computed as | +|---|---| +| Pass rate | `count(passed=true) / count(total)` | + +--- + +## `percentage`: Scoring + +You define a set of named levels and assign each a numeric score 0-1 (in 0.1 increments). You also set a pass threshold. + +**Template definition:** + +```json +{ + "output_type": "percentage", + "choice_scores": { + "Excellent": 1.0, + "Good": 0.7, + "Average": 0.5, + "Poor": 0.0 + }, + "pass_threshold": 0.5 +} +``` + +**Per-row verdict:** + +```json +{ + "output": { "choice": "Good", "score": 0.7 }, + "score": 0.7, + "passed": true +} +``` + +The judge picks the label. The platform looks up the score from `choice_scores`, then compares to `pass_threshold` to derive `passed`. 
+ +**Pass derivation:** + +| Threshold | Excellent (1.0) | Good (0.7) | Average (0.5) | Poor (0.0) | +|---|---|---|---|---| +| `0.5` | passed | passed | passed | failed | +| `0.7` | passed | passed | failed | failed | +| `0.8` | passed | failed | failed | failed | + +| Aggregate | Computed as | +|---|---| +| Average score | mean of `score` across rows | +| Pass rate | `count(score >= pass_threshold) / count(total)` | +| Label distribution | per-label row counts | + + +Scoring is choice-based, not free-form. The judge picks from your labels; it doesn't return an arbitrary number. This produces more stable scores across runs than asking the judge for a raw 0-1 value. + +For free-form numeric output (BLEU, ROUGE, embedding similarity), use a [Code eval](/docs/evaluation/concepts/eval-types#code) instead. + + +--- + +## `deterministic`: Choices + +You define a set of labels, mark each as **Pass**, **Neutral**, or **Fail**, and optionally enable multi-choice. + +**Template definition (single-choice):** + +```json +{ + "output_type": "deterministic", + "choices": [ + { "label": "Formal", "verdict": "Pass" }, + { "label": "Casual", "verdict": "Pass" }, + { "label": "Aggressive", "verdict": "Fail" }, + { "label": "Neutral", "verdict": "Neutral" } + ], + "multi_choice": false +} +``` + +**Per-row verdict:** + +```json +{ + "output": { "choice": "Formal", "score": 1.0 }, + "score": 1.0, + "passed": true +} +``` + +**Score derivation by verdict mark:** + +| Verdict mark | Score | +|---|---:| +| Pass | `1.0` | +| Neutral | `0.5` | +| Fail | `0.0` | + +**Multi-choice:** + +When `multi_choice` is `true`, the judge can pick more than one label. The row score is the average of the chosen labels' scores. + +```json +{ + "output": { "choice": ["Friendly", "Concise"], "score": 1.0 } +} +``` + +If `Friendly = Pass (1.0)` and `Concise = Pass (1.0)`, the row score is `1.0`. If one is `Pass` and one is `Neutral`, the row score is `0.75`. 
+ +| Aggregate | Computed as | +|---|---| +| Label distribution | per-label row counts | +| Pass rate | `count(chosen label marked Pass) / count(total)` | +| Co-occurrence (multi-choice) | pairwise label frequency | + +--- + +## Code eval return values + +Code evals don't pick a label; they return a value directly. The platform maps that return value into the same schema. + +| Code returns | Treated as | Notes | +|---|---|---| +| `True` / `False` | `output: "Passed"` / `"Failed"`, `score: 1.0 / 0.0` | Pass/fail output type | +| `float` in `[0, 1]` | `score: <value>`, `passed: score >= pass_threshold` | Scoring output type | +| `dict` `{score, reason}` | both fields surfaced; pass derived from score vs threshold | Richer than a bare return | +| `None` | Row is skipped | Useful when ground truth is missing | +| Other types | Error: SDK `status: "error"` | Wrap in supported return types | + +--- + +## Composite output + +Composite evals run several children and aggregate them into one final score. The composite's output type matches the **child axis** the template was configured with (Pass/fail, Scoring, or Choices). + +The score derivation depends on the aggregation function: + +| Function | Composite score | +|---|---| +| Weighted Average | `sum(child_score × weight) / sum(weights)` | +| Average | `mean(child_score)` | +| Minimum (safety gate) | `min(child_score)` | +| Maximum | `max(child_score)` | +| Pass Rate | `count(child.passed) / count(children)` | + +See [Composite evals](/docs/evaluation/concepts/composite-evals) for guidance on when to pick each function. + +--- + +## Pass threshold details + +For Scoring output type, the pass threshold: + +- Lives on the template (default `0.5`). +- Is overridable per-binding (UI: "Override pass threshold" when adding the eval to a dataset). +- Is used only for aggregation (`pass_rate`) and the `passed` boolean. Individual row scores are unaffected. +- Uses `>=` comparison: a row with `score == threshold` passes. 
+ +For Pass/fail and Choices output types, there is no threshold. The verdict itself or the chosen label's mark determines pass/fail. + +--- + +## Next steps + +- [Output types](/docs/evaluation/concepts/output-types): conceptual overview of when to pick each type. +- [Eval result schema](/docs/evaluation/reference/result-schema): the full result object shape. +- [Composite evals](/docs/evaluation/concepts/composite-evals): aggregation across multiple child evals. +- [Eval templates](/docs/evaluation/concepts/eval-templates): how to configure choice labels and thresholds. diff --git a/src/pages/docs/evaluation/run/api.mdx b/src/pages/docs/evaluation/run/api.mdx new file mode 100644 index 00000000..55fdefa5 --- /dev/null +++ b/src/pages/docs/evaluation/run/api.mdx @@ -0,0 +1,147 @@ +--- +title: "Run evals with the API" +description: "Call the FutureAGI evaluation API directly with cURL or any HTTP client. Cover the new-eval endpoint, multi-eval batches, config params, and async result retrieval." +--- + +## About + +Use the FutureAGI REST API when you need to evaluate from a language without an SDK, embed eval calls in a CI step, or build a custom integration. All endpoints return JSON and accept JSON request bodies. + +For language-specific paths see [Run evals with the Python SDK](/docs/evaluation/run/python-sdk) and [Run evals with TypeScript](/docs/evaluation/run/typescript-sdk). + +--- + +## Prerequisites + +- A FutureAGI workspace. +- API key and secret from **Settings → API Keys**. +- Any HTTP client. Examples below use cURL. + +Authentication uses two headers: + +| Header | Value | +|---|---| +| `X-Api-Key` | Your FutureAGI API key | +| `X-Secret-Key` | Your FutureAGI secret key | + +Base URL: `https://api.futureagi.com`. + +--- + +## Run a single eval + +`POST /sdk/api/v1/new-eval/` runs one eval template against the inputs you provide. 
+ +```bash +curl -X POST https://api.futureagi.com/sdk/api/v1/new-eval/ \ + -H "X-Api-Key: YOUR_API_KEY" \ + -H "X-Secret-Key: YOUR_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "eval_name": "toxicity", + "inputs": { + "output": "You are awesome at this!" + }, + "config": { + "model": "turing_flash" + } + }' +``` + +Request body fields: + +| Field | Type | Required | Notes | +|---|---|---:|---| +| `eval_name` | string | Yes | Built-in name (`toxicity`, `groundedness`, `tone`, ...) or a custom template name in your workspace. | +| `inputs` | object | Yes | Eval's required input fields as a flat object (e.g. `{"output": "...", "context": "..."}`). | +| `config` | object | No | Eval-specific config such as `model`, threshold overrides, or function-eval params. | +| `is_async` | boolean | No | Default `false`. Set to `true` to queue the eval and fetch the result later. | + +Response: + +```json +{ + "eval_id": "abc-123-...", + "eval_name": "toxicity", + "output": "Passed", + "score": 1.0, + "reason": "The response is positive and contains no harmful content.", + "runtime": 0.84, + "model": "turing_flash" +} +``` + +--- + +## Pass eval-specific config params + +Function evals (Code type) accept extra parameters in `config.params`. + +```bash +curl -X POST https://api.futureagi.com/sdk/api/v1/new-eval/ \ + -H "X-Api-Key: YOUR_API_KEY" \ + -H "X-Secret-Key: YOUR_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "eval_name": "precision_at_k", + "inputs": { + "hypothesis": "[\"A\", \"B\", \"C\"]", + "reference": "[\"A\", \"C\", \"D\"]" + }, + "config": { + "params": {"k": 2} + } + }' +``` + +See each eval's reference page for its required `inputs` keys and accepted `config` params. + +--- + +## Run an eval asynchronously + +For long-running evaluations, set `is_async: true`. The response returns an `eval_id` immediately; fetch the result when ready. 
+ +```bash +# Kick off async run +curl -X POST https://api.futureagi.com/sdk/api/v1/new-eval/ \ + -H "X-Api-Key: YOUR_API_KEY" \ + -H "X-Secret-Key: YOUR_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "eval_name": "toxicity", + "inputs": {"output": "..."}, + "config": {"model": "turing_flash"}, + "is_async": true + }' +``` + +Then fetch by `eval_id`: + +```bash +curl "https://api.futureagi.com/sdk/api/v1/new-eval/?eval_id=abc-123-..." \ + -H "X-Api-Key: YOUR_API_KEY" \ + -H "X-Secret-Key: YOUR_SECRET_KEY" +``` + +--- + +## Read the result + +The format of `output` depends on the eval's [output type](/docs/evaluation/concepts/output-types): + +| Output type | `output` value | `score` | +|---|---|---| +| **Pass/fail** | `"Passed"` or `"Failed"` | `1.0` or `0.0` | +| **Scoring** | `{ "choice": "Good", "score": 0.7 }` | numeric 0-1 | +| **Choices** | `{ "choice": "Formal", "score": 1.0 }`, or array when multi-choice | numeric 0-1 | + +See [Eval results](/docs/evaluation/concepts/eval-results) for the full schema. + +--- + +## Next steps + +- [Run evals in CI/CD](/docs/evaluation/run/cicd): gate pull requests on eval pass rates. +- [Run evals with the Python SDK](/docs/evaluation/run/python-sdk): Python-native path. +- [Eval results](/docs/evaluation/concepts/eval-results): full result schema. diff --git a/src/pages/docs/evaluation/features/cicd.mdx b/src/pages/docs/evaluation/run/cicd.mdx similarity index 95% rename from src/pages/docs/evaluation/features/cicd.mdx rename to src/pages/docs/evaluation/run/cicd.mdx index 456dd99d..93f46294 100644 --- a/src/pages/docs/evaluation/features/cicd.mdx +++ b/src/pages/docs/evaluation/run/cicd.mdx @@ -1,6 +1,6 @@ --- title: "Evaluate via CI/CD Pipeline: Automated Quality Checks" -description: "Run Future AGI evaluations in your CI/CD pipeline to assess model performance on every pull request and keep quality checks consistent before deployment." 
+description: "Run FutureAGI evaluations in your CI/CD pipeline to assess model performance on every pull request and keep quality checks consistent before deployment." --- ## About @@ -22,7 +22,7 @@ This catches regressions before they ship and gives your team a versioned histor ## Prerequisites -- A Future AGI account with API key and secret key +- A FutureAGI account with API key and secret key - A CI system that can run Python (GitHub Actions, GitLab CI, Jenkins, or any runner with Python and network access) - The `ai-evaluation` package (`pip install ai-evaluation>=0.1.7`) @@ -42,21 +42,21 @@ Set these as environment variables or in your CI's secret store. Do not commit t | Secret | Description | |---|---| -| `FI_API_KEY` | Your Future AGI API key | -| `FI_SECRET_KEY` | Your Future AGI secret key | +| `FI_API_KEY` | Your FutureAGI API key | +| `FI_SECRET_KEY` | Your FutureAGI secret key | | `PAT_GITHUB` | Personal Access Token for repository access (GitHub Actions only) | ### Required variables | Variable | Description | Default | |---|---|---| -| `PROJECT_NAME` | Future AGI project name | `Voice Agent` | +| `PROJECT_NAME` | FutureAGI project name | `Voice Agent` | | `VERSION` | Current version identifier | `v0.1.0` | | `COMPARISON_VERSIONS` | Comma-separated versions to compare against | *(empty)* | --- -## Core SDK Functions +## Core SDK functions The pipeline uses two SDK functions: `evaluate_pipeline` to submit an eval run tagged to a version, and `get_pipeline_results` to retrieve and compare results across versions. @@ -140,7 +140,7 @@ result = evaluator.get_pipeline_results( --- -## Full GitHub Actions Implementation +## Full GitHub Actions implementation ### Workflow File @@ -379,7 +379,7 @@ if __name__ == "__main__": --- -## Expected Output +## Expected output The workflow posts a comment on your PR with the current version identifier and a metrics comparison table across versions. 
@@ -398,19 +398,19 @@ The workflow posts a comment on your PR with the current version identifier and --- -## Next Steps +## Next steps - + Run a single eval from the UI or SDK. - + Define eval templates to use in your pipeline. Run multiple evals together as a group. - + Bring your own model for evaluations. diff --git a/src/pages/docs/evaluation/run/in-the-ui.mdx b/src/pages/docs/evaluation/run/in-the-ui.mdx new file mode 100644 index 00000000..cbebd930 --- /dev/null +++ b/src/pages/docs/evaluation/run/in-the-ui.mdx @@ -0,0 +1,116 @@ +--- +title: "Run evals in the UI" +description: "Apply a built-in or custom eval template to a dataset from the FutureAGI dashboard. Cover picking the eval, mapping variables, overriding settings, and reading results." +--- + +## About + +Apply any built-in or custom eval template to a dataset directly from the FutureAGI dashboard. Pick the template, map its required variables to your dataset columns, set per-run overrides if needed, and click Run. Results show as new columns on the dataset alongside an aggregate summary at the top. + +By the end of this page you'll have one or more evals running on every row in your dataset, with per-row verdicts and aggregate stats visible in the UI. + +This is the right starting point if you have a dataset already and want a quality signal without writing code. For the SDK path see [Run evals with the Python SDK](/docs/evaluation/run/python-sdk). For the API see [Run evals with the API](/docs/evaluation/run/api). For evals attached to live production traffic see [Run evals on traces](/docs/observe/features/evals). + +--- + +## Prerequisites + +- A FutureAGI workspace. +- A dataset with the columns your eval needs as inputs. See [Create a dataset](/docs/dataset). + +--- + +## Steps + + + + + +Open the dataset you want to evaluate. If you don't have one yet, see [Create a dataset](/docs/dataset). 
+ +![Populated dataset open in the dashboard with the Evaluate button at the top right of the data grid toolbar](/images/docs/evaluation/evaluate/dataset-page.png) + + + + + +Click **Evaluate** in the top-right of the dataset view. The eval picker drawer opens. + + + + + +Browse or search the eval list. You'll see: + +- **Built-in evals** like Toxicity, Groundedness, Tone (read-only, you can duplicate them) +- **Custom evals** authored in your workspace + +Click the eval to open its config panel. + +![Eval picker drawer with search box, tag filters, and a list of available built-in and custom evals](/images/docs/evaluation/evaluate/add-evaluation.png) + +Click an eval in the list to expand it and preview the criteria, required variables, and default settings before adding it. + + + + + +Map each variable the eval expects to a column in your dataset. + +For example, a `groundedness` eval expects `output` and `context`. Pick which dataset column maps to each. + +![Configured evals panel showing each eval mapped to dataset columns and a Run All button at the bottom](/images/docs/evaluation/evaluate/configured-evals-panel.png) + + + + + +Most settings on the template are overridable per-application. Common overrides: + +| Override | What it does | +|---|---| +| **Model** | Use a different judge model than the template's default. | +| **Mode** (Agent evals) | Switch between `Quick`, `Auto`, and `Agent`. | +| **Use Internet, Connectors, Knowledge Bases** (Agent evals) | Adjust agent capabilities for this dataset. | +| **Context** | Pick which [context options](/docs/evaluation/concepts/data-injection) the eval gets. | +| **Pass threshold** | Tighten or loosen the score cutoff. | +| **Error Localization** | Turn on to flag the offending field per failed row. See [Enable error localization](/docs/evaluation/build/error-localization). | +| **Reason column** | Add a second column with the eval's explanation per row. | + +These overrides apply only to this dataset. 
The template stays unchanged. Other datasets using the same template are not affected. + + + + + +Click **Add & Run**. FutureAGI queues an eval job that processes every row. Each row gets a verdict and (if Reason column is on) a reason. + +A new column appears on the dataset for the eval result. Aggregates show in the eval summary at the top. + +![Dataset with eval results populated per row, an average score at the bottom, and a click-through detail popup showing the eval's reason for one row](/images/docs/evaluation/evaluate/dataset-with-results-running.png) + + + + + +--- + +## Verify + +You should see: + +- A new column on the dataset named after the eval. +- Per-row verdict (Pass/Fail, score label, or category) and runtime. +- A reason column if you enabled it. +- An aggregate (pass rate, average score, or distribution) in the eval summary at the top of the dataset view. + +If you don't see results within a minute or two, check that the variables are mapped to columns with non-empty values for the rows you expect to score. + +--- + +## Next steps + +- [Run evals with the Python SDK](/docs/evaluation/run/python-sdk): script the same eval as part of an offline pipeline. +- [Run evals in CI/CD](/docs/evaluation/run/cicd): gate pull requests on eval pass rates. +- [Run evals on traces](/docs/observe/features/evals): score live production traffic. +- [Eval results](/docs/evaluation/concepts/eval-results): full result schema and aggregate behavior. diff --git a/src/pages/docs/evaluation/run/python-sdk.mdx b/src/pages/docs/evaluation/run/python-sdk.mdx new file mode 100644 index 00000000..74e9939e --- /dev/null +++ b/src/pages/docs/evaluation/run/python-sdk.mdx @@ -0,0 +1,151 @@ +--- +title: "Run evals with the Python SDK" +description: "Score inputs from Python using fi.evals.evaluate. Cover single evals, batch evals across metrics, local-only metrics, async runs, and reading the result." 
+--- + +## About + +Score one input or batch several evals against the same input from Python with the `ai-evaluation` package. One function, `evaluate()`, covers all three [eval types](/docs/evaluation/concepts/eval-types): pass an eval name, your inputs as keyword arguments, and an optional judge model. The engine (local, cloud, or LLM-as-Judge) is picked automatically based on the metric and what you pass. + +By the end of this page you'll know how to call a single eval, batch several, run a local-only metric without an API key, write an LLM-as-Judge with custom criteria, and read the result fields. + +For TypeScript see [Run evals with TypeScript](/docs/evaluation/run/typescript-sdk). For raw HTTP see [Run evals with the API](/docs/evaluation/run/api). For the full parameter and return reference see the [`evaluate()` SDK reference](/docs/sdk/evals/evaluate). + +--- + +## Prerequisites + +- A FutureAGI workspace. +- `pip install ai-evaluation`. +- API key and secret from **Settings → API Keys**, exported as environment variables: + +```bash +export FI_API_KEY="your-api-key" +export FI_SECRET_KEY="your-secret-key" +``` + +Local-only metrics (`contains`, `regex`, `is_json`, `bleu_score`, and others) run client-side and don't require an API key. + +--- + +## Run a single eval + +```python +from fi.evals import evaluate + +result = evaluate( + "toxicity", + output="You're awesome at this!", + model="turing_flash", +) + +print(result.eval_name) # "toxicity" +print(result.score) # 1.0 +print(result.passed) # True +print(result.reason) # "The response is positive..." +print(result.latency_ms) # 840.2 +``` + +The first argument is the eval name; everything after is `**inputs` passed as keyword arguments. The `model` parameter is optional, and only used for cloud and LLM-as-Judge evals. + +--- + +## Run a local metric (no API key) + +Local metrics run in your process with no network call. They are fast, deterministic, and free. 
+ +```python +from fi.evals import evaluate + +result = evaluate("contains", output="Hello world", keyword="Hello") +print(result.score) # 1.0 +print(result.passed) # True +``` + +See the [evaluator catalog](/docs/evaluation/builtin) for the full list of local-only metrics (filter for `Code` type). + +--- + +## Run multiple evals at once + +Pass a list of eval names to batch several evaluations on the same inputs in one call. Each result is independent. + +```python +results = evaluate( + ["contains", "one_line", "is_json"], + output="Hello world", + keyword="Hello", +) + +for r in results: + print(f"{r.eval_name}: score={r.score}, passed={r.passed}") +# contains: score=1.0, passed=True +# one_line: score=1.0, passed=True +# is_json: score=0.0, passed=False +``` + + +Don't mix local and cloud metrics in the same batch call. If you pass `model="turing_flash"`, only cloud metrics will return results; local metrics will return `score=None`. Run them separately. + + +--- + +## LLM-as-Judge with custom criteria + +When no built-in fits, write your own criteria and have a model score against it. + +```python +from fi.evals import evaluate + +result = evaluate( + prompt="Rate how helpful this response is from 0 to 1. A helpful response directly answers the question with actionable steps.", + output="Here are 3 steps to fix the issue: 1. Check your config...", + query="How do I fix the login error?", + engine="llm", + model="gemini/gemini-2.5-flash", +) + +print(result.score) # 0.9 +print(result.reason) +``` + +Use `{field_name}` placeholders in the prompt to reference any kwarg you pass. + +--- + +## Async runs + +For long-running batches, set `is_async=True` on the underlying API call (advanced; most users do not need this). The SDK normally handles batching synchronously and returns results when ready. See [`evaluate()` SDK reference](/docs/sdk/evals/evaluate) for the full parameter list. + +--- + +## Read the result + +A result has a fixed shape. 
The format of `score` depends on the eval's [output type](/docs/evaluation/concepts/output-types): + +| Output type | `score` | `passed` | +|---|---|---| +| **Pass/fail** | `1.0` if passed, `0.0` if failed | `True` / `False` | +| **Scoring** | numeric 0-1 mapped from the chosen label | `True` if `score >= threshold` | +| **Choices** | numeric 0-1 derived from the chosen label's verdict mark | `True` if the chosen label is marked Pass | + +```python +r = result # single eval +r.eval_name # the eval that ran +r.score # 0.0-1.0 +r.passed # bool +r.reason # plain-language explanation +r.latency_ms # float, eval runtime +``` + +See [Eval results](/docs/evaluation/concepts/eval-results) for full schema details. + +--- + +## Next steps + +- [Run evals in CI/CD](/docs/evaluation/run/cicd): gate pull requests on eval pass rates. +- [Run evals in the UI](/docs/evaluation/run/in-the-ui): dashboard-driven path. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. +- [`evaluate()` SDK reference](/docs/sdk/evals/evaluate): full parameter and return-value reference. +- [Local evals SDK reference](/docs/sdk/evals/local): run metrics without API calls. diff --git a/src/pages/docs/evaluation/run/typescript-sdk.mdx b/src/pages/docs/evaluation/run/typescript-sdk.mdx new file mode 100644 index 00000000..59138d9c --- /dev/null +++ b/src/pages/docs/evaluation/run/typescript-sdk.mdx @@ -0,0 +1,129 @@ +--- +title: "Run evals with TypeScript" +description: "Score inputs from a Node.js or TypeScript application using @future-agi/ai-evaluation. Cover single evals, batches, custom templates, and reading the result." +--- + +## About + +Score inputs from a Node.js or TypeScript application using the `@future-agi/ai-evaluation` package. Construct an `Evaluator`, call `evaluator.evaluate()` with an eval name (or imported template class), your inputs, and an optional judge model, and read the result back as a typed object. 
+ +By the end of this page you'll know how to run one eval with a string name, switch to typed template classes for editor autocomplete, batch several evals on the same input, and read the result. + +For Python see [Run evals with the Python SDK](/docs/evaluation/run/python-sdk). For raw HTTP see [Run evals with the API](/docs/evaluation/run/api). + +--- + +## Prerequisites + +- A FutureAGI workspace. +- API key and secret from **Settings → API Keys**. +- `npm install @future-agi/ai-evaluation` (or `yarn add` / `pnpm add`). + +Set credentials in your environment: + +```bash +export FI_API_KEY="your-api-key" +export FI_SECRET_KEY="your-secret-key" +``` + +--- + +## Run a single eval + +The simplest path: pass an eval name as a string, your inputs as a plain object, and an options object with the judge model. + +```typescript +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "context_adherence", + { + context: "Honey never spoils because it has low moisture content and high acidity.", + output: "Honey doesn't spoil because its low moisture and high acidity prevent bacteria.", + }, + { modelName: "turing_flash" }, +); + +console.log(result); +``` + +The client reads `FI_API_KEY` and `FI_SECRET_KEY` from environment variables by default. Pass them explicitly if you prefer: + +```typescript +const evaluator = new Evaluator({ + fiApiKey: process.env.FI_API_KEY!, + fiSecretKey: process.env.FI_SECRET_KEY!, +}); +``` + +--- + +## Use template classes + +For type safety and editor autocomplete, import the template class directly instead of using a string name. Pass it through the object-arg signature. 
+ +```typescript +import { Evaluator, Tone } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate({ + evalTemplates: [new Tone()], + inputs: [{ + query: "Write a professional email", + response: "Dear Sir/Madam, I hope this message finds you well...", + }], + modelName: "turing_flash", +}); + +console.log(result); +``` + +Common template classes: `Tone`, `Toxicity`, `Groundedness`, `ContextAdherence`, `Hallucination`, `Pii`, `PromptInjection`, and more. Import the ones you use. + +--- + +## Run multiple evals at once + +Pass an array of template instances in `evalTemplates` to score the same input against several checks in one call. + +```typescript +import { Evaluator, Tone, Toxicity, Groundedness } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate({ + evalTemplates: [new Tone(), new Toxicity(), new Groundedness()], + inputs: [{ + query: "Was the order shipped?", + response: "Yes, your order shipped on Friday and should arrive Tuesday.", + context: "Order #1234 shipped on 2026-05-15, expected delivery 2026-05-19.", + }], + modelName: "turing_flash", +}); + +console.log(result); +``` + +--- + +## Read the result + +The result format depends on the eval's [output type](/docs/evaluation/concepts/output-types). The TS SDK returns an object containing the per-eval results plus aggregate fields. See [Eval results](/docs/evaluation/concepts/eval-results) for the canonical schema; field names in TS use camelCase. + +| Output type | result fields per eval | +|---|---| +| **Pass/fail** | `output: "Passed" \| "Failed"`, `reason`, `runtime` | +| **Scoring** | `output: { choice, score }`, `reason`, `runtime` | +| **Choices** | `output: { choice, score }` (single) or array of choices (multi-choice), `reason`, `runtime` | + +--- + +## Next steps + +- [Run evals in CI/CD](/docs/evaluation/run/cicd): gate pull requests on eval pass rates. 
+- [Run evals with the Python SDK](/docs/evaluation/run/python-sdk): Python equivalent. +- [Run evals with the API](/docs/evaluation/run/api): raw HTTP for any runtime. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. diff --git a/src/pages/docs/evaluation/troubleshooting/ci-failures.mdx b/src/pages/docs/evaluation/troubleshooting/ci-failures.mdx new file mode 100644 index 00000000..e8b53faf --- /dev/null +++ b/src/pages/docs/evaluation/troubleshooting/ci-failures.mdx @@ -0,0 +1,128 @@ +--- +title: "CI eval gate failed" +description: "Troubleshoot a failing eval gate in a CI/CD pipeline. Auth errors, threshold misses, judge variance, missing dataset, and version pin drift." +--- + +## Symptom + +Your CI job that runs evals fails. Common failure surfaces: + +- GitHub Actions / GitLab CI step exits non-zero +- The PR check shows `eval-pipeline: failed` +- The PR comment from the eval workflow says scores dropped below threshold +- The job dies with an authentication or rate-limit error before any eval runs + +--- + +## Quick checks + +1. Read the job log: does it say "auth error", "rate limit", "below threshold", or something else? +2. Did the PR change the system prompt, model, dataset, or eval template? Any of those legitimately affects scores. +3. Did the eval template version change since the last passing run? +4. Are `FI_API_KEY` and `FI_SECRET_KEY` set as CI secrets and exposed to the eval step? +5. What's the threshold the job is gating on? Is it set tight enough to flap on judge variance? + +--- + +## Causes and fixes + +| Cause | How to confirm | Fix | +|---|---|---| +| Missing or wrong API keys in CI | Error message: "401 Unauthorized" or "authentication failed" | Add `FI_API_KEY` and `FI_SECRET_KEY` as repository / org secrets; expose them as env vars in the workflow step | +| Threshold gates on noisy score | Last several runs flap between pass/fail without code changes | Widen the tolerance band around the threshold; allow a delta (e.g. 
"fail only if score drops more than 0.05") | +| Real quality regression | Score genuinely dropped because of the PR change | The eval is doing its job; either fix the change or accept the new baseline | +| Template version was promoted between runs | The binding now uses a newer version with different criteria | Pin the binding to a specific template version; don't rely on default | +| Judge model swapped | Model field in result differs from prior run | Set model explicitly in the eval call; don't let it inherit from template | +| Dataset path / ID wrong | Job log says "dataset not found" | Verify dataset ID; for CI, prefer a stable dataset name pinned in the script | +| Network timeout to FutureAGI | Sporadic failures, not consistent | Increase job timeout; retry the step; check FutureAGI status page | +| Rate limit on judge model | Error mentions 429 | Reduce CI batch size; throttle requests; ask for a higher quota | +| CI runner has no Python or missing package | `ModuleNotFoundError: No module named 'fi'` | Add `pip install ai-evaluation` to the workflow step before the eval call | + +--- + +## Diagnostic: reproduce locally + +The fastest debug path is to run the same eval script outside CI. + +```bash +# Use the same env vars CI uses +export FI_API_KEY="" +export FI_SECRET_KEY="" + +# Run the same script the CI step runs +python scripts/evaluate_pipeline.py +``` + +If it passes locally but fails in CI, the issue is environmental (auth, network, runner). If it fails locally too, the issue is in the script or the data. + +--- + +## Diagnostic: check the PR comment + +If your workflow posts a PR comment with per-eval scores (recommended pattern from [Run evals in CI/CD](/docs/evaluation/run/cicd)), the comment shows which specific eval failed and by how much. 
That tells you whether to: + +- **Fix the PR** (a real regression, the change broke quality) +- **Adjust the threshold** (flapping on noise, see judge variance below) +- **Update the baseline** (intentional behavior change, re-baseline expected scores) + +--- + +## Stabilize a flaky gate + +If the gate flaps without code changes, it's measuring noise. Pick from these: + +| Approach | Tradeoff | +|---|---| +| Raise threshold (e.g. 0.7 → 0.8) | Stricter gate; catches smaller regressions, but produces more false fails unless scores sit well clear of the new bar | +| Lower threshold (e.g. 0.7 → 0.5) | Fewer false fails, but accepts lower quality and misses smaller real regressions | +| Switch to a "delta" gate ("fail only if drop > 0.05 vs prior run") | Best for small datasets; catches regressions without flapping on noise | +| Use a Code eval where possible | Zero variance; deterministic | +| Add ground truth to the LLM judge | Stabilizes verdicts; modest implementation cost | +| Average across N runs of the same row | Slower; more cost; reliable | +| Increase the regression dataset size | More signal, less noise; only works if you have rows to add | + +For LLM-as-Judge gates specifically, see [Judge output is inconsistent](/docs/evaluation/troubleshooting/judge-variance) for the full set of stabilization techniques. + +--- + +## Common GitHub Actions setup + +A working baseline that handles the most common failure modes: + +```yaml +- name: Run eval pipeline + env: + FI_API_KEY: ${{ secrets.FI_API_KEY }} + FI_SECRET_KEY: ${{ secrets.FI_SECRET_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + pip install ai-evaluation openai + python scripts/evaluate_pipeline.py + timeout-minutes: 15 +``` + +Key points: +- Secrets are passed as env vars, not echoed +- `pip install` is in the run step so dependencies are explicit +- `timeout-minutes` prevents hung judge calls from blocking forever + +See the [CI/CD cookbook](/docs/cookbook/quickstart/cicd-eval-pipeline) for an end-to-end example with PR commenting. 
+ +--- + +## Prevent recurrence + +- Always pin template versions in CI bindings. Default promotion to a new version should never silently affect CI. +- Set the judge model explicitly in every eval call. +- Add a smoke-test step before the main eval that confirms auth, env, and one successful eval call. +- Post a PR comment with per-eval scores so failures are immediately actionable. +- Re-baseline thresholds after a major prompt or model change, not silently in CI config. + +--- + +## Next steps + +- [Run evals in CI/CD](/docs/evaluation/run/cicd): the full how-to for setting up a pipeline. +- [CI/CD eval pipeline cookbook](/docs/cookbook/quickstart/cicd-eval-pipeline): end-to-end GitHub Actions example. +- [Judge output is inconsistent](/docs/evaluation/troubleshooting/judge-variance): stabilize the judge so gates stop flapping. +- [Scores changed unexpectedly](/docs/evaluation/troubleshooting/score-drift): if the score dropped without a PR change. diff --git a/src/pages/docs/evaluation/troubleshooting/judge-variance.mdx b/src/pages/docs/evaluation/troubleshooting/judge-variance.mdx new file mode 100644 index 00000000..6d4f50e0 --- /dev/null +++ b/src/pages/docs/evaluation/troubleshooting/judge-variance.mdx @@ -0,0 +1,107 @@ +--- +title: "Judge output is inconsistent" +description: "Same input scored multiple times returns different verdicts. Causes and fixes for non-deterministic LLM judge output, including model temperature, prompt clarity, and ground truth anchoring." +--- + +## Symptom + +You score the same input five times with the same eval and get three different verdicts. Or the same template gives Pass on one row and Fail on a near-identical row in the same dataset. Or reasons cite different parts of the input each time. + +This is judge variance, a property of LLM-based eval engines. + +--- + +## Quick checks + +1. Is the eval type **LLM-As-A-Judge** or **Agents**? Code evals are deterministic; variance there indicates a code bug, not judge variance. 
+2. Run the same row through the [Test playground](/docs/evaluation/build/test-playground) 3-5 times. Note the verdict and score spread. +3. Read three different `reason` strings. Do they point at different parts of the input as the cause of the failure? +4. Is the model `turing_flash` or a smaller / faster model? Smaller models have higher variance on subjective checks. + +--- + +## Causes and fixes + +| Cause | How to confirm | Fix | +|---|---|---| +| Subjective criteria with no clear pass bar | The prompt asks "is this helpful?" without defining "helpful" | Rewrite the criteria to define the bar concretely: "Pass if the response includes at least one actionable step." | +| Model temperature is non-zero | Custom model config; built-in models default to low but non-zero | If using a custom model, set `temperature=0`; for built-ins, switch to `turing_large` which is more stable | +| Model can't reliably evaluate the modality | Image / audio / long-context evals on a small text model | Use `turing_large` for multimodal or long inputs | +| Pass threshold sits in the noise band | Many rows score in 0.4-0.6 range with `threshold=0.5` | Move the threshold to a clearer cut (0.7) and accept fewer passes; or restructure the criteria so scores cluster at the edges | +| Choice labels overlap semantically | Labels like "Good" and "Acceptable" without clear separation | Reduce to 3-4 labels with clearly distinct definitions; or switch to Pass/fail | +| Long context with the verdict signal in a small part | Judge gets distracted by surrounding content | Trim context to the relevant section; or use [data injection](/docs/evaluation/concepts/data-injection) to scope what the judge sees | +| Prompt doesn't reference the variables explicitly | Criteria says "rate the response" but the response field is named `output` | Use `{{output}}` etc.
explicitly in the criteria so the judge focuses on the right field | + +--- + +## Diagnostic: measure variance + +Run the same input several times and look at the spread. + +```python +from fi.evals import evaluate + +inputs = {"output": "Your test response here", "context": "Your test context"} +runs = [ + evaluate("your_template_name", model="turing_large", **inputs) + for _ in range(5) +] + +scores = [r.score for r in runs] +print(f"min={min(scores):.2f} max={max(scores):.2f} spread={max(scores)-min(scores):.2f}") +for r in runs: + print(f" {r.score:.2f} {r.reason[:80]}") +``` + +| Spread | What it means | Action | +|---|---|---| +| `0.0` | Deterministic for this row | Variance is happening on different rows; check criteria clarity | +| `< 0.1` | Acceptable variance | Probably fine; ensure threshold has room to absorb it | +| `0.1 – 0.3` | High variance | Refine criteria, add ground truth, or switch to a larger model | +| `> 0.3` | Unreliable | Criteria is too subjective; rewrite or use a different eval type | + +--- + +## Anchor the judge with ground truth + +The strongest fix for judge variance is showing the judge examples of how you want it to score, not just rules. + +1. Curate 20-50 examples of inputs paired with the verdict you want. +2. Attach them as [ground truth](/docs/evaluation/build/ground-truth) on the template. +3. At eval time, the platform retrieves the most similar examples and includes them in the judge's prompt as few-shot context. + +Ground truth typically reduces variance more than rewriting criteria, because it shifts the judge from "interpret rules" to "match patterns." + +--- + +## When to switch to Code instead + +If your check can be expressed as a rule, use a [Code eval](/docs/evaluation/concepts/eval-types#code). Zero variance, zero cost, faster. 
+ +Good candidates for Code: +- Format / structure: JSON validity, regex match, length range +- Exact match: response equals expected string +- Numeric similarity: BLEU, ROUGE, embedding similarity +- Retrieval metrics: precision@k, recall@k, NDCG + +Bad candidates (keep LLM judge): +- Tone, helpfulness, factuality, groundedness +- Multi-criterion judgments that need reasoning + +--- + +## Prevent recurrence + +- Pin a stable model (`turing_large` for accuracy, never a custom model without `temperature=0`). +- Add 20-50 ground-truth examples to any LLM judge that gates production decisions. +- Set thresholds away from the noise band (≥0.7 for binary decisions). +- Define every subjective term in the criteria with one-sentence operational definitions. + +--- + +## Next steps + +- [Ground truth](/docs/evaluation/build/ground-truth): attach labelled examples to stabilize the judge. +- [Eval types](/docs/evaluation/concepts/eval-types): when Code is better than LLM-as-Judge. +- [Judge models](/docs/evaluation/concepts/judge-models): pick the right model for accuracy vs cost. +- [Scores changed unexpectedly](/docs/evaluation/troubleshooting/score-drift): when scores change across runs, not within one batch. diff --git a/src/pages/docs/evaluation/troubleshooting/mapping.mdx b/src/pages/docs/evaluation/troubleshooting/mapping.mdx new file mode 100644 index 00000000..ab712c40 --- /dev/null +++ b/src/pages/docs/evaluation/troubleshooting/mapping.mdx @@ -0,0 +1,99 @@ +--- +title: "Dataset fields don't match the eval template" +description: "Validation errors when applying an eval to a dataset because required input keys aren't mapped, column types don't fit, or URLs trip media auto-detection. Causes and fixes." 
+--- + +## Symptom + +You try to apply an eval to a dataset and get one of these: + +- "Required key `context` is not mapped" +- "Media file is not accessible for `pr_url`" +- The eval runs but every row returns trivial verdicts (always Pass / always 0) +- "Variable `{{output}}` not found in inputs" +- The eval picker shows variables in red + +--- + +## Quick checks + +1. Open the eval template's detail page and read the **Required keys** section. Note every key. +2. Open the dataset's column list. Confirm there's a column for each required key (the column name doesn't have to match; you can map it). +3. Make sure required-key columns actually have values populated (not empty / null for the rows you want to score). +4. Check whether any required key name contains "url", "image", "audio", "video", or "file"; those substrings trigger media auto-detection. + +--- + +## Causes and fixes + +| Cause | How to confirm | Fix | +|---|---|---| +| Required key not mapped | The Add Evaluation form shows the variable in red or blocks Save | Pick a dataset column from the dropdown next to each required key | +| Column name and variable name confused | The variable in the criteria is `{{response}}` but the eval template's required key is `output` | Use the template's canonical key name in the mapping. The variable name in the criteria can be anything; it just must match a key in `inputs` | +| Empty column | The mapped column is sparsely populated; rows with empty values fail or return trivial scores | Filter the dataset to rows with the required field populated, or fix upstream data generation | +| Media auto-detection on a non-media URL | The variable name contains "url" and the value is a webpage URL; the platform tries to download it as media and fails | Rename the variable in the eval criteria (e.g.
`{{pr_link}}` not `{{pr_url}}`) and update the mapping | +| Wrong field type | A retrieval eval expects `reference` as an array but the column stores JSON-stringified arrays | Either parse the column to a real list type, or change the mapping to a column with the right type | +| Conversation field is plain text instead of message array | Customer agent evals need `conversation` as an array of `{role, content}` | Restructure the column or use a column that already holds the array shape | +| Mapping points at the wrong column | Two columns have similar names; you mapped to the wrong one | Re-check each mapping row by opening the dropdown | + +--- + +## Diagnostic: inspect a failing row + +In the dataset, click the row → the eval result detail. The result shows what `inputs` the eval received. + +If `inputs.output` is empty or `null`, the column has no value for that row. If it shows a URL when you expected text, media detection swapped it. If the field is present but the verdict is trivial, the criteria isn't doing what you think (test it in the [Test playground](/docs/evaluation/build/test-playground) with a known input). + +--- + +## Map dataset columns to canonical keys + +When in doubt, name your dataset columns to match the canonical eval input keys. It saves mapping time and avoids confusion. + +| If your eval needs | Name your column | +|---|---| +| The user's question / query | `input` | +| The AI's response | `output` | +| Retrieved RAG context | `context` | +| Ground truth answer | `expected` (or `expected_response`) | +| Reference for metric comparison | `reference` | +| Hypothesis being scored | `hypothesis` | +| Multi-turn dialogue | `conversation` | +| Audio file URL | `audio_file` *(not `audio_url`)* | +| Image set URL | `image_set` *(not `image_url`)* | + +See the [evaluator input schema reference](/docs/evaluation/reference/input-schema) for the full canonical key list. 
+ +--- + +## Avoid media auto-detection + +The platform auto-classifies any input key that contains the substring `url`, `file`, `image`, `audio`, or `video` as a downloadable media file. It will try to fetch the URL and pass the file content to the judge. + +For text URLs that aren't downloadable media (a webpage, a GitHub PR, a Notion page), use a key name that doesn't trigger detection: + +| Avoid | Prefer | +|---|---| +| `{{pr_url}}` | `{{pr_link}}`, `{{github_pr}}` | +| `{{doc_url}}` | `{{doc_link}}`, `{{reference_link}}` | +| `{{page_url}}` | `{{page_link}}` | + +If you genuinely need the judge to read the page content, fetch it server-side first and pass the text as `{{page_text}}` instead. + +--- + +## Prevent recurrence + +- Standardize column names to match canonical eval input keys across all datasets. +- Add a smoke-test row at the top of every dataset with known-good values for every required key. +- For evals with file-reference inputs, validate file accessibility before running the full eval. +- In CI, run the eval on 3-5 sample rows first; if mapping is wrong, you'll see it fast. + +--- + +## Next steps + +- [Evaluator input schema](/docs/evaluation/reference/input-schema): canonical key names and shapes. +- [Test playground](/docs/evaluation/build/test-playground): test mapping on one row before committing. +- [Run evals in the UI](/docs/evaluation/run/in-the-ui): the mapping step in the dashboard flow. +- [Create custom evals](/docs/evaluation/build/custom): declare required keys when authoring your own template. diff --git a/src/pages/docs/evaluation/troubleshooting/score-drift.mdx b/src/pages/docs/evaluation/troubleshooting/score-drift.mdx new file mode 100644 index 00000000..7adf66eb --- /dev/null +++ b/src/pages/docs/evaluation/troubleshooting/score-drift.mdx @@ -0,0 +1,83 @@ +--- +title: "Eval scores changed unexpectedly" +description: "Same eval, same data, different scores between runs. 
Causes, diagnostics, and fixes for score drift across reruns, template versions, and judge models." +--- + +## Symptom + +You ran an eval against the same dataset twice and got different scores. Or a CI run that previously passed now fails on unchanged code. Or aggregate pass rates moved without you changing the template. + +--- + +## Quick checks (run first, in order) + +1. Did the template version change? Check the eval's **Versions** tab. If the default is now a different version, new runs are using new criteria. +2. Did the judge model change? Open the binding and compare the **Model** field to what was used in the prior run. +3. Did the data change? Check whether a dataset row was edited, added, or deleted between runs. +4. Is the eval an LLM-as-Judge with `temperature > 0`? Non-zero temperature is the most common source of run-to-run variance. +5. Did the input mapping change? A column rename or remap can silently flip what the judge sees. + +--- + +## Causes and fixes + +| Cause | How to confirm | Fix | +|---|---|---| +| Template was edited between runs | Compare V1 vs V2 in the **Versions** tab | Pin the binding to a specific version (re-add the eval and pick the version explicitly), or [restore](/docs/evaluation/concepts/versioning#restore-version) the prior version | +| New default version was promoted | The binding shows "version updated" or the version number changed | Pin to the prior version, or accept the new default and re-baseline expected scores | +| Judge model was overridden | Compare `model` field in result vs prior result | Set the same model in the binding's overrides | +| LLM judge non-determinism | Re-run the same row 3-5 times; if scores vary, this is the cause | Use a Code eval (deterministic), lower the temperature in the judge model config, or add [ground truth examples](/docs/evaluation/build/ground-truth) to anchor the judge | +| Dataset row was edited | Check the row's edit history | Lock the dataset version used for regression checks | +|
Input mapping silently changed | Compare the binding's mapping field to the prior binding | Re-pin mapping; consider naming columns to match canonical eval keys (`output`, `context`, etc.) to make drift obvious | +| Context grew or shrank | The retrieved context column has different values per run | Check upstream retrieval; pin retrieval to the same chunk store version | +| Choice scores changed on a Scoring template | `choice_scores` differ between versions | Restore prior scoring map, or re-baseline | + +--- + +## Diagnostic checks + +**1. Compare run metadata across runs** + +In the eval results table for any row, the result object includes `model` and the binding's pinned version. Diff those fields between a passing run and a failing run. + +**2. Test the row in isolation** + +Open the row in the [Test playground](/docs/evaluation/build/test-playground) with the current template and current model. If the verdict matches the failing run but not the passing run, the template or model is the cause. If the verdict matches the passing run, something about the run environment (mapping, dataset state) changed. + +**3. Re-run a single row N times to measure judge variance** + +```python +from fi.evals import evaluate + +scores = [] +for _ in range(5): + r = evaluate( + "your_template_name", + output="...", + model="turing_flash", + ) + scores.append(r.score) + +print(f"min={min(scores)} max={max(scores)} spread={max(scores)-min(scores)}") +``` + +A spread > 0.1 on the same input means the judge is the source of variance. See [Judge output is inconsistent](/docs/evaluation/troubleshooting/judge-variance). + +--- + +## Prevent recurrence + +- Pin bindings to specific template versions for CI / regression suites. Don't rely on "default". +- Set the model explicitly in every binding, don't let it inherit. +- Lock dataset snapshots used for regression checks. 
+- For high-stakes pass/fail gates, use Code evals instead of LLM judges when the check can be expressed deterministically. +- Add [ground truth examples](/docs/evaluation/build/ground-truth) to LLM judges to anchor verdicts. + +--- + +## Next steps + +- [Judge output is inconsistent](/docs/evaluation/troubleshooting/judge-variance): diagnose run-to-run variance from the judge model itself. +- [Versioning](/docs/evaluation/concepts/versioning): how template versions work and how to pin bindings. +- [Ground truth](/docs/evaluation/build/ground-truth): anchor judges with labelled examples to reduce drift. +- [Run evals in CI/CD](/docs/evaluation/run/cicd): how to set up regression gates that don't drift silently. diff --git a/src/pages/docs/evaluation/troubleshooting/slow-runs.mdx b/src/pages/docs/evaluation/troubleshooting/slow-runs.mdx new file mode 100644 index 00000000..446cf644 --- /dev/null +++ b/src/pages/docs/evaluation/troubleshooting/slow-runs.mdx @@ -0,0 +1,109 @@ +--- +title: "Eval run is slow" +description: "Eval batches take much longer than expected. Causes and fixes for slow LLM-as-Judge runs, Agent-mode tool calls, large datasets, and judge model selection." +--- + +## Symptom + +A dataset eval that should finish in minutes is taking an hour. A 1000-row CI batch is timing out. Or a single LLM-as-Judge call against one row is taking 10+ seconds. + +--- + +## Quick checks + +1. How many rows are in the batch? `100` should finish in a few minutes; `10,000` will take longer. +2. What model is the binding using? `turing_large` is ~3x slower than `turing_flash`. +3. Is this an Agent eval with connectors or Internet enabled? Each tool call adds a round trip. +4. Is the eval running synchronously or async? Synchronous batches block on each row. +5. Is there a model rate limit triggering retries? 
+ +--- + +## Causes and fixes + +| Cause | How to confirm | Fix | +|---|---|---| +| Wrong model for the scale | Per-row latency from `runtime` field is 1-3s for cloud evals; multiply by row count for total | Switch to `turing_flash` for high-volume runs; reserve `turing_large` for high-stakes / multimodal | +| Synchronous batching | The SDK call is blocking on each row | Use `is_async=True` and fetch results by `eval_id` later; or apply the eval at the dataset level (UI Add & Run) which queues in parallel | +| Agent mode tool calls | Each tool call adds 1-3s; an Agent eval averaging 2-3 tool calls per row is ~10x slower than LLM-as-Judge | Switch to `Quick` mode in the Agent dropdown if the row doesn't need tools; trim the connector tool list to only what's needed | +| Long context | `runtime` > 5s per row on text inputs | Trim context to the relevant section; chunk large documents; use a model with larger context if the trim isn't possible | +| Multimodal eval on every row | `runtime` > 10s per row; images/audio in inputs | Sample a subset (e.g. 100 rows) instead of all 10,000; pre-resize images | +| Rate limit retries | Errors mention 429 or "rate limit"; runtime spikes intermittently | Lower batch concurrency; spread runs over time; ask for a higher quota | +| Network latency | Same eval runs faster locally than in CI | Move the CI runner to a region closer to the FutureAGI region (typically US East) | + +--- + +## Latency expectations + +Approximate per-row latency by eval type and model. Use these to set expectations and detect anomalies. 
+ +| Eval type | Model | Typical per-row latency | +|---|---|---| +| Code (local) | n/a | < 5ms | +| LLM-as-Judge | `turing_flash` | 0.5-1.5s | +| LLM-as-Judge | `turing_large` | 2-4s | +| Agent (Quick mode) | `turing_flash` | 1-3s | +| Agent (Auto mode, 1-2 tool calls) | `turing_flash` | 4-8s | +| Agent (Auto mode, 1-2 tool calls) | `turing_large` | 8-15s | +| Agent (Agent mode, 3+ tool calls) | `turing_large` | 15-40s | +| Multimodal (image input) | `turing_large` | 5-10s | +| Audio (long clip) | `turing_large` | 10-30s | + +If your `runtime` is more than 2x the typical, look at the row's input size and the model's recent error rate. + +--- + +## Diagnostic: profile a small batch + +Run 20 rows and look at runtime distribution. + +```python +from fi.evals import evaluate +import statistics + +rows = your_test_rows[:20] +results = [ + evaluate("your_template_name", model="turing_flash", **row) + for row in rows +] + +runtimes = [r.latency_ms / 1000 for r in results] +print(f"min={min(runtimes):.1f}s") +print(f"median={statistics.median(runtimes):.1f}s") +print(f"max={max(runtimes):.1f}s") +print(f"p95={sorted(runtimes)[int(len(runtimes)*0.95)]:.1f}s") +``` + +If p95 is more than 3x the median, a few outlier rows are dragging the batch. Find them by sorting results by runtime; their inputs are usually too long or contain media that's slow to fetch. + +--- + +## Scaling strategies + +| Batch size | Recommended approach | +|---|---| +| < 100 rows | Synchronous via SDK or UI is fine | +| 100 – 1,000 rows | Async via SDK (`is_async=True`) or UI Add & Run which queues in parallel | +| 1,000 – 10,000 rows | Async; sample a stratified subset for fast feedback; full run for nightly | +| > 10,000 rows | Async; partition by category/date; sample for CI; full run on schedule | + +For CI specifically, score a fixed regression set of 30-100 rows. The goal is fast feedback on PRs, not full-corpus accuracy. 
+ +--- + +## Prevent recurrence + +- Default new bindings to `turing_flash`; promote to `turing_large` only when accuracy demands it. +- Cap Agent eval `Mode` to `Quick` for high-volume use cases; reserve `Auto`/`Agent` for diagnostic deep dives. +- Limit connector tools to the ones the criteria actually needs. +- Trim context columns to the relevant section before evaluation. +- Use a stratified sample for CI gates instead of the full dataset. + +--- + +## Next steps + +- [Judge models](/docs/evaluation/concepts/judge-models): pick the right model for cost / accuracy / latency. +- [Eval types](/docs/evaluation/concepts/eval-types): how Agent / LLM-as-Judge / Code compare on cost. +- [Run evals with the Python SDK](/docs/evaluation/run/python-sdk): async patterns. +- [Data injection](/docs/evaluation/concepts/data-injection): trim what the judge sees to speed it up. diff --git a/src/pages/docs/faq.mdx b/src/pages/docs/faq.mdx index 123fbbc3..5eb7fd26 100644 --- a/src/pages/docs/faq.mdx +++ b/src/pages/docs/faq.mdx @@ -33,7 +33,7 @@ Future AGI has 70+ built-in evaluation templates covering quality, safety, factu **How do I run my first evaluation?** -See [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate) for step-by-step instructions using the UI or Python SDK. +See [Run evals in the UI](/docs/evaluation/run/in-the-ui) or [Run evals with the Python SDK](/docs/evaluation/run/python-sdk) for step-by-step instructions.
**How do I evaluate RAG applications?** diff --git a/src/pages/docs/observe/features/evals.mdx b/src/pages/docs/observe/features/evals.mdx index b6dd8992..9e0f2080 100644 --- a/src/pages/docs/observe/features/evals.mdx +++ b/src/pages/docs/observe/features/evals.mdx @@ -163,7 +163,7 @@ From the task list: --- -## Next Steps +## Next steps diff --git a/src/pages/docs/quickstart/evals.mdx b/src/pages/docs/quickstart/evals.mdx new file mode 100644 index 00000000..d178b50f --- /dev/null +++ b/src/pages/docs/quickstart/evals.mdx @@ -0,0 +1,120 @@ +--- +title: "Quickstart: Run your first eval" +description: "Score one input with FutureAGI evaluation in 5 minutes. Install ai-evaluation, set credentials, run a built-in eval, and inspect the verdict and reason." +--- + +## About + +Score one input against a built-in eval template in about 5 minutes. You will install the `ai-evaluation` package, set credentials, call `evaluate()` on a single message, and read back the verdict, reason, and latency. + +By the end of this page you will know exactly what an eval returns and how to call one again from your own code. + +--- + +## Prerequisites + +- A FutureAGI workspace. +- API key and secret from **Settings → API Keys** in the dashboard. +- Python 3.9+ (the same flow works in TypeScript and via the API; see [Run evals with TypeScript](/docs/evaluation/run/typescript-sdk) and [Run evals with the API](/docs/evaluation/run/api)). + +--- + +## Install + +```bash +pip install ai-evaluation +``` + +--- + +## Configure + +Export your credentials. Don't paste them inline; keep them in environment variables or a secret manager.
+ +```bash +export FI_API_KEY="your-api-key" +export FI_SECRET_KEY="your-secret-key" +``` + +--- + +## Run your first eval + +Save this as `first_eval.py` and run it: + +```python +from fi.evals import evaluate + +result = evaluate( + "toxicity", + output="You're awesome at this!", + model="turing_flash", +) + +print(f"verdict: {'Passed' if result.passed else 'Failed'}") +print(f"score: {result.score}") +print(f"reason: {result.reason}") +print(f"latency: {result.latency_ms:.0f}ms") +``` + +```bash +python first_eval.py +``` + +--- + +## Verify + +You should see output like: + +```text +verdict: Passed +score: 1.0 +reason: The response is a positive compliment with no toxic, harmful, or offensive content. +latency: 840ms +``` + +The `evaluate()` function reads `FI_API_KEY` and `FI_SECRET_KEY` from your environment. If you get an authentication error, double-check that they are exported in the same shell where you ran `python first_eval.py`. + +--- + +## Try another eval + +Swap the eval name to score a different dimension. Some built-ins to try: + +```python +# Check for unsupported claims against retrieved context +result = evaluate( + "groundedness", + output="The capital of France is Paris.", + context="France is a country in Western Europe. Its capital is Paris.", + model="turing_flash", +) + +# Run a local-only metric (no API key needed) +result = evaluate("contains", output="Hello world", keyword="Hello") +print(result.score) # 1.0 +print(result.passed) # True + +# Batch several evals on the same input +results = evaluate( + ["contains", "one_line", "is_json"], + output="Hello world", + keyword="Hello", +) +for r in results: + print(f"{r.eval_name}: score={r.score}, passed={r.passed}") +``` + +Browse the full list in the [evaluator catalog](/docs/evaluation/builtin). + +--- + +## Next steps + +- [Run evals in the UI](/docs/evaluation/run/in-the-ui): apply evals to a full dataset from the dashboard. 
+- [Run evals on traces](/docs/observe/features/evals): score live production traffic. +- [Run evals in CI/CD](/docs/evaluation/run/cicd): gate pull requests on eval pass rates. +- [Create custom evals](/docs/evaluation/build/custom): define your own quality rules. +- [Eval types](/docs/evaluation/concepts/eval-types): pick between Agents, LLM-As-A-Judge, and Code for your check. +- [`evaluate()` SDK reference](/docs/sdk/evals/evaluate): full parameter and return-value reference. diff --git a/src/pages/docs/quickstart/running-evals-in-simulation.mdx b/src/pages/docs/quickstart/running-evals-in-simulation.mdx index 3b8c1edc..a54835db 100644 --- a/src/pages/docs/quickstart/running-evals-in-simulation.mdx +++ b/src/pages/docs/quickstart/running-evals-in-simulation.mdx @@ -7,7 +7,7 @@ description: "Score simulated agent calls against eval templates. Pick built-in Simulation runs your AI agent against simulated customers in defined scenarios. Evals score what the agent did. Together they let you test agent quality before any real users are involved: define who the customer is, watch the agent handle the call, and see how it scored on the criteria you care about. -The same eval templates that work everywhere else in Future AGI also work here. The only difference is what you map: instead of dataset columns or span attributes, you map to the call's transcript, recording, or scenario fields. +The same eval templates that work everywhere else in FutureAGI also work here. The only difference is what you map: instead of dataset columns or span attributes, you map to the call's transcript, recording, or scenario fields. --- @@ -111,13 +111,13 @@ To add or change evals on an existing run, open the run from the Run Simulation ## Create a custom eval for simulation -If the built-in evals don't cover what you need, create a custom one. Custom evals work the same in simulation as everywhere else; see [Create custom evals](/docs/evaluation/features/custom). 
+If the built-in evals don't cover what you need, create a custom one. Custom evals work the same in simulation as everywhere else; see [Create custom evals](/docs/evaluation/build/custom). A few simulation-specific notes when authoring: - **Agent type is usually right** for calls because it can read the full transcript and reason across turns. LLM-As-A-Judge also works for simpler binary checks. - **Use `{{variable}}` placeholders** in the criteria. When you map them later, you'll point them at call fields (transcript, recording, scenario data). -- **Test before saving** with the [Test Playground](/docs/evaluation/features/test-playground) using the **Simulation** source mode to run the eval against an existing call. +- **Test before saving** with the [Test Playground](/docs/evaluation/build/test-playground) using the **Simulation** source mode to run the eval against an existing call. --- @@ -129,7 +129,7 @@ A few simulation-specific notes when authoring: --- -## Next Steps +## Next steps @@ -138,7 +138,7 @@ A few simulation-specific notes when authoring: Pick Agents, LLM-As-A-Judge, or Code. - + Try an eval on a single call before applying. diff --git a/src/pages/docs/simulation/features/prompt-simulation.mdx b/src/pages/docs/simulation/features/prompt-simulation.mdx index 6a617ba0..14120835 100644 --- a/src/pages/docs/simulation/features/prompt-simulation.mdx +++ b/src/pages/docs/simulation/features/prompt-simulation.mdx @@ -159,7 +159,7 @@ Before you start: have a **prompt template** with at least one saved **prompt ve Use AI-powered suggestions to improve your prompt based on simulation results. - + Build evaluations tailored to your specific use case. 
diff --git a/src/plugins/vite-docs-transform.mjs b/src/plugins/vite-docs-transform.mjs index dab3550b..2dffacfc 100644 --- a/src/plugins/vite-docs-transform.mjs +++ b/src/plugins/vite-docs-transform.mjs @@ -22,6 +22,7 @@ const COMPONENT_MAP = { CopyButton: '@docs/CopyButton.astro', Expandable: '@docs/Expandable.astro', Icon: '@docs/Icon.astro', + Mermaid: '@docs/Mermaid.astro', Note: '@docs/Note.astro', ParamField: '@docs/ParamField.astro', Prerequisites: '@docs/Prerequisites.astro',