Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
165 changes: 165 additions & 0 deletions src/components/docs/Mermaid.astro
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
---
// Renders a Mermaid diagram. Source is passed via the `code` prop:
//
// <Mermaid code={`flowchart LR
// A --> B
// `} />
//
// Mermaid is loaded once per page via a hoisted, deduplicated <script>.
// We re-run on theme changes so dark/light switches re-render the diagram.

// Props accepted by <Mermaid />. Astro reads an interface named `Props` from
// the frontmatter to type `Astro.props`, so no type assertion is needed and
// callers get type-checking on the `code` attribute.
interface Props {
  /** Mermaid diagram source, e.g. "flowchart LR\n A --> B". */
  code: string;
}

const { code } = Astro.props;
---

<div class="mermaid-wrapper" data-mermaid>
<pre class="mermaid">{code}</pre>
</div>

<script>
// Lazy-load mermaid from CDN the first time a diagram appears on the page.
// Subsequent pages with diagrams reuse the cached module thanks to the CDN
// and the browser HTTP cache; this script tag is hoisted+deduplicated by
// Astro so it only runs once per page even if the component renders many times.

// Type-only augmentation: declares the optional `window.__fagiMermaidPromise`
// slot that loadMermaid() reads and writes so every call on the page shares
// one dynamic import of the Mermaid ESM bundle.
declare global {
interface Window {
// Promise for the imported Mermaid module namespace; unset until loadMermaid() first runs.
__fagiMermaidPromise?: Promise<typeof import('https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs')>;
}
}

/**
 * Loads the Mermaid ESM bundle from the jsDelivr CDN and returns its default
 * export (the `mermaid` API object).
 *
 * The import promise is stored on `window.__fagiMermaidPromise` so concurrent
 * and later callers share a single request. If the import rejects (offline,
 * CDN blocked, ...) the stored promise is removed again, so the next call
 * retries instead of replaying the same cached failure forever.
 *
 * @returns The default export of the Mermaid module.
 * @throws Re-throws the import error to the caller that awaited it.
 */
async function loadMermaid() {
  if (!window.__fagiMermaidPromise) {
    const pending = import(
      /* @vite-ignore */
      'https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs'
    );
    window.__fagiMermaidPromise = pending;
    // Forget a failed load so a later render can try again. The identity check
    // avoids clobbering a newer attempt that may have replaced this one.
    pending.catch(() => {
      if (window.__fagiMermaidPromise === pending) {
        delete window.__fagiMermaidPromise;
      }
    });
  }
  const mod = await window.__fagiMermaidPromise;
  // The CDN module has no local typings; the default export is the mermaid API.
  return (mod as any).default;
}

/**
 * Decides whether diagrams should use the dark palette.
 *
 * Resolution order:
 *   1. no DOM available            -> dark
 *   2. non-empty `data-theme`      -> dark only when it equals "dark"
 *   3. `dark` / `light` class on <html>
 *   4. the OS-level `prefers-color-scheme` media query
 */
function isDarkTheme(): boolean {
  if (typeof document === 'undefined') {
    return true;
  }

  const { dataset, classList } = document.documentElement;

  const explicitTheme = dataset.theme;
  if (explicitTheme) {
    return explicitTheme === 'dark';
  }

  // Class markers are checked in priority order: `dark` wins over `light`.
  const classVerdicts = [
    ['dark', true],
    ['light', false],
  ] as const;
  for (const [className, verdict] of classVerdicts) {
    if (classList.contains(className)) {
      return verdict;
    }
  }

  return window.matchMedia('(prefers-color-scheme: dark)').matches;
}

/**
 * Builds the options object passed to `mermaid.initialize()`.
 *
 * Everything except `themeVariables` is fixed; the colour palette is chosen
 * by `isDarkTheme()` at call time, so this must be re-evaluated whenever the
 * page theme changes.
 */
function themeConfig() {
  // Palette used when the page is in dark mode.
  const darkPalette = {
    background: '#0a0a0a',
    primaryColor: '#1f1f23',
    primaryTextColor: '#e5e7eb',
    primaryBorderColor: '#3f3f46',
    lineColor: '#8b5cf6',
    secondaryColor: '#18181b',
    tertiaryColor: '#111113',
    textColor: '#e5e7eb',
    mainBkg: '#1f1f23',
    secondBkg: '#27272a',
    tertiaryBkg: '#111113',
    nodeBorder: '#3f3f46',
    clusterBkg: '#111113',
    clusterBorder: '#27272a',
    labelTextColor: '#e5e7eb',
    edgeLabelBackground: '#18181b',
    actorBkg: '#1f1f23',
    actorBorder: '#3f3f46',
    actorTextColor: '#e5e7eb',
    actorLineColor: '#6b7280',
    signalColor: '#a78bfa',
    signalTextColor: '#e5e7eb',
    sequenceNumberColor: '#0a0a0a',
    noteBkgColor: '#312e81',
    noteTextColor: '#e5e7eb',
    noteBorderColor: '#4338ca',
  };

  // Palette used when the page is in light mode; it sets fewer variables
  // than the dark one.
  const lightPalette = {
    background: '#ffffff',
    primaryColor: '#f3f4f6',
    primaryTextColor: '#111827',
    primaryBorderColor: '#d1d5db',
    lineColor: '#6d28d9',
    textColor: '#111827',
    mainBkg: '#f9fafb',
    edgeLabelBackground: '#ffffff',
  };

  return {
    // Rendering is triggered explicitly by render(), not by Mermaid itself.
    startOnLoad: false,
    theme: 'base' as const,
    securityLevel: 'loose' as const,
    flowchart: { htmlLabels: true, curve: 'basis' },
    themeVariables: isDarkTheme() ? darkPalette : lightPalette,
  };
}

/**
 * Renders (or re-renders) every Mermaid diagram currently on the page.
 *
 * Does nothing when the page has no `[data-mermaid]` wrapper, so Mermaid is
 * only downloaded on pages that actually contain a diagram. Otherwise it
 * loads Mermaid, re-initialises it with the palette for the current theme,
 * puts each diagram's original source text back into its `<pre>`, and asks
 * Mermaid to process all of them.
 *
 * @throws Propagates failures from loading Mermaid or from `mermaid.run()`.
 */
async function render() {
  const wrappers = document.querySelectorAll<HTMLElement>('[data-mermaid]');
  if (wrappers.length === 0) return;
  const mermaid = await loadMermaid();
  mermaid.initialize(themeConfig());

  // Reset any previously-rendered diagrams before re-running.
  for (const wrap of wrappers) {
    const pre = wrap.querySelector<HTMLElement>('pre.mermaid');
    if (!pre) continue;
    const savedSource = wrap.dataset.mermaidSource;
    // Compare against undefined rather than truthiness: an empty string is a
    // legitimately saved source and must not be overwritten with whatever
    // text the previous render left inside the <pre>.
    if (savedSource !== undefined) {
      pre.textContent = savedSource;
    } else {
      // First pass for this wrapper: remember the untouched source text.
      wrap.dataset.mermaidSource = pre.textContent ?? '';
    }
    // Presumably Mermaid's "already rendered" marker; removing it lets run()
    // pick the node up again.
    pre.removeAttribute('data-processed');
  }

  await mermaid.run({ querySelector: '[data-mermaid] pre.mermaid' });
}

/**
 * Fire-and-forget entry point used as an event listener: starts a render and
 * reports any failure to the console instead of leaving the promise rejection
 * unhandled.
 */
function init() {
  const reportFailure = (err: unknown) => console.error('Mermaid render failed', err);
  render().catch(reportFailure);
}

// Kick off the first render as soon as the DOM is available. Depending on
// timing, this script may run before or after DOMContentLoaded.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', init);
} else {
  init();
}

// Re-render on theme toggle (the site uses data-theme or .dark on <html>).
// The palette depends only on isDarkTheme(), so mutations that leave its
// result unchanged (e.g. an unrelated class added to <html>) are skipped
// rather than re-rendering every diagram. Rendering goes through init() so
// failures are logged instead of being silently swallowed.
let knownDark = isDarkTheme();
const themeObserver = new MutationObserver(() => {
  const dark = isDarkTheme();
  if (dark === knownDark) return;
  knownDark = dark;
  init();
});
themeObserver.observe(document.documentElement, {
  attributes: true,
  attributeFilter: ['data-theme', 'class'],
});

// Re-render after Astro view transitions, if the site uses them.
document.addEventListener('astro:after-swap', init);
</script>

<style>
.mermaid-wrapper {
margin: 1.25rem 0;
padding: 1.25rem;
background: var(--mermaid-bg, rgba(255, 255, 255, 0.02));
border: 1px solid var(--color-border-default, #27272a);
border-radius: 0.75rem;
overflow-x: auto;
}
.mermaid-wrapper pre.mermaid {
background: transparent !important;
border: 0 !important;
padding: 0 !important;
margin: 0 !important;
font-family: ui-monospace, "SF Mono", Menlo, monospace;
font-size: 0.85rem;
color: var(--color-text-muted, #9ca3af);
text-align: center;
}
.mermaid-wrapper svg {
max-width: 100%;
height: auto;
display: inline-block;
}
</style>
74 changes: 59 additions & 15 deletions src/lib/navigation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -289,34 +289,78 @@ export const tabNavigation: NavTab[] = [
icon: 'chart',
items: [
{ title: 'Overview', href: '/docs/evaluation' },
{ title: 'Quickstart', href: '/docs/quickstart/evals' },
{
title: 'Concepts',
items: [
{ title: 'Understanding Evaluation', href: '/docs/evaluation/concepts/understanding-evaluation' },
{ title: 'Eval Types', href: '/docs/evaluation/concepts/eval-types' },
{ title: 'Eval Templates', href: '/docs/evaluation/concepts/eval-templates' },
{ title: 'Output Types', href: '/docs/evaluation/concepts/output-types' },
{ title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' },
{ title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' },
{ title: 'Versioning', href: '/docs/evaluation/concepts/versioning' },
{ title: 'Judge Models', href: '/docs/evaluation/concepts/judge-models' },
{ title: 'Eval Results', href: '/docs/evaluation/concepts/eval-results' },
{ title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' },
{ title: 'Versioning', href: '/docs/evaluation/concepts/versioning' },
{ title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' },
{ title: 'MCP Connectors in Evaluation', href: '/docs/evaluation/concepts/mcp-connectors' },
]
},
{
title: 'Features',
title: 'Run evals',
items: [
{ title: 'Run evals in the UI', href: '/docs/evaluation/run/in-the-ui' },
{ title: 'Run evals with the Python SDK', href: '/docs/evaluation/run/python-sdk' },
{ title: 'Run evals with TypeScript', href: '/docs/evaluation/run/typescript-sdk' },
{ title: 'Run evals with the API', href: '/docs/evaluation/run/api' },
{ title: 'Run evals in CI/CD', href: '/docs/evaluation/run/cicd' },
]
},
{
title: 'Build evals',
items: [
{ title: 'Create custom evals', href: '/docs/evaluation/build/custom' },
{ title: 'Test playground', href: '/docs/evaluation/build/test-playground' },
{ title: 'Ground truth', href: '/docs/evaluation/build/ground-truth' },
{ title: 'Error localization', href: '/docs/evaluation/build/error-localization' },
{ title: 'Configure MCP connectors', href: '/docs/evaluation/build/mcp-connectors' },
]
},
{
title: 'Judge models',
items: [
{ title: 'FutureAGI models', href: '/docs/evaluation/judge-models/futureagi' },
{ title: 'Use custom models', href: '/docs/evaluation/judge-models/custom' },
]
},
{
title: 'Evaluator catalog',
items: [
{ title: 'All evaluators', href: '/docs/evaluation/builtin' },
{ title: 'RAG & retrieval', href: '/docs/evaluation/builtin/categories/rag' },
{ title: 'Agent & conversation', href: '/docs/evaluation/builtin/categories/agent' },
{ title: 'Safety & policy', href: '/docs/evaluation/builtin/categories/safety' },
{ title: 'Text quality & NLP metrics', href: '/docs/evaluation/builtin/categories/text' },
{ title: 'Format & validation', href: '/docs/evaluation/builtin/categories/format' },
{ title: 'Code', href: '/docs/evaluation/builtin/categories/code' },
{ title: 'Multimodal', href: '/docs/evaluation/builtin/categories/multimodal' },
{ title: 'Audio', href: '/docs/evaluation/builtin/categories/audio' },
]
},
{
title: 'Reference',
items: [
{ title: 'Eval result schema', href: '/docs/evaluation/reference/result-schema' },
{ title: 'Evaluator input schema', href: '/docs/evaluation/reference/input-schema' },
{ title: 'Score types', href: '/docs/evaluation/reference/score-types' },
]
},
{
title: 'Troubleshooting',
items: [
{ title: 'Built-in Evals', href: '/docs/evaluation/builtin' },
{ title: 'Evaluate via Platform & SDK', href: '/docs/evaluation/features/evaluate' },
{ title: 'Create Custom Evals', href: '/docs/evaluation/features/custom' },
{ title: 'Test Playground', href: '/docs/evaluation/features/test-playground' },
{ title: 'Ground Truth', href: '/docs/evaluation/features/ground-truth' },
{ title: 'Error Localization', href: '/docs/evaluation/features/error-localization' },
{ title: 'Configure MCP Connectors for an Eval', href: '/docs/evaluation/features/mcp-connectors' },
{ title: 'Use Custom Models', href: '/docs/evaluation/features/custom-models' },
{ title: 'Future AGI Models', href: '/docs/evaluation/features/futureagi-models' },
{ title: 'Evaluate CI/CD Pipeline', href: '/docs/evaluation/features/cicd' },
{ title: 'Scores changed unexpectedly', href: '/docs/evaluation/troubleshooting/score-drift' },
{ title: 'Judge output is inconsistent', href: '/docs/evaluation/troubleshooting/judge-variance' },
{ title: 'Eval run is slow', href: '/docs/evaluation/troubleshooting/slow-runs' },
{ title: "Dataset fields don't match", href: '/docs/evaluation/troubleshooting/mapping' },
{ title: 'CI eval gate failed', href: '/docs/evaluation/troubleshooting/ci-failures' },
]
},
]
Expand Down
14 changes: 7 additions & 7 deletions src/lib/redirects.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ export const redirectMap: Record<string, string> = {
'/future-agi/get-started/evaluation/builtin-evals/fuzzy-match': '/docs/evaluation/builtin/fuzzy-match',
'/future-agi/get-started/evaluation/builtin-evals/groundedness': '/docs/evaluation/builtin/groundedness',
'/future-agi/get-started/evaluation/builtin-evals/hit-rate': '/docs/evaluation/builtin/hit-rate',
'/future-agi/get-started/evaluation/builtin-evals/instruction-adherence': '/docs/evaluation/builtin/instruction-adherence',
'/future-agi/get-started/evaluation/builtin-evals/instruction-adherence': '/docs/evaluation/builtin/prompt-adherence',
'/future-agi/get-started/evaluation/builtin-evals/is-compliant': '/docs/evaluation/builtin',
'/future-agi/get-started/evaluation/builtin-evals/is-concise': '/docs/evaluation/builtin/is-concise',
'/future-agi/get-started/evaluation/builtin-evals/is-email': '/docs/evaluation/builtin/is-email',
Expand Down Expand Up @@ -154,12 +154,12 @@ export const redirectMap: Record<string, string> = {
'/future-agi/get-started/evaluation/builtin-evals/toxicity': '/docs/evaluation/builtin/toxicity',
'/future-agi/get-started/evaluation/builtin-evals/translation-accuracy': '/docs/evaluation/builtin/translation-accuracy',
'/future-agi/get-started/evaluation/builtin-evals/valid-links': '/docs/evaluation/builtin',
'/future-agi/get-started/evaluation/create-custom-evals': '/docs/evaluation/features/custom',
'/future-agi/get-started/evaluation/evaluate-ci-cd-pipeline': '/docs/evaluation/features/cicd',
'/future-agi/get-started/evaluation/evaluate-patterns': '/docs/evaluation/features/evaluate',
'/future-agi/get-started/evaluation/future-agi-models': '/docs/evaluation/features/futureagi-models',
'/future-agi/get-started/evaluation/running-your-first-eval': '/docs/evaluation/features/evaluate',
'/future-agi/get-started/evaluation/use-custom-models': '/docs/evaluation/features/custom-models',
'/future-agi/get-started/evaluation/create-custom-evals': '/docs/evaluation/build/custom',
'/future-agi/get-started/evaluation/evaluate-ci-cd-pipeline': '/docs/evaluation/run/cicd',
'/future-agi/get-started/evaluation/evaluate-patterns': '/docs/evaluation/run/in-the-ui',
'/future-agi/get-started/evaluation/future-agi-models': '/docs/evaluation/judge-models/futureagi',
'/future-agi/get-started/evaluation/running-your-first-eval': '/docs/evaluation/run/in-the-ui',
'/future-agi/get-started/evaluation/use-custom-models': '/docs/evaluation/judge-models/custom',
'/future-agi/get-started/knowledge-base/concept': '/docs/knowledge-base/concepts/concept',
'/future-agi/get-started/knowledge-base/how-to/create-kb-using-sdk': '/docs/knowledge-base/features/sdk',
'/future-agi/get-started/knowledge-base/how-to/create-kb-using-ui': '/docs/knowledge-base/features/ui',
Expand Down
2 changes: 1 addition & 1 deletion src/pages/docs/cookbook/decrease-hallucination.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ The winner configuration was CharacterTextSplitter_mmr_map_rerank, which combine

- **Can I create custom evaluations tailored to my RAG use case in Future AGI?**

Yes. The Deterministic Eval template in Future AGI supports custom evaluations (***Click [here](/docs/evaluation/features/custom) to learn more about deterministic eval***). This lets you apply stringent criteria to your RAG outputs minimising variability.
Yes. The Deterministic Eval template in Future AGI supports custom evaluations (***Click [here](/docs/evaluation/build/custom) to learn more about deterministic eval***). This lets you apply stringent criteria to your RAG outputs, minimising variability.


## Ready to Reduce Hallucinations in Your RAG Applications?
Expand Down
4 changes: 2 additions & 2 deletions src/pages/docs/cookbook/evaluation/eval-correction-loop.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,10 @@ You ran a built-in eval, found rows where it disagreed with human judgment, enco
## Explore further

<CardGroup cols={3}>
<Card title="Create Custom Evals" icon="wand-magic-sparkles" href="/docs/evaluation/features/custom">
<Card title="Create Custom Evals" icon="wand-magic-sparkles" href="/docs/evaluation/build/custom">
Full reference for the custom eval template API
</Card>
<Card title="Future AGI Models" icon="brain" href="/docs/evaluation/features/futureagi-models">
<Card title="FutureAGI Models" icon="brain" href="/docs/evaluation/judge-models/futureagi">
Pick the right judge model: turing_small, turing_flash, turing_large
</Card>
<Card title="Eval Templates" icon="layer-group" href="/docs/evaluation/concepts/eval-templates">
Expand Down
10 changes: 5 additions & 5 deletions src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ pip install fastmcp
python crm_mcp_server.py
```

Expose it through ngrok or your own tunnel so Future AGI can reach it:
Expose it through ngrok or your own tunnel so FutureAGI can reach it:

```bash
ngrok http 8000
Expand All @@ -68,7 +68,7 @@ Grab the public URL (e.g. `https://abc123.ngrok-free.app`).

---

## Step 2: Register the connector in Future AGI
## Step 2: Register the connector in FutureAGI

1. Open **Settings → Connectors → Add Connector**.
2. Fill in:
Expand Down Expand Up @@ -162,7 +162,7 @@ Open any row to see the **Tool Trace** — the exact tool call and response the

- **Add more tools**: Extend the MCP server with `get_subscription`, `get_invoices`, `get_feature_flags`. The judge will pick the right tool per claim.
- **Use a real CRM**: Replace the stub with HubSpot, Stripe, or Linear MCP servers. Set the connector auth to OAuth or bearer.
- **Score traces, not datasets**: The same Agent-mode eval can run against [traced](/docs/observe) production conversations — wire it as a [continuous evaluation](/docs/evaluation/features/cicd).
- **Score traces, not datasets**: The same Agent-mode eval can run against [traced](/docs/observe) production conversations — wire it as a [continuous evaluation](/docs/evaluation/run/cicd).
- **Cross-source evals**: Enable two connectors at once (CRM + Notion docs). The judge calls both when the reply cites a product feature *and* a customer record.

---
Expand All @@ -179,6 +179,6 @@ Open any row to see the **Tool Trace** — the exact tool call and response the
## Next steps

- [MCP Connectors concept](/docs/evaluation/concepts/mcp-connectors): The model behind Agent-mode evals.
- [Configure MCP Connectors for an Eval](/docs/evaluation/features/mcp-connectors): The UI walkthrough.
- [Create custom evals](/docs/evaluation/features/custom): Building eval templates without connectors.
- [Configure MCP Connectors for an Eval](/docs/evaluation/build/mcp-connectors): The UI walkthrough.
- [Create custom evals](/docs/evaluation/build/custom): Building eval templates without connectors.
- [Falcon AI MCP Connectors](/docs/falcon-ai/features/mcp-connectors): Registering and authenticating a connector.
4 changes: 2 additions & 2 deletions src/pages/docs/dataset/features/experiments.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Experiment creation is a guided three-step flow: **Basic Info → Configuration
<Tab title="Custom models" icon="gear">
Models you've added through Custom Models show up in the model picker for prompt configurations across all experiment types.
<Tip>
See [Custom Models](/docs/evaluation/features/custom-models) for how to register a custom or self-hosted model.
See [Custom Models](/docs/evaluation/judge-models/custom) for how to register a custom or self-hosted model.
</Tip>
</Tab>
</Tabs>
Expand All @@ -90,7 +90,7 @@ Experiment creation is a guided three-step flow: **Basic Info → Configuration

**Compare against baseline (optional)**: pick a column from the dataset to compare model outputs against (typically a ground-truth or existing run-prompt column). Skip it if you don't have a reference output yet; you can still run the experiment, attach evals that don't need a baseline, and add a base column later by editing the experiment.

**Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/features/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable.
**Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/build/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable.
![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/7.png)

For each eval, map its inputs (e.g. `output`, `input`, `expected`) to the model output or to dataset columns. Mapping is required before the experiment can run.
Expand Down
Loading