future-agi · SuhaniNagpal7 · May 21, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/public/images/docs/evaluation/mcp-connectors/1.png b/public/images/docs/evaluation/mcp-connectors/1.png
diff --git a/public/images/docs/evaluation/mcp-connectors/2.png b/public/images/docs/evaluation/mcp-connectors/2.png
diff --git a/public/images/docs/evaluation/mcp-connectors/3.png b/public/images/docs/evaluation/mcp-connectors/3.png
diff --git a/public/images/docs/evaluation/mcp-connectors/4.png b/public/images/docs/evaluation/mcp-connectors/4.png
diff --git a/public/images/docs/evaluation/mcp-connectors/5.png b/public/images/docs/evaluation/mcp-connectors/5.png
diff --git a/src/components/docs/Mermaid.astro b/src/components/docs/Mermaid.astro
@@ -0,0 +1,165 @@
+---
+// Renders a Mermaid diagram. Source is passed via the `code` prop:
+//
+//   <Mermaid code={`flowchart LR
+//     A --> B
+//   `} />
+//
+// Mermaid is loaded once per page via a hoisted, deduplicated <script>.
+// We re-run on theme changes so dark/light switches re-render the diagram.
+
+// Declared as `Props` (Astro's convention) rather than asserting the type of
+// `Astro.props`, so usages of `<Mermaid code={...} />` are type-checked.
+interface Props {
+  /** Mermaid diagram source; becomes the text content of `<pre class="mermaid">`. */
+  code: string;
+}
+
+const { code } = Astro.props;
+---
+
+<div class="mermaid-wrapper" data-mermaid>
+  <pre class="mermaid">{code}</pre>
+</div>
+
+<script>
+  // Lazy-load mermaid from CDN the first time a diagram appears on the page.
+  // Subsequent pages with diagrams reuse the cached module thanks to the CDN
+  // and the browser HTTP cache; this script tag is hoisted+deduplicated by
+  // Astro so it only runs once per page even if the component renders many times.
+
+  // Augments the global `Window` type with the page-wide cache slot used by
+  // `loadMermaid`: it holds the promise returned by the dynamic CDN import of
+  // the Mermaid ESM bundle, and is absent until the first load is requested.
+  declare global {
+    interface Window {
+      __fagiMermaidPromise?: Promise<typeof import('https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs')>;
+    }
+  }
+
+  /**
+   * Loads the Mermaid ESM bundle from the CDN and resolves to its default export.
+   *
+   * The import promise is cached on `window.__fagiMermaidPromise` so every
+   * caller on the page shares a single request. If the import rejects (offline,
+   * CDN blocked, ...) the cached promise is removed again, so a later call can
+   * attempt the import afresh instead of replaying the same rejection until the
+   * page is reloaded.
+   *
+   * @returns The module's default export (the Mermaid API object).
+   * @throws Whatever the dynamic import rejects with; callers must handle it.
+   */
+  async function loadMermaid() {
+    if (!window.__fagiMermaidPromise) {
+      const pending = import(
+        /* @vite-ignore */
+        'https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs'
+      );
+      window.__fagiMermaidPromise = pending;
+      // Side branch only: it drops the failed promise from the cache. The
+      // rejection itself still reaches the caller through the `await` below.
+      pending.catch(() => {
+        if (window.__fagiMermaidPromise === pending) {
+          delete window.__fagiMermaidPromise;
+        }
+      });
+    }
+    const mod = await window.__fagiMermaidPromise;
+    // The URL import has no resolvable typings, hence the cast.
+    return (mod as any).default;
+  }
+
+  /**
+   * Decides whether diagrams should use the dark palette.
+   *
+   * Precedence: a non-empty `data-theme` on `<html>` wins (dark only when it is
+   * exactly `'dark'`), then a `dark` / `light` class on `<html>`, and finally
+   * the OS-level `prefers-color-scheme` media query. Returns `true` when no
+   * `document` exists.
+   */
+  function isDarkTheme(): boolean {
+    if (typeof document === 'undefined') {
+      return true;
+    }
+
+    const { dataset, classList } = document.documentElement;
+
+    const explicitTheme = dataset.theme;
+    if (explicitTheme) {
+      return explicitTheme === 'dark';
+    }
+
+    if (classList.contains('dark')) {
+      return true;
+    }
+
+    return classList.contains('light')
+      ? false
+      : window.matchMedia('(prefers-color-scheme: dark)').matches;
+  }
+
+  /**
+   * Builds the options object passed to `mermaid.initialize`.
+   *
+   * Everything except `themeVariables` is fixed; the palette is chosen from
+   * `isDarkTheme()` at call time, so the function must be called again after a
+   * theme switch.
+   */
+  function themeConfig() {
+    // Palette used when the page is in dark mode.
+    const darkPalette = {
+      background: '#0a0a0a',
+      primaryColor: '#1f1f23',
+      primaryTextColor: '#e5e7eb',
+      primaryBorderColor: '#3f3f46',
+      lineColor: '#8b5cf6',
+      secondaryColor: '#18181b',
+      tertiaryColor: '#111113',
+      textColor: '#e5e7eb',
+      mainBkg: '#1f1f23',
+      secondBkg: '#27272a',
+      tertiaryBkg: '#111113',
+      nodeBorder: '#3f3f46',
+      clusterBkg: '#111113',
+      clusterBorder: '#27272a',
+      labelTextColor: '#e5e7eb',
+      edgeLabelBackground: '#18181b',
+      actorBkg: '#1f1f23',
+      actorBorder: '#3f3f46',
+      actorTextColor: '#e5e7eb',
+      actorLineColor: '#6b7280',
+      signalColor: '#a78bfa',
+      signalTextColor: '#e5e7eb',
+      sequenceNumberColor: '#0a0a0a',
+      noteBkgColor: '#312e81',
+      noteTextColor: '#e5e7eb',
+      noteBorderColor: '#4338ca',
+    };
+
+    // Palette used when the page is in light mode; it sets fewer variables
+    // than the dark palette.
+    const lightPalette = {
+      background: '#ffffff',
+      primaryColor: '#f3f4f6',
+      primaryTextColor: '#111827',
+      primaryBorderColor: '#d1d5db',
+      lineColor: '#6d28d9',
+      textColor: '#111827',
+      mainBkg: '#f9fafb',
+      edgeLabelBackground: '#ffffff',
+    };
+
+    return {
+      // Rendering is triggered explicitly via `mermaid.run`, not on load.
+      startOnLoad: false,
+      theme: 'base' as const,
+      // NOTE(review): 'loose' is Mermaid's most permissive level; this presumably
+      // relies on diagram source being trusted docs content — confirm.
+      securityLevel: 'loose' as const,
+      flowchart: { htmlLabels: true, curve: 'basis' },
+      themeVariables: isDarkTheme() ? darkPalette : lightPalette,
+    };
+  }
+
+  /**
+   * Renders (or re-renders) every Mermaid diagram currently in the document.
+   *
+   * Does nothing when the page has no `[data-mermaid]` wrapper, so Mermaid is
+   * only downloaded on pages that need it. The diagram source is stashed in
+   * `data-mermaid-source` on first sight and restored into the `<pre>` on later
+   * passes, so the same element can be rendered again.
+   */
+  async function render() {
+    const hosts = document.querySelectorAll<HTMLElement>('[data-mermaid]');
+    if (!hosts.length) {
+      return;
+    }
+
+    const mermaid = await loadMermaid();
+    mermaid.initialize(themeConfig());
+
+    // Put each <pre> back into its un-rendered state before running Mermaid.
+    hosts.forEach((host) => {
+      const block = host.querySelector<HTMLElement>('pre.mermaid');
+      if (block === null) {
+        return;
+      }
+
+      const savedSource = host.dataset.mermaidSource;
+      if (savedSource) {
+        // Seen before: restore the original diagram text.
+        block.textContent = savedSource;
+      } else {
+        // First pass: remember the diagram text for future re-renders.
+        host.dataset.mermaidSource = block.textContent ?? '';
+      }
+
+      // Clear the processed marker so `mermaid.run` handles this node again.
+      block.removeAttribute('data-processed');
+    });
+
+    await mermaid.run({ querySelector: '[data-mermaid] pre.mermaid' });
+  }
+
+  /** Fire-and-forget entry point: starts a render and logs any failure. */
+  function init() {
+    render().catch((err) => {
+      console.error('Mermaid render failed', err);
+    });
+  }
+
+  // First render: wait for the DOM if it is still being parsed, otherwise
+  // render immediately.
+  if (document.readyState === 'loading') {
+    document.addEventListener('DOMContentLoaded', init);
+  } else {
+    init();
+  }
+
+  // Re-render on theme toggle (the site uses data-theme or .dark on <html>).
+  // The observer also fires for class changes on <html> that have nothing to
+  // do with the theme, so compare against the last theme seen and skip the
+  // re-render when the effective theme is unchanged. Failures go through
+  // `init`, which logs them instead of discarding them.
+  let lastDark = isDarkTheme();
+  const themeObserver = new MutationObserver(() => {
+    const dark = isDarkTheme();
+    if (dark === lastDark) return;
+    lastDark = dark;
+    init();
+  });
+  themeObserver.observe(document.documentElement, {
+    attributes: true,
+    attributeFilter: ['data-theme', 'class'],
+  });
+
+  // Re-render after Astro view transitions, if the site uses them.
+  document.addEventListener('astro:after-swap', init);
+</script>
+
+<style>
+  /* Card-like frame around each diagram; scrolls horizontally when the
+     content is wider than the frame. */
+  .mermaid-wrapper {
+    margin: 1.25rem 0;
+    padding: 1.25rem;
+    background: var(--mermaid-bg, rgba(255, 255, 255, 0.02));
+    border: 1px solid var(--color-border-default, #27272a);
+    border-radius: 0.75rem;
+    overflow-x: auto;
+  }
+  /* The <pre> holding the diagram. The !important resets presumably override
+     the site's default <pre> styling (verify against the global styles). */
+  .mermaid-wrapper pre.mermaid {
+    background: transparent !important;
+    border: 0 !important;
+    padding: 0 !important;
+    margin: 0 !important;
+    font-family: ui-monospace, "SF Mono", Menlo, monospace;
+    font-size: 0.85rem;
+    color: var(--color-text-muted, #9ca3af);
+    text-align: center;
+  }
+  /* Keep any SVG inside the frame from exceeding its width, preserving the
+     aspect ratio. */
+  .mermaid-wrapper svg {
+    max-width: 100%;
+    height: auto;
+    display: inline-block;
+  }
+</style>
diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts
@@ -289,34 +289,78 @@ export const tabNavigation: NavTab[] = [
         icon: 'chart',
         items: [
           { title: 'Overview', href: '/docs/evaluation' },
+          { title: 'Quickstart', href: '/docs/quickstart/evals' },
           {
             title: 'Concepts',
             items: [
-              { title: 'Understanding Evaluation', href: '/docs/evaluation/concepts/understanding-evaluation' },
               { title: 'Eval Types', href: '/docs/evaluation/concepts/eval-types' },
               { title: 'Eval Templates', href: '/docs/evaluation/concepts/eval-templates' },
               { title: 'Output Types', href: '/docs/evaluation/concepts/output-types' },
-              { title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' },
-              { title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' },
-              { title: 'Versioning', href: '/docs/evaluation/concepts/versioning' },
               { title: 'Judge Models', href: '/docs/evaluation/concepts/judge-models' },
               { title: 'Eval Results', href: '/docs/evaluation/concepts/eval-results' },
+              { title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' },
+              { title: 'Versioning', href: '/docs/evaluation/concepts/versioning' },
+              { title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' },
               { title: 'MCP Connectors in Evaluation', href: '/docs/evaluation/concepts/mcp-connectors' },
             ]
           },
           {
-            title: 'Features',
+            title: 'Run evals',
+            items: [
+              { title: 'Run evals in the UI', href: '/docs/evaluation/run/in-the-ui' },
+              { title: 'Run evals with the Python SDK', href: '/docs/evaluation/run/python-sdk' },
+              { title: 'Run evals with TypeScript', href: '/docs/evaluation/run/typescript-sdk' },
+              { title: 'Run evals with the API', href: '/docs/evaluation/run/api' },
+              { title: 'Run evals in CI/CD', href: '/docs/evaluation/run/cicd' },
+            ]
+          },
+          {
+            title: 'Build evals',
+            items: [
+              { title: 'Create custom evals', href: '/docs/evaluation/build/custom' },
+              { title: 'Test playground', href: '/docs/evaluation/build/test-playground' },
+              { title: 'Ground truth', href: '/docs/evaluation/build/ground-truth' },
+              { title: 'Error localization', href: '/docs/evaluation/build/error-localization' },
+              { title: 'Configure MCP connectors', href: '/docs/evaluation/build/mcp-connectors' },
+            ]
+          },
+          {
+            title: 'Judge models',
+            items: [
+              { title: 'FutureAGI models', href: '/docs/evaluation/judge-models/futureagi' },
+              { title: 'Use custom models', href: '/docs/evaluation/judge-models/custom' },
+            ]
+          },
+          {
+            title: 'Evaluator catalog',
+            items: [
+              { title: 'All evaluators', href: '/docs/evaluation/builtin' },
+              { title: 'RAG & retrieval', href: '/docs/evaluation/builtin/categories/rag' },
+              { title: 'Agent & conversation', href: '/docs/evaluation/builtin/categories/agent' },
+              { title: 'Safety & policy', href: '/docs/evaluation/builtin/categories/safety' },
+              { title: 'Text quality & NLP metrics', href: '/docs/evaluation/builtin/categories/text' },
+              { title: 'Format & validation', href: '/docs/evaluation/builtin/categories/format' },
+              { title: 'Code', href: '/docs/evaluation/builtin/categories/code' },
+              { title: 'Multimodal', href: '/docs/evaluation/builtin/categories/multimodal' },
+              { title: 'Audio', href: '/docs/evaluation/builtin/categories/audio' },
+            ]
+          },
+          {
+            title: 'Reference',
+            items: [
+              { title: 'Eval result schema', href: '/docs/evaluation/reference/result-schema' },
+              { title: 'Evaluator input schema', href: '/docs/evaluation/reference/input-schema' },
+              { title: 'Score types', href: '/docs/evaluation/reference/score-types' },
+            ]
+          },
+          {
+            title: 'Troubleshooting',
             items: [
-              { title: 'Built-in Evals', href: '/docs/evaluation/builtin' },
-              { title: 'Evaluate via Platform & SDK', href: '/docs/evaluation/features/evaluate' },
-              { title: 'Create Custom Evals', href: '/docs/evaluation/features/custom' },
-              { title: 'Test Playground', href: '/docs/evaluation/features/test-playground' },
-              { title: 'Ground Truth', href: '/docs/evaluation/features/ground-truth' },
-              { title: 'Error Localization', href: '/docs/evaluation/features/error-localization' },
-              { title: 'Configure MCP Connectors for an Eval', href: '/docs/evaluation/features/mcp-connectors' },
-              { title: 'Use Custom Models', href: '/docs/evaluation/features/custom-models' },
-              { title: 'Future AGI Models', href: '/docs/evaluation/features/futureagi-models' },
-              { title: 'Evaluate CI/CD Pipeline', href: '/docs/evaluation/features/cicd' },
+              { title: 'Scores changed unexpectedly', href: '/docs/evaluation/troubleshooting/score-drift' },
+              { title: 'Judge output is inconsistent', href: '/docs/evaluation/troubleshooting/judge-variance' },
+              { title: 'Eval run is slow', href: '/docs/evaluation/troubleshooting/slow-runs' },
+              { title: "Dataset fields don't match", href: '/docs/evaluation/troubleshooting/mapping' },
+              { title: 'CI eval gate failed', href: '/docs/evaluation/troubleshooting/ci-failures' },
             ]
           },
         ]

diff --git a/src/lib/redirects.ts b/src/lib/redirects.ts
@@ -114,7 +114,7 @@ export const redirectMap: Record<string, string> = {
   '/future-agi/get-started/evaluation/builtin-evals/fuzzy-match': '/docs/evaluation/builtin/fuzzy-match',
   '/future-agi/get-started/evaluation/builtin-evals/groundedness': '/docs/evaluation/builtin/groundedness',
   '/future-agi/get-started/evaluation/builtin-evals/hit-rate': '/docs/evaluation/builtin/hit-rate',
-  '/future-agi/get-started/evaluation/builtin-evals/instruction-adherence': '/docs/evaluation/builtin/instruction-adherence',
+  '/future-agi/get-started/evaluation/builtin-evals/instruction-adherence': '/docs/evaluation/builtin/prompt-adherence',
   '/future-agi/get-started/evaluation/builtin-evals/is-compliant': '/docs/evaluation/builtin',
   '/future-agi/get-started/evaluation/builtin-evals/is-concise': '/docs/evaluation/builtin/is-concise',
   '/future-agi/get-started/evaluation/builtin-evals/is-email': '/docs/evaluation/builtin/is-email',
@@ -154,12 +154,12 @@ export const redirectMap: Record<string, string> = {
   '/future-agi/get-started/evaluation/builtin-evals/toxicity': '/docs/evaluation/builtin/toxicity',
   '/future-agi/get-started/evaluation/builtin-evals/translation-accuracy': '/docs/evaluation/builtin/translation-accuracy',
   '/future-agi/get-started/evaluation/builtin-evals/valid-links': '/docs/evaluation/builtin',
-  '/future-agi/get-started/evaluation/create-custom-evals': '/docs/evaluation/features/custom',
-  '/future-agi/get-started/evaluation/evaluate-ci-cd-pipeline': '/docs/evaluation/features/cicd',
-  '/future-agi/get-started/evaluation/evaluate-patterns': '/docs/evaluation/features/evaluate',
-  '/future-agi/get-started/evaluation/future-agi-models': '/docs/evaluation/features/futureagi-models',
-  '/future-agi/get-started/evaluation/running-your-first-eval': '/docs/evaluation/features/evaluate',
-  '/future-agi/get-started/evaluation/use-custom-models': '/docs/evaluation/features/custom-models',
+  '/future-agi/get-started/evaluation/create-custom-evals': '/docs/evaluation/build/custom',
+  '/future-agi/get-started/evaluation/evaluate-ci-cd-pipeline': '/docs/evaluation/run/cicd',
+  '/future-agi/get-started/evaluation/evaluate-patterns': '/docs/evaluation/run/in-the-ui',
+  '/future-agi/get-started/evaluation/future-agi-models': '/docs/evaluation/judge-models/futureagi',
+  '/future-agi/get-started/evaluation/running-your-first-eval': '/docs/evaluation/run/in-the-ui',
+  '/future-agi/get-started/evaluation/use-custom-models': '/docs/evaluation/judge-models/custom',
   '/future-agi/get-started/knowledge-base/concept': '/docs/knowledge-base/concepts/concept',
   '/future-agi/get-started/knowledge-base/how-to/create-kb-using-sdk': '/docs/knowledge-base/features/sdk',
   '/future-agi/get-started/knowledge-base/how-to/create-kb-using-ui': '/docs/knowledge-base/features/ui',

diff --git a/src/pages/docs/cookbook/decrease-hallucination.mdx b/src/pages/docs/cookbook/decrease-hallucination.mdx
@@ -592,7 +592,7 @@ The winner configuration was CharacterTextSplitter_mmr_map_rerank, which combine
 
 - **Can I create custom evaluations tailored to my RAG use case in Future AGI?**
 
-    Yes. The Deterministic Eval template in Future AGI supports custom evaluations (***Click [here](/docs/evaluation/features/custom) to learn more about deterministic eval***). This lets you apply stringent criteria to your RAG outputs minimising variability.
+    Yes. The Deterministic Eval template in Future AGI supports custom evaluations (***Click [here](/docs/evaluation/build/custom) to learn more about deterministic eval***). This lets you apply stringent criteria to your RAG outputs, minimising variability.
 
 
 ## Ready to Reduce Hallucinations in Your RAG Applications?

diff --git a/src/pages/docs/cookbook/evaluation/eval-correction-loop.mdx b/src/pages/docs/cookbook/evaluation/eval-correction-loop.mdx
@@ -250,10 +250,10 @@ You ran a built-in eval, found rows where it disagreed with human judgment, enco
 ## Explore further
 
 <CardGroup cols={3}>
-  <Card title="Create Custom Evals" icon="wand-magic-sparkles" href="/docs/evaluation/features/custom">
+  <Card title="Create Custom Evals" icon="wand-magic-sparkles" href="/docs/evaluation/build/custom">
     Full reference for the custom eval template API
   </Card>
-  <Card title="Future AGI Models" icon="brain" href="/docs/evaluation/features/futureagi-models">
+  <Card title="FutureAGI Models" icon="brain" href="/docs/evaluation/judge-models/futureagi">
     Pick the right judge model: turing_small, turing_flash, turing_large
   </Card>
   <Card title="Eval Templates" icon="layer-group" href="/docs/evaluation/concepts/eval-templates">

diff --git a/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx b/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx
@@ -58,7 +58,7 @@ pip install fastmcp
 python crm_mcp_server.py
 ```
 
-Expose it through ngrok or your own tunnel so Future AGI can reach it:
+Expose it through ngrok or your own tunnel so FutureAGI can reach it:
 
 ```bash
 ngrok http 8000
@@ -68,7 +68,7 @@ Grab the public URL (e.g. `https://abc123.ngrok-free.app`).
 
 ---
 
-## Step 2: Register the connector in Future AGI
+## Step 2: Register the connector in FutureAGI
 
 1. Open **Settings → Connectors → Add Connector**.
 2. Fill in:
@@ -162,7 +162,7 @@ Open any row to see the **Tool Trace** — the exact tool call and response the
 
 - **Add more tools**: Extend the MCP server with `get_subscription`, `get_invoices`, `get_feature_flags`. The judge will pick the right tool per claim.
 - **Use a real CRM**: Replace the stub with HubSpot, Stripe, or Linear MCP servers. Set the connector auth to OAuth or bearer.
-- **Score traces, not datasets**: The same Agent-mode eval can run against [traced](/docs/observe) production conversations — wire it as a [continuous evaluation](/docs/evaluation/features/cicd).
+- **Score traces, not datasets**: The same Agent-mode eval can run against [traced](/docs/observe) production conversations — wire it as a [continuous evaluation](/docs/evaluation/run/cicd).
 - **Cross-source evals**: Enable two connectors at once (CRM + Notion docs). The judge calls both when the reply cites a product feature *and* a customer record.
 
 ---
@@ -179,6 +179,6 @@ Open any row to see the **Tool Trace** — the exact tool call and response the
 ## Next steps
 
 - [MCP Connectors concept](/docs/evaluation/concepts/mcp-connectors): The model behind Agent-mode evals.
-- [Configure MCP Connectors for an Eval](/docs/evaluation/features/mcp-connectors): The UI walkthrough.
-- [Create custom evals](/docs/evaluation/features/custom): Building eval templates without connectors.
+- [Configure MCP Connectors for an Eval](/docs/evaluation/build/mcp-connectors): The UI walkthrough.
+- [Create custom evals](/docs/evaluation/build/custom): Building eval templates without connectors.
 - [Falcon AI MCP Connectors](/docs/falcon-ai/features/mcp-connectors): Registering and authenticating a connector.
diff --git a/src/pages/docs/dataset/features/experiments.mdx b/src/pages/docs/dataset/features/experiments.mdx
@@ -77,7 +77,7 @@ Experiment creation is a guided three-step flow: **Basic Info → Configuration
       <Tab title="Custom models" icon="gear">
         Models you've added through Custom Models show up in the model picker for prompt configurations across all experiment types.
         <Tip>
-          See [Custom Models](/docs/evaluation/features/custom-models) for how to register a custom or self-hosted model.
+          See [Custom Models](/docs/evaluation/judge-models/custom) for how to register a custom or self-hosted model.
         </Tip>
       </Tab>
     </Tabs>
@@ -90,7 +90,7 @@ Experiment creation is a guided three-step flow: **Basic Info → Configuration
 
     **Compare against baseline (optional)**: pick a column from the dataset to compare model outputs against (typically a ground-truth or existing run-prompt column). Skip it if you don't have a reference output yet; you can still run the experiment, attach evals that don't need a baseline, and add a base column later by editing the experiment.
 
-    **Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/features/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable.
+    **Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/build/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable.
     ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/7.png)
 
     For each eval, map its inputs (e.g. `output`, `input`, `expected`) to the model output or to dataset columns. Mapping is required before the experiment can run.