diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png
index 73210f02..58e3c804 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png differ
diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png
index 19ae84cb..a4cd484e 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png differ
diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png
index a5e7602b..c617c25c 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png differ
diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png
index 12ddf231..a239538d 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png differ
diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png
index c10714ab..2eec4f4b 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png differ
diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png
index 72fe6ad0..29510fed 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png differ
diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png
index 1366a68e..c2b950b2 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png differ
diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png
index eebc3ab3..9952a0ff 100644
Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png differ
diff --git a/src/components/docs/ApiExplorer.astro b/src/components/docs/ApiExplorer.astro
index ed80aaa3..9f0e3398 100644
--- a/src/components/docs/ApiExplorer.astro
+++ b/src/components/docs/ApiExplorer.astro
@@ -9,16 +9,27 @@
  * target pages and extracts .apg data to update the drawer in-place (no page reload).
  */
 import { tabNavigation } from '../../lib/navigation.ts';
+import { apiNavigation } from '../../lib/api-navigation.ts';
+// Build an href → HTTP-method lookup from api-navigation.ts (the source of truth for HTTP methods); entries lacking an href or a method are skipped
+const methodByHref: Record<string, string> = {};
+for (const group of apiNavigation) {
+  for (const item of group.items) {
+    if (item.href && item.method) {
+      methodByHref[item.href] = item.method;
+    }
+  }
+}
 
 // Extract the API tab navigation at build time
 const apiTab = tabNavigation.find(t => t.tab === 'API');
 const apiGroups = apiTab ? apiTab.groups : [];
 
 // Flatten the API groups into a structure suitable for the sidebar:
-// { group: string, items: { title: string, href: string, method?: string }[] }[]
+// { group: string, items: { title: string, href: string, method: string }[] }[]
 interface SidebarEndpoint {
   title: string;
   href: string;
+  method: string; // HTTP method shown in the sidebar method badge
 }
 interface SidebarGroup {
   group: string;
@@ -29,7 +40,7 @@ function flattenItems(items: any[]): SidebarEndpoint[] {
   const result: SidebarEndpoint[] = [];
   for (const item of items) {
     if (item.href && item.href !== '/docs/api') {
-      result.push({ title: item.title, href: item.href });
+      result.push({ title: item.title, href: item.href, method: methodByHref[item.href] || '' }); // '' (not 'GET') when api-navigation.ts has no entry, so the client can fall back to inferMethodFromTitle()
     }
     if (item.items) {
       result.push(...flattenItems(item.items));
@@ -476,8 +487,8 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
 
       for (var j = 0; j < groupItems.length; j++) {
         var ep = groupItems[j];
-        // Infer method from title
-        var method = inferMethodFromTitle(ep.title);
+        // Use method from api-navigation.ts (baked in at build time), fall back to title inference
+        var method = ep.method || inferMethodFromTitle(ep.title);
         var isActive = ep.href === window.location.pathname.replace(/\/$/, '');
         html += '<a href="' + esc(ep.href) + '" class="apx-sidebar-link' + (isActive ? ' apx-active' : '') + '" data-href="' + esc(ep.href) + '">';
         html += '<span class="apx-sidebar-method apx-sm-' + method.toLowerCase() + '">' + shortMethod(method) + '</span>';
@@ -494,7 +505,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
 
   function inferMethodFromTitle(title) {
     var t = title.toLowerCase();
-    if (t.indexOf('create') === 0 || t.indexOf('add') === 0 || t.indexOf('submit') === 0 || t.indexOf('apply') === 0 || t.indexOf('bulk') === 0 || t.indexOf('upload') === 0 || t.indexOf('run') === 0 || t.indexOf('execute') === 0 || t.indexOf('send') === 0 || t.indexOf('generate') === 0 || t.indexOf('clone') === 0) return 'POST';
+    if (t.indexOf('create') === 0 || t.indexOf('add') === 0 || t.indexOf('submit') === 0 || t.indexOf('apply') === 0 || t.indexOf('bulk') === 0 || t.indexOf('upload') === 0 || t.indexOf('run') === 0 || t.indexOf('execute') === 0 || t.indexOf('send') === 0 || t.indexOf('generate') === 0 || t.indexOf('clone') === 0 || t.indexOf('rerun') === 0 || t.indexOf('cancel') === 0 || t.indexOf('duplicate') === 0 || t.indexOf('restore') === 0 || t.indexOf('pause') === 0 || t.indexOf('unpause') === 0 || t.indexOf('release') === 0 || t.indexOf('assign') === 0 || t.indexOf('complete') === 0 || t.indexOf('skip') === 0 || t.indexOf('fetch') === 0 || t.indexOf('export') === 0) return 'POST';
     if (t.indexOf('update') === 0 || t.indexOf('edit') === 0 || t.indexOf('merge') === 0) return 'PATCH';
     if (t.indexOf('delete') === 0 || t.indexOf('remove') === 0) return 'DELETE';
     return 'GET';
@@ -680,13 +691,29 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
   function buildBodyForm(body, method) {
     if (!bodySection || !bodyFields) return;
 
-    if (!body || typeof body !== 'object' || Array.isArray(body) || !/POST|PUT|PATCH/.test(method)) {
+    if (!/POST|PUT|PATCH|DELETE/.test(method)) {
       bodySection.style.display = 'none';
       bodyFields.innerHTML = '';
       if (bodyIntro) bodyIntro.style.display = 'none';
       return;
     }
 
+    // Merge ParamField body metadata keys into body so fields documented
+    // via <ParamField body="..."> but missing from requestBody still appear.
+    body = body && typeof body === 'object' && !Array.isArray(body) ? Object.assign({}, body) : {};
+    for (var mk in currentBodyMeta) {
+      if (currentBodyMeta.hasOwnProperty(mk) && mk.indexOf('.') === -1 && !(mk in body)) {
+        var mt = currentBodyMeta[mk].type || 'string';
+        if (/array|list/i.test(mt)) body[mk] = [];
+        else if (/object/i.test(mt)) body[mk] = {};
+        else if (/bool/i.test(mt)) body[mk] = false;
+        else if (/int|number|double|float/i.test(mt)) body[mk] = 0;
+        else body[mk] = '';
+      }
+    }
+    // Keep currentReqBody in sync so optional picker and code gen use merged body
+    currentReqBody = body;
+
     var keys = Object.keys(body);
     if (keys.length === 0) {
       bodySection.style.display = 'none';
@@ -2011,7 +2038,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
       } else {
         lines.push('  -H "Authorization: Bearer ' + authInfo.token + '"');
       }
-      if (body && /POST|PUT|PATCH/.test(method)) {
+      if (body && /POST|PUT|PATCH|DELETE/.test(method)) {
         lines[lines.length - 1] += ' \\';
         lines.push('  -H "Content-Type: application/json" \\');
         lines.push("  -d '" + JSON.stringify(body, null, 2).replace(/'/g, "'\\''") + "'");
@@ -2026,7 +2053,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
       if (parts.length > 1 && method === 'GET') sdkMethod = 'get';
       var apiKeyVal = isApiKey ? authInfo.apiKey : authInfo.token;
       var lines = ['from fi.client import FutureAGI', '', 'client = FutureAGI(api_key="' + apiKeyVal + '")', ''];
-      if (body && typeof body === 'object' && /POST|PUT|PATCH/.test(method)) {
+      if (body && typeof body === 'object' && /POST|PUT|PATCH|DELETE/.test(method)) {
         lines.push('response = client.' + resource + '.' + sdkMethod + '(');
         var bKeys = Object.keys(body);
         for (var ki = 0; ki < bKeys.length; ki++) {
@@ -2047,16 +2074,16 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
         headerStr = '    headers={"Authorization": "Bearer ' + authInfo.token + '"},';
       }
       var lines = ['import requests', '', 'response = requests.' + method.toLowerCase() + '(', '    "' + url + '",', headerStr];
-      if (body && /POST|PUT|PATCH/.test(method)) lines.push('    json=' + JSON.stringify(body, null, 4) + ',');
+      if (body && /POST|PUT|PATCH|DELETE/.test(method)) lines.push('    json=' + JSON.stringify(body, null, 4) + ',');
       lines.push(')'); lines.push('print(response.json())');
       return lines.join('\n');
     }
     if (lang === 'go') {
       var lines = ['package main', '', 'import (', '    "bytes"', '    "fmt"', '    "io"', '    "net/http"'];
-      if (body && /POST|PUT|PATCH/.test(method)) lines.push('    "encoding/json"');
+      if (body && /POST|PUT|PATCH|DELETE/.test(method)) lines.push('    "encoding/json"');
       lines.push(')', '');
       lines.push('func main() {');
-      if (body && /POST|PUT|PATCH/.test(method)) {
+      if (body && /POST|PUT|PATCH|DELETE/.test(method)) {
         lines.push('    body, _ := json.Marshal(' + JSON.stringify(body) + ')');
         lines.push('    req, _ := http.NewRequest("' + method + '", "' + url + '", bytes.NewBuffer(body))');
       } else {
@@ -2089,7 +2116,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
         builderLines.push('    .header("Authorization", "Bearer ' + authInfo.token + '")');
       }
       builderLines.push('    .header("Content-Type", "application/json")');
-      if (body && /POST|PUT|PATCH/.test(method)) {
+      if (body && /POST|PUT|PATCH|DELETE/.test(method)) {
         builderLines.push('    .' + method + '(HttpRequest.BodyPublishers.ofString(' + JSON.stringify(JSON.stringify(body)) + '))');
       } else if (method === 'DELETE') {
         builderLines.push('    .DELETE()');
@@ -2105,7 +2132,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
     }
     if (lang === 'php') {
       var lines = ['<?php', '', '$ch = curl_init();', 'curl_setopt($ch, CURLOPT_URL, "' + url + '");', 'curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);'];
-      if (/POST|PUT|PATCH/.test(method)) {
+      if (/POST|PUT|PATCH|DELETE/.test(method)) {
         lines.push('curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "' + method + '");');
       } else if (method === 'DELETE') {
         lines.push('curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "DELETE");');
@@ -2121,7 +2148,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
       lines.push('curl_setopt($ch, CURLOPT_HTTPHEADER, [');
       lines.push(phpHeaders.join(',\n'));
       lines.push(']);');
-      if (body && /POST|PUT|PATCH/.test(method)) {
+      if (body && /POST|PUT|PATCH|DELETE/.test(method)) {
         lines.push("curl_setopt($ch, CURLOPT_POSTFIELDS, '" + JSON.stringify(body, null, 2).replace(/'/g, "\\'") + "');");
       }
       lines.push('');
@@ -2145,7 +2172,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
         lines.push('request["Authorization"] = "Bearer ' + authInfo.token + '"');
       }
       lines.push('request["Content-Type"] = "application/json"');
-      if (body && /POST|PUT|PATCH/.test(method)) {
+      if (body && /POST|PUT|PATCH|DELETE/.test(method)) {
         lines.push('request.body = ' + JSON.stringify(JSON.stringify(body, null, 2)));
       }
       lines.push('');
@@ -2163,7 +2190,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups);
     var lines = ['const response = await fetch("' + url + '", {', '  method: "' + method + '",', '  headers: {'];
     lines = lines.concat(jsHeaders);
     lines.push('  },');
-    if (body && /POST|PUT|PATCH/.test(method)) lines.push('  body: JSON.stringify(' + JSON.stringify(body, null, 4) + '),');
+    if (body && /POST|PUT|PATCH|DELETE/.test(method)) lines.push('  body: JSON.stringify(' + JSON.stringify(body, null, 4) + '),');
     lines.push('});'); lines.push('const data = await response.json();');
     return lines.join('\n');
   }
diff --git a/src/components/docs/ApiPlayground.astro b/src/components/docs/ApiPlayground.astro
index 3db47281..6c2174ee 100644
--- a/src/components/docs/ApiPlayground.astro
+++ b/src/components/docs/ApiPlayground.astro
@@ -200,7 +200,7 @@ const respJson = responseExample ? JSON.stringify(responseExample, null, 2) : nu
     var b=hiddenFields ? Object.assign({}, requestBody, hiddenFields) : requestBody;
     if(lang==='curl'){
       var r=['curl '+u+' \\','  -H "Authorization: Bearer '+t+'"'];
-      if(b&&/POST|PUT|PATCH/.test(m)){
+      if(b&&/POST|PUT|PATCH|DELETE/.test(m)){
         r[r.length-1]+=' \\';
         r.push('  -H "Content-Type: application/json" \\');
         r.push("  -d '"+JSON.stringify(b,null,2)+"'");
@@ -221,7 +221,7 @@ const respJson = responseExample ? JSON.stringify(responseExample, null, 2) : nu
       var r = ['from fi.client import FutureAGI', ''];
       r.push('client = FutureAGI(api_key="' + t + '")');
       r.push('');
-      if (b && /POST|PUT|PATCH/.test(m)) {
+      if (b && /POST|PUT|PATCH|DELETE/.test(m)) {
         r.push('response = client.' + resource + '.' + sdkMethod + '(');
         var bKeys = Object.keys(b);
         for (var ki = 0; ki < bKeys.length; ki++) {
@@ -238,13 +238,13 @@ const respJson = responseExample ? JSON.stringify(responseExample, null, 2) : nu
     }
     if(lang==='python'){
       var r=['import requests','','response = requests.'+m.toLowerCase()+'(','    "'+u+'",','    headers={"Authorization": "Bearer '+t+'"},'];
-      if(b&&/POST|PUT|PATCH/.test(m)) r.push('    json='+JSON.stringify(b,null,4)+',');
+      if(b&&/POST|PUT|PATCH|DELETE/.test(m)) r.push('    json='+JSON.stringify(b,null,4)+',');
       r.push(')');r.push('print(response.json())');
       return r.join('\n');
     }
     // javascript
     var r=['const response = await fetch("'+u+'", {','  method: "'+m+'",','  headers: {','    "Authorization": "Bearer '+t+'",','    "Content-Type": "application/json",','  },'];
-    if(b&&/POST|PUT|PATCH/.test(m)) r.push('  body: JSON.stringify('+JSON.stringify(b,null,4)+'),');
+    if(b&&/POST|PUT|PATCH|DELETE/.test(m)) r.push('  body: JSON.stringify('+JSON.stringify(b,null,4)+'),');
     r.push('});');r.push('const data = await response.json();');
     return r.join('\n');
   }
diff --git a/src/lib/api-navigation.ts b/src/lib/api-navigation.ts
index 1d37e675..f5ae3e9d 100644
--- a/src/lib/api-navigation.ts
+++ b/src/lib/api-navigation.ts
@@ -267,11 +267,6 @@ export const apiNavigation: ApiNavGroup[] = [
         "href": "/docs/api/test-executions/getperformancesummary",
         "method": "GET"
       },
-      {
-        "title": "Get eval explanation summary",
-        "href": "/docs/api/test-executions/getevalexplanationsummary",
-        "method": "GET"
-      },
       {
         "title": "Cancel test execution",
         "href": "/docs/api/test-executions/cancelexecution",
@@ -281,20 +276,10 @@ export const apiNavigation: ApiNavGroup[] = [
         "title": "Rerun call executions",
         "href": "/docs/api/test-executions/reruncalls",
         "method": "POST"
-      }
-    ]
-  },
-  {
-    "title": "Call Executions",
-    "items": [
-      {
-        "title": "Get call execution details",
-        "href": "/docs/api/call-executions/getcallexecutiondetails",
-        "method": "GET"
       },
       {
-        "title": "Compare execution sessions",
-        "href": "/docs/api/call-executions/getsessioncomparison",
+        "title": "Get call execution details",
+        "href": "/docs/api/test-executions/getcallexecutiondetails",
         "method": "GET"
       }
     ]
diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts
index ec0da150..16b415c2 100644
--- a/src/lib/navigation.ts
+++ b/src/lib/navigation.ts
@@ -1112,16 +1112,9 @@ export const tabNavigation: NavTab[] = [
               { title: 'Get Execution Details', href: '/docs/api/test-executions/gettestexecutiondetails' },
               { title: 'Get Execution KPIs', href: '/docs/api/test-executions/getkpis' },
               { title: 'Get Performance Summary', href: '/docs/api/test-executions/getperformancesummary' },
-              { title: 'Get Eval Explanation', href: '/docs/api/test-executions/getevalexplanationsummary' },
               { title: 'Cancel Execution', href: '/docs/api/test-executions/cancelexecution' },
               { title: 'Rerun Calls', href: '/docs/api/test-executions/reruncalls' },
-            ]
-          },
-          {
-            title: 'Call Executions',
-            items: [
-              { title: 'Get Call Details', href: '/docs/api/call-executions/getcallexecutiondetails' },
-              { title: 'Compare Sessions', href: '/docs/api/call-executions/getsessioncomparison' },
+              { title: 'Get Call Details', href: '/docs/api/test-executions/getcallexecutiondetails' },
             ]
           },
           {
diff --git a/src/pages/docs/api/call-executions/getsessioncomparison.mdx b/src/pages/docs/api/call-executions/getsessioncomparison.mdx
deleted file mode 100644
index 9861468a..00000000
--- a/src/pages/docs/api/call-executions/getsessioncomparison.mdx
+++ /dev/null
@@ -1,84 +0,0 @@
----
-title: "Compare execution sessions"
-description: "Compares a call execution with its most recent rerun snapshot."
----
-
-<ApiPlayground
-  method="GET"
-  endpoint="/simulate/call-executions/{call_execution_id}/session-comparison/"
-  baseUrl="https://api.futureagi.com"
-  parameters={[
-    {"name": "call_execution_id", "in": "path", "required": true, "description": "UUID of the call execution.", "type": "string"}
-  ]}
-  responseExample={{
-    metrics_comparison: {
-      current: { avg_agent_latency: 0.85, talk_ratio: 0.55 },
-      previous: { avg_agent_latency: 1.12, talk_ratio: 0.48 }
-    },
-    transcript_comparison: {
-      current: [
-        { role: "assistant", content: "Hello, how can I help you today?" },
-        { role: "user", content: "I have a billing question." }
-      ],
-      previous: [
-        { role: "assistant", content: "Hi there, what can I do for you?" },
-        { role: "user", content: "I need help with my bill." }
-      ]
-    },
-    recording_urls: {
-      current: "https://storage.example.com/recordings/call-v2.wav",
-      previous: "https://storage.example.com/recordings/call-v1.wav"
-    },
-    eval_comparison: {
-      current: { tone_check: { score: 9 } },
-      previous: { tone_check: { score: 7 } }
-    }
-  }}
-  responseStatus={200}
-  responseStatusText="OK"
-/>
-
-<ApiSection title="Authentication">
-  <ParamField name="X-Api-Key" type="API Key" required>
-    Your Future AGI API key used to authenticate requests. You can find and manage your API keys in the [Dashboard](https://app.futureagi.com) under Settings.
-  </ParamField>
-  <ParamField name="X-Secret-Key" type="Secret Key" required>
-    Your Future AGI secret key, used alongside the API key for request authentication. This is generated when you create an API key in the [Dashboard](https://app.futureagi.com).
-  </ParamField>
-</ApiSection>
-
-<ApiSection title="Path parameters">
-  <ParamField path="call_execution_id" type="UUID" required>
-    The call execution ID. Must have at least one rerun snapshot.
-  </ParamField>
-</ApiSection>
-
-<ApiSection title="Response" status={200} statusText="OK">
-  <ResponseField name="metrics_comparison" type="object">Performance metrics for current and previous sessions.</ResponseField>
-  <ApiCollapsible title="Show 2 properties">
-    <ResponseField name="current" type="object">Current execution metrics.</ResponseField>
-    <ResponseField name="previous" type="object">Previous rerun snapshot metrics.</ResponseField>
-  </ApiCollapsible>
-  <ResponseField name="transcript_comparison" type="object">Transcripts for current and previous sessions.</ResponseField>
-  <ApiCollapsible title="Show 2 properties">
-    <ResponseField name="current" type="array of objects">Current session transcript with `role` and `content`.</ResponseField>
-    <ResponseField name="previous" type="array of objects">Previous rerun snapshot transcript.</ResponseField>
-  </ApiCollapsible>
-  <ResponseField name="recording_urls" type="object">Audio recording URLs. Voice only.</ResponseField>
-  <ApiCollapsible title="Show 2 properties">
-    <ResponseField name="current" type="string">Current execution recording URL.</ResponseField>
-    <ResponseField name="previous" type="string">Previous rerun snapshot recording URL.</ResponseField>
-  </ApiCollapsible>
-  <ResponseField name="eval_comparison" type="object">Evaluation results for both sessions.</ResponseField>
-  <ApiCollapsible title="Show 2 properties">
-    <ResponseField name="current" type="object">Current execution eval results.</ResponseField>
-    <ResponseField name="previous" type="object">Previous rerun snapshot eval results.</ResponseField>
-  </ApiCollapsible>
-</ApiSection>
-
-<ApiSection title="Errors">
-  <ParamField name="400" type="Bad Request">No rerun snapshot available for comparison.</ParamField>
-  <ParamField name="401" type="Unauthorized">Invalid or missing credentials.</ParamField>
-  <ParamField name="404" type="Not Found">Call execution not found.</ParamField>
-  <ParamField name="500" type="Internal Server Error">Unexpected server error.</ParamField>
-</ApiSection>
diff --git a/src/pages/docs/api/index.mdx b/src/pages/docs/api/index.mdx
index 4eb23010..6bbc4508 100644
--- a/src/pages/docs/api/index.mdx
+++ b/src/pages/docs/api/index.mdx
@@ -50,9 +50,6 @@ Get your API key from the [Future AGI Dashboard](https://app.futureagi.com/setti
   <Card title="Test Executions" href="/docs/api/test-executions/gettestexecutiondetails" icon="code">
     Test execution tracking and analytics
   </Card>
-  <Card title="Call Executions" href="/docs/api/call-executions/getcallexecutiondetails" icon="code">
-    Individual call execution details
-  </Card>
   <Card title="Datasets" href="/docs/api/datasets/create-dataset" icon="code">
     Dataset creation, modification, and data management
   </Card>
diff --git a/src/pages/docs/api/run-tests/deletetestexecutions.mdx b/src/pages/docs/api/run-tests/deletetestexecutions.mdx
index 6855a0c9..bd748e69 100644
--- a/src/pages/docs/api/run-tests/deletetestexecutions.mdx
+++ b/src/pages/docs/api/run-tests/deletetestexecutions.mdx
@@ -10,8 +10,8 @@ description: "Bulk-deletes test executions from a test run."
   parameters={[
     {"name": "run_test_id", "in": "path", "required": true, "description": "UUID of the test run from which to delete test executions.", "type": "string"}
   ]}
-  requestBody={{"testExecutionIds": ["execution-uuid-1", "execution-uuid-2"], "selectAll": false}}
-  responseExample={{"message": "Successfully deleted 2 test execution(s).", "runTestId": "run-test-uuid", "deletedCount": 2, "deletedIds": ["execution-uuid-1", "execution-uuid-2"]}}
+  requestBody={{"test_execution_ids": ["execution-uuid-1", "execution-uuid-2"], "select_all": false}}
+  responseExample={{"message": "Successfully deleted 2 test execution(s).", "run_test_id": "run-test-uuid", "deleted_count": 2, "deleted_ids": ["execution-uuid-1", "execution-uuid-2"]}}
   responseStatus={200}
   responseStatusText="OK"
 />
@@ -32,19 +32,19 @@ description: "Bulk-deletes test executions from a test run."
 </ApiSection>
 
 <ApiSection title="Request body">
-  <ParamField body="testExecutionIds" type="array of strings" optional>
-    Array of test execution UUIDs to delete. Required when `selectAll` is `false`. Executions in `RUNNING`, `PENDING`, or `CANCELLING` status cannot be deleted.
+  <ParamField body="test_execution_ids" type="array of strings" optional>
+    Array of test execution UUIDs to delete. Required when `select_all` is `false`. Executions in `RUNNING`, `PENDING`, or `CANCELLING` status cannot be deleted.
   </ParamField>
-  <ParamField body="selectAll" type="boolean" optional>
-    When `true`, deletes all eligible executions, ignoring `testExecutionIds`. Defaults to `false`.
+  <ParamField body="select_all" type="boolean" optional>
+    When `true`, deletes all eligible executions, ignoring `test_execution_ids`. Defaults to `false`.
   </ParamField>
 </ApiSection>
 
 <ApiSection title="Response" status={200} statusText="OK">
   <ResponseField name="message" type="string">Confirmation message with deletion count.</ResponseField>
-  <ResponseField name="runTestId" type="string">UUID of the parent test run.</ResponseField>
-  <ResponseField name="deletedCount" type="integer">Number of executions deleted.</ResponseField>
-  <ResponseField name="deletedIds" type="array of strings">UUIDs of the deleted executions.</ResponseField>
+  <ResponseField name="run_test_id" type="string">UUID of the parent test run.</ResponseField>
+  <ResponseField name="deleted_count" type="integer">Number of executions deleted.</ResponseField>
+  <ResponseField name="deleted_ids" type="array of strings">UUIDs of the deleted executions.</ResponseField>
 </ApiSection>
 
 <ApiSection title="Errors">
diff --git a/src/pages/docs/api/test-executions/cancelexecution.mdx b/src/pages/docs/api/test-executions/cancelexecution.mdx
index be2f3888..724759b0 100644
--- a/src/pages/docs/api/test-executions/cancelexecution.mdx
+++ b/src/pages/docs/api/test-executions/cancelexecution.mdx
@@ -10,7 +10,6 @@ description: "Cancels a test execution."
   parameters={[
     {"name": "test_execution_id", "in": "path", "required": true, "description": "UUID of the test execution to cancel.", "type": "string"}
   ]}
-  requestBody={{}}
   responseExample={{
     success: true,
     message: "Test execution cancellation initiated",
@@ -35,11 +34,6 @@ description: "Cancels a test execution."
   </ParamField>
 </ApiSection>
 
-<ApiSection title="Request body">
-  <ParamField body="(empty)" type="object" optional>
-    No body required. Send `{}`.
-  </ParamField>
-</ApiSection>
 
 <ApiSection title="Response" status={200} statusText="OK">
   <ResponseField name="success" type="boolean">Whether the cancellation was accepted.</ResponseField>
diff --git a/src/pages/docs/api/call-executions/getcallexecutiondetails.mdx b/src/pages/docs/api/test-executions/getcallexecutiondetails.mdx
similarity index 75%
rename from src/pages/docs/api/call-executions/getcallexecutiondetails.mdx
rename to src/pages/docs/api/test-executions/getcallexecutiondetails.mdx
index 33e8eea0..3f1632ca 100644
--- a/src/pages/docs/api/call-executions/getcallexecutiondetails.mdx
+++ b/src/pages/docs/api/test-executions/getcallexecutiondetails.mdx
@@ -35,26 +35,46 @@ description: "Retrieves a specific call execution."
     customer_name: "Jane Doe",
     call_summary: "Customer inquired about billing charges.",
     ended_reason: "customer_hangup",
-    simulatorAgentName: "Billing Simulator",
-    simulatorAgentId: "sim-agent-uuid",
+    simulator_agent_name: "Billing Simulator",
+    simulator_agent_id: "sim-agent-uuid",
     agent_definition_used_name: "Support Agent v2",
     agent_definition_used_id: "agent-def-uuid",
     tool_outputs: null,
     rerun_snapshots: [],
+    provider: "vapi",
+    phone_number: "+14155550100",
+    simulation_call_type: "voice",
+    processing_skipped: false,
+    processing_skip_reason: null,
+    is_snapshot: false,
+    snapshot_timestamp: null,
+    rerun_type: null,
+    original_call_execution_id: null,
     avg_agent_latency: 0.85,
     user_interruption_count: 1,
     user_interruption_rate: 0.05,
     user_wpm: 130,
     bot_wpm: 145,
     talk_ratio: 0.55,
+    agent_talk_percentage: 55.0,
     ai_interruption_count: 0,
     ai_interruption_rate: 0.0,
     avg_stop_time_after_interruption: 0.3,
+    total_tokens: null,
+    input_tokens: null,
+    output_tokens: null,
+    avg_latency_ms: null,
+    turn_count: null,
+    csat_score: null,
     stt_cost: 0.012,
     llm_cost: 0.045,
     tts_cost: 0.008,
     storage_cost: 0.002,
-    total_cost: 0.067
+    total_cost: 0.067,
+    customer_cost_cents: null,
+    customer_cost_breakdown: null,
+    customer_latency_metrics: null,
+    customer_call_id: null
   }}
   responseStatus={200}
   responseStatusText="OK"
@@ -101,12 +121,21 @@ description: "Retrieves a specific call execution."
   <ResponseField name="customer_name" type="string or null">Simulated customer persona name.</ResponseField>
   <ResponseField name="call_summary" type="string or null">AI-generated conversation summary.</ResponseField>
   <ResponseField name="ended_reason" type="string or null">Reason the call ended, e.g. `customer_hangup`, `agent_hangup`, `timeout`, `error`.</ResponseField>
-  <ResponseField name="simulatorAgentName" type="string or null">Simulator agent name.</ResponseField>
-  <ResponseField name="simulatorAgentId" type="string or null">UUID of the simulator agent.</ResponseField>
+  <ResponseField name="simulator_agent_name" type="string or null">Simulator agent name.</ResponseField>
+  <ResponseField name="simulator_agent_id" type="string or null">UUID of the simulator agent.</ResponseField>
   <ResponseField name="agent_definition_used_name" type="string or null">Agent definition name.</ResponseField>
   <ResponseField name="agent_definition_used_id" type="string or null">UUID of the agent definition.</ResponseField>
   <ResponseField name="tool_outputs" type="object or null">Tool call outputs from the conversation.</ResponseField>
   <ResponseField name="rerun_snapshots" type="array">Snapshots from previous reruns.</ResponseField>
+  <ResponseField name="provider" type="string or null">Telephony or chat provider used for this call, e.g. `vapi`, `retell`.</ResponseField>
+  <ResponseField name="phone_number" type="string or null">Phone number dialed for this call. Voice only.</ResponseField>
+  <ResponseField name="simulation_call_type" type="string or null">Simulation mode: `voice` or `text`.</ResponseField>
+  <ResponseField name="processing_skipped" type="boolean or null">Whether post-call processing was skipped.</ResponseField>
+  <ResponseField name="processing_skip_reason" type="string or null">Reason processing was skipped, if applicable.</ResponseField>
+  <ResponseField name="is_snapshot" type="boolean">Whether this record is a rerun snapshot rather than the live call.</ResponseField>
+  <ResponseField name="snapshot_timestamp" type="datetime or null">Timestamp when the snapshot was taken.</ResponseField>
+  <ResponseField name="rerun_type" type="string or null">Type of the most recent rerun: `eval_only` or `call_and_eval`. `null` if never rerun.</ResponseField>
+  <ResponseField name="original_call_execution_id" type="string or null">UUID of the original call execution this is a snapshot of.</ResponseField>
   <ResponseField name="avg_agent_latency" type="number or null">Average agent response latency in seconds. Voice only.</ResponseField>
   <ResponseField name="user_interruption_count" type="integer or null">User interruption count. Voice only.</ResponseField>
   <ResponseField name="user_interruption_rate" type="number or null">Proportion of agent turns interrupted by user (0-1). Voice only.</ResponseField>
@@ -121,7 +150,12 @@ description: "Retrieves a specific call execution."
   <ResponseField name="output_tokens" type="integer or null">Output tokens generated. Text only.</ResponseField>
   <ResponseField name="avg_latency_ms" type="number or null">Average response latency in milliseconds. Text only.</ResponseField>
   <ResponseField name="turn_count" type="integer or null">Total conversation turns. Text only.</ResponseField>
+  <ResponseField name="agent_talk_percentage" type="number or null">Percentage of conversation time the agent was talking (0-100). Voice only.</ResponseField>
   <ResponseField name="csat_score" type="number or null">Customer satisfaction score. Text only.</ResponseField>
+  <ResponseField name="customer_cost_cents" type="integer or null">Cost of the call in cents as reported by the customer's telephony provider.</ResponseField>
+  <ResponseField name="customer_cost_breakdown" type="object or null">Detailed cost breakdown from the customer's provider.</ResponseField>
+  <ResponseField name="customer_latency_metrics" type="object or null">Latency metrics as reported by the customer's provider.</ResponseField>
+  <ResponseField name="customer_call_id" type="string or null">Call ID assigned by the customer's telephony provider.</ResponseField>
   <ResponseField name="stt_cost" type="number or null">Speech-to-text cost in USD. Voice only.</ResponseField>
   <ResponseField name="llm_cost" type="number or null">LLM inference cost in USD.</ResponseField>
   <ResponseField name="tts_cost" type="number or null">Text-to-speech cost in USD. Voice only.</ResponseField>
diff --git a/src/pages/docs/api/test-executions/getevalexplanationsummary.mdx b/src/pages/docs/api/test-executions/getevalexplanationsummary.mdx
deleted file mode 100644
index 21bc87ce..00000000
--- a/src/pages/docs/api/test-executions/getevalexplanationsummary.mdx
+++ /dev/null
@@ -1,58 +0,0 @@
----
-title: "Get eval explanation summary"
-description: "Retrieves the eval explanation summary for a test execution."
----
-
-<ApiPlayground
-  method="GET"
-  endpoint="/simulate/test-executions/{test_execution_id}/eval-explanation-summary/"
-  baseUrl="https://api.futureagi.com"
-  parameters={[
-    {"name": "test_execution_id", "in": "path", "required": true, "description": "UUID of the test execution.", "type": "string"}
-  ]}
-  responseExample={{
-    data: {
-      response: {
-        summary: "The agent performed well on tone and accuracy but struggled with complex billing disputes.",
-        critical_issues: ["Agent failed to escalate unresolved billing complaints in 3 out of 10 calls."],
-        recommendations: ["Add escalation logic for billing disputes exceeding $500."]
-      },
-      last_updated: "2026-03-15T12:45:00Z",
-      status: "completed"
-    },
-    success: true
-  }}
-  responseStatus={200}
-  responseStatusText="OK"
-/>
-
-<ApiSection title="Authentication">
-  <ParamField name="X-Api-Key" type="API Key" required>
-    Your Future AGI API key used to authenticate requests. You can find and manage your API keys in the [Dashboard](https://app.futureagi.com) under Settings.
-  </ParamField>
-  <ParamField name="X-Secret-Key" type="Secret Key" required>
-    Your Future AGI secret key, used alongside the API key for request authentication. This is generated when you create an API key in the [Dashboard](https://app.futureagi.com).
-  </ParamField>
-</ApiSection>
-
-<ApiSection title="Path parameters">
-  <ParamField path="test_execution_id" type="UUID" required>
-    The test execution ID. Triggers async generation if not yet available.
-  </ParamField>
-</ApiSection>
-
-<ApiSection title="Response" status={200} statusText="OK">
-  <ResponseField name="data" type="object">Summary data and generation metadata.</ResponseField>
-  <ApiCollapsible title="Show 3 properties">
-    <ResponseField name="response" type="object or null">Summary with performance overview, issues, and recommendations.</ResponseField>
-    <ResponseField name="last_updated" type="datetime or null">ISO 8601 timestamp of last generation.</ResponseField>
-    <ResponseField name="status" type="string">Generation status: `pending`, `running`, `completed`, or `failed`.</ResponseField>
-  </ApiCollapsible>
-  <ResponseField name="success" type="boolean">Whether the request succeeded.</ResponseField>
-</ApiSection>
-
-<ApiSection title="Errors">
-  <ParamField name="401" type="Unauthorized">Invalid or missing credentials.</ParamField>
-  <ParamField name="404" type="Not Found">Test execution not found.</ParamField>
-  <ParamField name="500" type="Internal Server Error">Unexpected server error.</ParamField>
-</ApiSection>
diff --git a/src/pages/docs/api/test-executions/getkpis.mdx b/src/pages/docs/api/test-executions/getkpis.mdx
index 4e5cfd91..90b06778 100644
--- a/src/pages/docs/api/test-executions/getkpis.mdx
+++ b/src/pages/docs/api/test-executions/getkpis.mdx
@@ -14,14 +14,14 @@ description: "Retrieves KPI metrics for a test execution."
     total_calls: 50,
     avg_score: 8.2,
     avg_response: 1.15,
-    callsAttempted: 50,
-    connectedCalls: 47,
-    callsConnectedPercentage: 94.0,
+    calls_attempted: 50,
+    connected_calls: 47,
+    calls_connected_percentage: 94.0,
     failed_calls: 3,
     total_duration: 6250,
     agent_type: "voice",
     is_inbound: false,
-    scenarioGraphs: {},
+    scenario_graphs: {},
     avg_agent_latency: 0.92,
     avg_user_interruption_count: 1.4,
     avg_user_interruption_rate: 0.08,
@@ -31,8 +31,8 @@ description: "Retrieves KPI metrics for a test execution."
     avg_ai_interruption_count: 0.3,
     avg_ai_interruption_rate: 0.02,
     avg_stop_time_after_interruption: 0.35,
-    agentTalkPercentage: 55.0,
-    customerTalkPercentage: 45.0,
+    agent_talk_percentage: 55.0,
+    customer_talk_percentage: 45.0,
     avg_tone_check: 8.7,
     avg_accuracy: 7.9
   }}
@@ -59,14 +59,14 @@ description: "Retrieves KPI metrics for a test execution."
   <ResponseField name="total_calls" type="integer">Total call executions.</ResponseField>
   <ResponseField name="avg_score" type="number">Average evaluation score across completed calls.</ResponseField>
   <ResponseField name="avg_response" type="number">Average response time in seconds.</ResponseField>
-  <ResponseField name="callsAttempted" type="integer">Total calls initiated.</ResponseField>
-  <ResponseField name="connectedCalls" type="integer">Calls that connected.</ResponseField>
-  <ResponseField name="callsConnectedPercentage" type="number">Percentage of calls that connected.</ResponseField>
+  <ResponseField name="calls_attempted" type="integer">Total calls initiated.</ResponseField>
+  <ResponseField name="connected_calls" type="integer">Calls that connected.</ResponseField>
+  <ResponseField name="calls_connected_percentage" type="number">Percentage of calls that connected.</ResponseField>
   <ResponseField name="failed_calls" type="integer">Calls that failed.</ResponseField>
   <ResponseField name="total_duration" type="integer">Combined duration in seconds.</ResponseField>
   <ResponseField name="agent_type" type="string">`voice` or `text`.</ResponseField>
   <ResponseField name="is_inbound" type="boolean or null">`true` for inbound, `false` for outbound. `null` for text agents.</ResponseField>
-  <ResponseField name="scenarioGraphs" type="object">Per-scenario performance data.</ResponseField>
+  <ResponseField name="scenario_graphs" type="object">Per-scenario performance data.</ResponseField>
   <ResponseField name="avg_agent_latency" type="number">Average agent latency in seconds. Voice only.</ResponseField>
   <ResponseField name="avg_user_interruption_count" type="number">Average user interruptions per call. Voice only.</ResponseField>
   <ResponseField name="avg_user_interruption_rate" type="number">Average user interruption rate (0-1). Voice only.</ResponseField>
@@ -76,8 +76,8 @@ description: "Retrieves KPI metrics for a test execution."
   <ResponseField name="avg_ai_interruption_count" type="number">Average agent interruptions per call. Voice only.</ResponseField>
   <ResponseField name="avg_ai_interruption_rate" type="number">Average agent interruption rate (0-1). Voice only.</ResponseField>
   <ResponseField name="avg_stop_time_after_interruption" type="number">Average seconds to stop after interruption. Voice only.</ResponseField>
-  <ResponseField name="agentTalkPercentage" type="number">Agent talk time percentage (0-100). Voice only.</ResponseField>
-  <ResponseField name="customerTalkPercentage" type="number">Customer talk time percentage (0-100). Voice only.</ResponseField>
+  <ResponseField name="agent_talk_percentage" type="number">Agent talk time percentage (0-100). Voice only.</ResponseField>
+  <ResponseField name="customer_talk_percentage" type="number">Customer talk time percentage (0-100). Voice only.</ResponseField>
   <ResponseField name="avg_total_tokens" type="number">Average total tokens per call. Text only.</ResponseField>
   <ResponseField name="avg_input_tokens" type="number">Average input tokens per call. Text only.</ResponseField>
   <ResponseField name="avg_output_tokens" type="number">Average output tokens per call. Text only.</ResponseField>
diff --git a/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx b/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx
index 0df874e3..f914bbbf 100644
--- a/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx
+++ b/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx
@@ -1,6 +1,6 @@
 ---
 title: "Get test execution details"
-description: "Retrieves a test execution with its call executions."
+description: "Retrieves a test execution with paginated call executions and column configuration."
 ---
 
 <ApiPlayground
@@ -9,37 +9,44 @@ description: "Retrieves a test execution with its call executions."
   baseUrl="https://api.futureagi.com"
   parameters={[
     {"name": "test_execution_id", "in": "path", "required": true, "description": "UUID of the test execution.", "type": "string"},
-    {"name": "search", "in": "query", "required": false, "description": "Filter call executions by search string.", "type": "string"},
+    {"name": "search", "in": "query", "required": false, "description": "Filter call executions by phone number or scenario name.", "type": "string"},
     {"name": "page", "in": "query", "required": false, "description": "Page number for pagination.", "type": "integer"},
-    {"name": "filters", "in": "query", "required": false, "description": "JSON-encoded filter array.", "type": "string"},
+    {"name": "limit", "in": "query", "required": false, "description": "Number of call executions per page. Defaults to 30.", "type": "integer"},
+    {"name": "filters", "in": "query", "required": false, "description": "JSON-encoded array of filter objects. Each item must have a column_id and a filter_config object containing filter_type, filter_op, and filter_value.", "type": "string"},
     {"name": "row_groups", "in": "query", "required": false, "description": "JSON-encoded array of column IDs to group by.", "type": "string"},
     {"name": "group_keys", "in": "query", "required": false, "description": "JSON-encoded array of group key values.", "type": "string"}
   ]}
   responseExample={{
-    id: "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
-    run_test: "f7a8b9c0-d1e2-3456-789a-bcdef0123456",
-    run_test_name: "Billing Support Test",
-    agent_definition_name: "Support Agent v2",
+    count: 50,
+    next: "https://api.futureagi.com/simulate/test-executions/a1b2c3d4-e5f6-7890-abcd-ef1234567890/?page=2",
+    previous: null,
+    total_pages: 2,
+    current_page: 1,
+    results: [
+      {
+        id: "call-exec-uuid-1",
+        status: "completed",
+        duration: 45.2,
+        overall_score: 8.5,
+        eval_outputs: {},
+        scenario: "Billing Inquiry",
+        created_at: "2026-03-15T10:05:00Z"
+      }
+    ],
+    column_order: [
+      {
+        id: "col-uuid-1",
+        column_name: "status",
+        visible: true,
+        data_type: "string",
+        type: "scenario_dataset_column",
+        dataset_id: "dataset-uuid"
+      }
+    ],
     status: "completed",
-    error_reason: null,
-    started_at: "2026-03-15T10:00:00Z",
-    completed_at: "2026-03-15T10:45:00Z",
-    total_scenarios: 5,
-    total_calls: 50,
-    completed_calls: 47,
-    failed_calls: 3,
-    execution_metadata: {},
-    duration_seconds: 2700,
-    success_rate: 94.0,
-    calls: [],
-    created_at: "2026-03-15T09:58:00Z",
-    scenario_ids: ["scen-uuid-1", "scen-uuid-2"],
-    simulator_agent_name: "Default Simulator",
-    simulator_agent_id: "sim-uuid",
-    agent_definition_used_name: "Support Agent v2",
-    agent_definition_used_id: "agent-def-uuid",
-    calls_attempted: 50,
-    calls_connected_percentage: 94.0
+    error_messages: [],
+    provider: "vapi",
+    agent_type: "voice"
   }}
   responseStatus={200}
   responseStatusText="OK"
@@ -62,13 +69,48 @@ description: "Retrieves a test execution with its call executions."
 
 <ApiSection title="Query parameters">
   <ParamField query="search" type="string" optional>
-    Filter by scenario name, transcript content, or status.
+    Filter call executions by phone number or scenario name.
   </ParamField>
   <ParamField query="page" type="integer" optional>
     Page number. Defaults to `1`.
   </ParamField>
+  <ParamField query="limit" type="integer" optional>
+    Number of call executions per page. Defaults to `30`.
+  </ParamField>
   <ParamField query="filters" type="string" optional>
-    JSON-encoded filter array, e.g. `[{"colId":"status","filterType":"text","type":"equals","filter":"completed"}]`.
+    JSON-encoded array of filter objects. Each object must contain a `column_id` and a `filter_config` object.
+
+    **Structure:**
+    ```json
+    [
+      {
+        "column_id": "<column>",
+        "filter_config": {
+          "filter_type": "<type>",
+          "filter_op": "<operator>",
+          "filter_value": "<value>"
+        }
+      }
+    ]
+    ```
+
+    **`column_id` values:** `status`, `timestamp`, `call_execution_id`, `overall_score`, `response_time`, `call_type`, `scenario`, or an eval config UUID.
+
+    **`filter_type` values:** `text`, `number`, `datetime`, `boolean`, `list`.
+
+    **`filter_op` values:** `equals`, `not_equals`, `contains`, `not_contains`, `greater_than`, `less_than`, `greater_than_or_equal`, `less_than_or_equal`, `between`, `not_in_between`, `in`.
+
+    **`filter_value`:** A string, number, ISO 8601 datetime string, or array (for the `between`, `not_in_between`, and `in` operators).
+
+    **Example — filter by status:**
+    ```json
+    [{"column_id":"status","filter_config":{"filter_type":"text","filter_op":"equals","filter_value":"completed"}}]
+    ```
+
+    **Example — filter by score range:**
+    ```json
+    [{"column_id":"overall_score","filter_config":{"filter_type":"number","filter_op":"between","filter_value":[5,9]}}]
+    ```
   </ParamField>
   <ParamField query="row_groups" type="string" optional>
     JSON-encoded array of column IDs to group by, e.g. `["scenario"]`.
@@ -79,23 +121,13 @@ description: "Retrieves a test execution with its call executions."
 </ApiSection>
 
 <ApiSection title="Response" status={200} statusText="OK">
-  <ResponseField name="id" type="string">UUID of the test execution.</ResponseField>
-  <ResponseField name="run_test" type="string">UUID of the parent run test.</ResponseField>
-  <ResponseField name="run_test_name" type="string">Parent run test name.</ResponseField>
-  <ResponseField name="agent_definition_name" type="string">Agent definition name.</ResponseField>
-  <ResponseField name="status" type="string">Status: `pending`, `running`, `completed`, `failed`, `cancelled`, `cancelling`, or `evaluating`.</ResponseField>
-  <ResponseField name="error_reason" type="string or null">Failure reason.</ResponseField>
-  <ResponseField name="started_at" type="datetime">ISO 8601 execution start time.</ResponseField>
-  <ResponseField name="completed_at" type="datetime or null">ISO 8601 completion time.</ResponseField>
-  <ResponseField name="total_scenarios" type="integer">Number of distinct scenarios.</ResponseField>
-  <ResponseField name="total_calls" type="integer">Total call executions created.</ResponseField>
-  <ResponseField name="completed_calls" type="integer">Calls that completed.</ResponseField>
-  <ResponseField name="failed_calls" type="integer">Calls that failed.</ResponseField>
-  <ResponseField name="execution_metadata" type="object">Execution metadata.</ResponseField>
-  <ResponseField name="duration_seconds" type="integer or null">Elapsed time in seconds.</ResponseField>
-  <ResponseField name="success_rate" type="number or null">Percentage of calls completed successfully.</ResponseField>
-  <ResponseField name="calls" type="array">Paginated call execution objects.</ResponseField>
-  <ApiCollapsible title="Show 8 properties">
+  <ResponseField name="count" type="integer">Total number of call executions.</ResponseField>
+  <ResponseField name="next" type="string or null">URL for the next page, or `null` if on the last page.</ResponseField>
+  <ResponseField name="previous" type="string or null">URL for the previous page, or `null` if on the first page.</ResponseField>
+  <ResponseField name="total_pages" type="integer">Total number of pages.</ResponseField>
+  <ResponseField name="current_page" type="integer">Current page number.</ResponseField>
+  <ResponseField name="results" type="array">Paginated list of call execution objects.</ResponseField>
+  <ApiCollapsible title="Show call execution properties">
     <ResponseField name="id" type="string">UUID of the call execution.</ResponseField>
     <ResponseField name="status" type="string">Call status: `pending`, `queued`, `ongoing`, `completed`, `failed`, `analyzing`, or `cancelled`.</ResponseField>
     <ResponseField name="duration" type="number">Duration in seconds.</ResponseField>
@@ -105,18 +137,25 @@ description: "Retrieves a test execution with its call executions."
     <ResponseField name="scenario" type="string">Scenario name.</ResponseField>
     <ResponseField name="created_at" type="datetime">ISO 8601 creation timestamp.</ResponseField>
   </ApiCollapsible>
-  <ResponseField name="created_at" type="datetime">ISO 8601 creation timestamp.</ResponseField>
-  <ResponseField name="scenario_ids" type="array">Scenario UUIDs included.</ResponseField>
-  <ResponseField name="simulator_agent_name" type="string or null">Simulator agent name.</ResponseField>
-  <ResponseField name="simulator_agent_id" type="string or null">UUID of the simulator agent.</ResponseField>
-  <ResponseField name="agent_definition_used_name" type="string or null">Agent definition name used.</ResponseField>
-  <ResponseField name="agent_definition_used_id" type="string or null">UUID of the agent definition used.</ResponseField>
-  <ResponseField name="calls_attempted" type="integer">Total calls initiated.</ResponseField>
-  <ResponseField name="calls_connected_percentage" type="number">Percentage of calls that connected.</ResponseField>
+  <ResponseField name="column_order" type="array">Column configuration for the test execution grid.</ResponseField>
+  <ApiCollapsible title="Show column order properties">
+    <ResponseField name="id" type="string">UUID of the column.</ResponseField>
+    <ResponseField name="column_name" type="string">Display name of the column.</ResponseField>
+    <ResponseField name="visible" type="boolean">Whether the column is visible.</ResponseField>
+    <ResponseField name="data_type" type="string">Data type of the column.</ResponseField>
+    <ResponseField name="type" type="string">`scenario_dataset_column`, `evaluation`, or `tool_evaluation`.</ResponseField>
+    <ResponseField name="scenario_id" type="string or null">UUID of the associated scenario, if applicable.</ResponseField>
+    <ResponseField name="dataset_id" type="string">UUID of the associated dataset.</ResponseField>
+    <ResponseField name="eval_config" type="object or null">Eval configuration details, if applicable.</ResponseField>
+  </ApiCollapsible>
+  <ResponseField name="status" type="string">Test execution status: `pending`, `running`, `completed`, `failed`, `cancelled`, `cancelling`, or `evaluating`.</ResponseField>
+  <ResponseField name="error_messages" type="array">List of error message strings, if any.</ResponseField>
+  <ResponseField name="provider" type="string">Agent provider name (e.g. `vapi`, `prompt`).</ResponseField>
+  <ResponseField name="agent_type" type="string">`voice` or `text`.</ResponseField>
 </ApiSection>
 
 <ApiSection title="Errors">
   <ParamField name="401" type="Unauthorized">Invalid or missing credentials.</ParamField>
-  <ParamField name="404" type="Not Found">Test execution not found.</ParamField>
+  <ParamField name="404" type="Not Found">Test execution or organization not found.</ParamField>
   <ParamField name="500" type="Internal Server Error">Unexpected server error.</ParamField>
 </ApiSection>
diff --git a/src/pages/docs/api/test-executions/reruncalls.mdx b/src/pages/docs/api/test-executions/reruncalls.mdx
index 8c0a8ed2..5e79db3c 100644
--- a/src/pages/docs/api/test-executions/reruncalls.mdx
+++ b/src/pages/docs/api/test-executions/reruncalls.mdx
@@ -10,16 +10,21 @@ description: "Reruns call executions within a test execution."
   parameters={[
     {"name": "test_execution_id", "in": "path", "required": true, "description": "UUID of the test execution.", "type": "string"}
   ]}
-  requestBody={{"rerunType": "eval_only", "callExecutionIds": ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"], "selectAll": false}}
+  requestBody={{"rerun_type": "eval_only", "call_execution_ids": ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"], "select_all": false}}
   responseExample={{
     message: "Rerun initiated successfully",
-    testExecutionId: "f7a8b9c0-d1e2-3456-789a-bcdef0123456",
-    rerunType: "eval_only",
-    totalProcessed: 1,
-    successfulReruns: ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"],
-    failedReruns: [],
-    successCount: 1,
-    failureCount: 0
+    test_execution_id: "f7a8b9c0-d1e2-3456-789a-bcdef0123456",
+    rerun_type: "eval_only",
+    total_processed: 2,
+    successful_reruns: ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"],
+    failed_reruns: [
+      {
+        call_execution_id: "b2c3d4e5-f6a7-8901-bcde-f01234567891",
+        error: "Call execution is in an incompatible state for rerun"
+      }
+    ],
+    success_count: 1,
+    failure_count: 1
   }}
   responseStatus={200}
   responseStatusText="OK"
@@ -41,14 +46,14 @@ description: "Reruns call executions within a test execution."
 </ApiSection>
 
 <ApiSection title="Request body">
-  <ParamField body="rerunType" type="string" required>
+  <ParamField body="rerun_type" type="string" required>
     The type of rerun to perform. Use `eval_only` to re-evaluate existing call data without re-executing the actual calls -- this is useful when you have updated your evaluation configurations and want to see updated scores without the cost of re-running calls. Use `call_and_eval` to fully re-execute the calls and then evaluate the new results -- this produces fresh conversations and is useful when you have modified the agent under test. Note that text agents only support `eval_only` reruns; attempting `call_and_eval` on a text agent will return a 400 error.
   </ParamField>
-  <ParamField body="callExecutionIds" type="array of strings">
-    An array of call execution UUIDs to rerun. Required when `selectAll` is `false` or not provided. Each ID must correspond to a valid call execution within the specified test execution. If a provided ID does not exist or does not belong to the test execution, it will appear in the `failedReruns` array of the response.
+  <ParamField body="call_execution_ids" type="array of strings">
+    An array of call execution UUIDs to rerun. Required when `select_all` is `false` or not provided. Each ID must correspond to a valid call execution within the specified test execution. If a provided ID does not exist or does not belong to the test execution, it will appear in the `failed_reruns` array of the response.
   </ParamField>
-  <ParamField body="selectAll" type="boolean">
-    When set to `true`, all call executions within the test execution will be rerun, and the `callExecutionIds` field is ignored. Defaults to `false`. You must provide either `selectAll: true` or a non-empty `callExecutionIds` array -- the request will fail with a 400 error if neither is specified.
+  <ParamField body="select_all" type="boolean">
+    When set to `true`, all call executions within the test execution will be rerun, and the `call_execution_ids` field is ignored. Defaults to `false`. You must provide either `select_all: true` or a non-empty `call_execution_ids` array -- the request will fail with a 400 error if neither is specified.
   </ParamField>
 </ApiSection>
 
@@ -56,32 +61,32 @@ description: "Reruns call executions within a test execution."
   <ResponseField name="message" type="string">
     A human-readable confirmation message indicating that the rerun has been initiated. The actual rerun processing happens asynchronously after this response is returned.
   </ResponseField>
-  <ResponseField name="testExecutionId" type="string">
+  <ResponseField name="test_execution_id" type="string">
     The UUID of the test execution that the rerun was initiated for, echoed back for confirmation and reference.
   </ResponseField>
-  <ResponseField name="rerunType" type="string">
+  <ResponseField name="rerun_type" type="string">
     The type of rerun that was requested, either `eval_only` or `call_and_eval`. Echoed back from the request for confirmation.
   </ResponseField>
-  <ResponseField name="totalProcessed" type="integer">
+  <ResponseField name="total_processed" type="integer">
     The total number of call executions that were processed by the rerun request. This includes both successful and failed reruns.
   </ResponseField>
-  <ResponseField name="successfulReruns" type="array">
+  <ResponseField name="successful_reruns" type="array">
     An array of call execution UUIDs that were successfully queued for rerun. These calls will be re-executed or re-evaluated asynchronously.
   </ResponseField>
-  <ResponseField name="failedReruns" type="array">
-    An array of objects describing call executions that could not be rerun. Each object contains a `callExecutionId` (the UUID of the failed call) and an `error` (a human-readable description of why the rerun failed, such as the call being in an incompatible state).
+  <ResponseField name="failed_reruns" type="array">
+    An array of objects describing call executions that could not be rerun. Each object contains a `call_execution_id` (the UUID of the failed call) and an `error` (a human-readable description of why the rerun failed, such as the call being in an incompatible state).
   </ResponseField>
-  <ResponseField name="successCount" type="integer">
-    The number of call executions that were successfully queued for rerun. Equal to the length of the `successfulReruns` array.
+  <ResponseField name="success_count" type="integer">
+    The number of call executions that were successfully queued for rerun. Equal to the length of the `successful_reruns` array.
   </ResponseField>
-  <ResponseField name="failureCount" type="integer">
-    The number of call executions that failed to be queued for rerun. Equal to the length of the `failedReruns` array.
+  <ResponseField name="failure_count" type="integer">
+    The number of call executions that failed to be queued for rerun. Equal to the length of the `failed_reruns` array.
   </ResponseField>
 </ApiSection>
 
 <ApiSection title="Errors">
   <ParamField name="400" type="Bad Request">
-    The rerun request could not be processed. This error occurs when: the `rerunType` field is missing or contains an invalid value; neither `callExecutionIds` nor `selectAll` was provided; the test execution is still in an active state (`pending`, `running`, or `cancelling`) and cannot accept reruns; or a `call_and_eval` rerun was requested for a text agent, which only supports `eval_only` reruns. Check the error message in the response body for specific details on which validation failed.
+    The rerun request could not be processed. This error occurs when: the `rerun_type` field is missing or contains an invalid value; neither `call_execution_ids` nor `select_all` was provided; the test execution is still in an active state (`pending`, `running`, or `cancelling`) and cannot accept reruns; or a `call_and_eval` rerun was requested for a text agent, which only supports `eval_only` reruns. Check the error message in the response body for specific details on which validation failed.
   </ParamField>
   <ParamField name="401" type="Unauthorized">
     The request could not be authenticated. Verify that both `X-Api-Key` and `X-Secret-Key` headers are present and contain valid, non-expired credentials. Ensure the API key has access to the workspace that owns this test execution.
diff --git a/src/pages/docs/dataset/features/experiments.mdx b/src/pages/docs/dataset/features/experiments.mdx
index 5d81367f..c4387b1c 100644
--- a/src/pages/docs/dataset/features/experiments.mdx
+++ b/src/pages/docs/dataset/features/experiments.mdx
@@ -1,97 +1,133 @@
 ---
 title: "Experiments in Dataset"
-description: "To test, validate, and compare different prompt configurations"
+description: "Test, validate, and compare prompt and agent configurations side by side"
 ---
 
 ## About
 
-Experiments give you a structured way to answer questions like: *Which prompt performs better? Which model gives the best results for my use case?* You test different prompt and model combinations on the same dataset, score the outputs with evals, and compare results side by side so you can make data-driven decisions instead of guessing.
+Experiments give you a structured way to answer questions like: *Which prompt performs better? Which model gives the best results? Does my agent beat my prompt for this task?* You import prompts and agents, run them across multiple model and parameter configurations on the same dataset, score the outputs with evals, and compare results side by side so you can make data-driven decisions instead of guessing.
 
 ## When to use
 
-- **Compare prompts**: Run different prompt templates on the same rows and see which produces better answers or scores.
-- **Compare models**: Run the same prompt with multiple models (or custom models) and compare quality, speed, or cost.
-- **Validate before rollout**: Test prompt and model changes on a dataset before using them in production.
-- **Optimize with evals**: Add built-in or custom evals and use scores to rank prompt/model combinations and pick a winner.
+- **Compare prompts and agents**: Pull prompts from the [Prompt](/docs/prompt) section and agents from the [Agent Playground](/docs/agent-playground) into the same experiment and see which produces better outputs.
+- **Compare models and parameters**: Add the same prompt with multiple models, temperatures, or tool configs to compare quality, latency, and cost across configurations.
+- **Validate before rollout**: Test a prompt or agent change on a dataset before promoting it to production.
+- **Optimize with evals**: Attach built-in or custom evals and use scores to rank prompt/agent-model combinations and pick a winner.
+- **Iterate fast**: Stop a long run, edit a single config, or rerun just the failed cells without restarting the whole experiment.
 
 ## How to
 
-You pick a **base column** (the generated responses you want to compare against), add one or more **prompt templates** (each with one or more models), attach **evals**, and run. The system generates responses for each prompt–model pair, runs the evals, and surfaces scores and comparisons so you can choose the best setup.
+Experiment creation is a guided three-step flow: **Basic Info → Configuration → Evaluations**. Each step validates before you can move forward, and you can jump back to any completed step to edit it.
 
 <Steps>
   <Step title="Navigate to Experiments">
-    Click the "Experiments" button (e.g. in the top-right on the dataset dashboard) to open experiments for this dataset.
+    Open the dataset and click the **Experiments** button in the top-right of the dataset dashboard.
     ![Experiments](/screenshot/product/dataset/how-to/experiments-in-dataset/1.png)
   </Step>
 
-  <Step title="Create a new experiment">
-    Give the experiment a name and select the **base column** – the column whose generated responses you want to compare (e.g. an existing run-prompt column). All experiment runs will be evaluated and compared against this baseline.
+  <Step title="Step 1: Basic Info">
+    Give the experiment a **name** and pick the **experiment type**.
+
+    The name field is pre-filled with an auto-suggested name based on your dataset. Accept it as-is or overwrite it with your own. Names must be unique within the dataset.
+
+    Pick the experiment type that matches the task you're testing:
+
+    <Tabs>
+      <Tab title="LLM" icon="robot">
+        Use **LLM** for text generation. You can import prompts *and* agents in the same experiment.
+      </Tab>
+      <Tab title="Text-to-Speech (TTS)" icon="microphone">
+        Use **TTS** to generate audio from text. Add prompts with different voices, models, and parameters to compare.
+      </Tab>
+      <Tab title="Speech-to-Text (STT)" icon="page">
+        Use **STT** to transcribe audio. Each prompt configuration must point at a dataset column containing the input audio.
+      </Tab>
+      <Tab title="Image Generation" icon="image">
+        Use **Image Generation** to create images from text (or text + image). Compare image models and prompts side by side.
+      </Tab>
+    </Tabs>
+
     ![Create Experiment](/screenshot/product/dataset/how-to/experiments-in-dataset/2.png)
   </Step>
 
-  <Step title="Prompt template">
-    In the prompt template section, define the prompts and models for the experiment. You can add multiple prompt templates; each can use one or more models so you compare many combinations.
-    ![Prompt Template](/screenshot/product/dataset/how-to/experiments-in-dataset/3.png)
+  <Step title="Step 2: Configuration">
+    Set up the prompt and model configurations you want to compare. Each configuration becomes a separate column in the experiment grid.
+    
 
-    Choose the model type and model(s) you want for the experiment. You can select multiple models to compare. You can also create a custom model via "Create Custom Model".
     <Tabs>
       <Tab title="LLM" icon="robot">
-        Select **LLM** for text generation (chat). Choose one or more chat models to compare prompt performance.
-        ![LLM](/screenshot/product/dataset/how-to/experiments-in-dataset/4.png)
-        <Tip>
-          Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model.
-        </Tip>
+        For LLM experiments, click **Add Prompt/Agents** to import a prompt or agent. You can mix prompts and agents in the same experiment and score them against the same evals.
+
+        - **Prompts**: pick a prompt from the [Prompt](/docs/prompt) section, select a published version, then attach **one or more models**. Each (prompt, model) pair becomes its own configuration, so adding three models to one prompt creates three columns to compare. For each model you can tune temperature, max tokens, top-p, response format, and tool config.
+        - **Agents**: pick an agent from the [Agent Playground](/docs/agent-playground) and select a published version. The agent's model, tools, and graph are captured at that version, so the run stays reproducible even if the agent is edited later. You don't pick a model again here.
+        ![LLM](/screenshot/product/dataset/how-to/experiments-in-dataset/3.png)
       </Tab>
       <Tab title="Text-to-Speech (TTS)" icon="microphone">
-        Select **Text-to-Speech** to generate audio from text. Choose TTS models to compare voice output across prompts.
-        ![TTS](/screenshot/product/dataset/how-to/experiments-in-dataset/5.png)
-        <Tip>
-          Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model.
-        </Tip>
+        For each prompt, write the instructions inline (use `{{column_name}}` to reference dataset columns) and attach one or more **TTS models** (with voice and format settings). Click **+ Add Prompt** to add more prompt entries. Each (prompt, model) pair becomes its own column. Output format is fixed to Audio.
+        ![TTS](/screenshot/product/dataset/how-to/experiments-in-dataset/4.png)
       </Tab>
       <Tab title="Speech-to-Text (STT)" icon="page">
-        Select **Speech-to-Text** to transcribe audio into text. Choose STT models to compare transcription quality.
-        ![STT](/screenshot/product/dataset/how-to/experiments-in-dataset/6.png)
-        <Tip>
-          Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model.
-        </Tip>
+        For each prompt, write the instructions inline (use `{{column_name}}` to reference dataset columns), pick the dataset column containing the input audio, and attach one or more **STT models**. Click **+ Add Prompt** to add more entries to compare transcription quality.
+        ![STT](/screenshot/product/dataset/how-to/experiments-in-dataset/5.png)
       </Tab>
       <Tab title="Image Generation" icon="image">
-        Select **Image Generation** to create images from text (or image + text). Choose image models to compare output quality.
-        ![Image Generation](/screenshot/product/dataset/how-to/experiments-in-dataset/7.png)
+        For each prompt, write the instructions inline (use `{{column_name}}` to reference dataset columns) and attach one or more **image models**. Click **+ Add Prompt** to add more entries and compare output quality across models and parameters.
+        ![Image Generation](/screenshot/product/dataset/how-to/experiments-in-dataset/6.png)
+      </Tab>
+      <Tab title="Custom Models" icon="gear">
+        Models you've added through Custom Models show up in the model picker for prompt configurations across all experiment types.
         <Tip>
-          Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model.
+          See [Custom Models](/docs/evaluation/features/custom-models) for how to register a custom or self-hosted model.
         </Tip>
       </Tab>
     </Tabs>
 
-    Use an existing prompt template or create a new one. You can add as many prompt templates as you need.
-    <Tip>
-      Click [here](/docs/prompt-workbench) to learn more about prompts.
-    </Tip>
+    For prompts, you can also configure **tool calling** with **Auto**, **Required**, or **None**, and add tool definitions the model can invoke.
   </Step>
 
-  <Step title="Choosing evals">
-    Experiments compare prompt–model performance using evals. Add the evals you want to run on the generated responses.
+  <Step title="Step 3: Evaluations">
+    The final step has two parts: an optional **base column** and the **evals** you want to score outputs with.
+
+    **Compare against baseline (optional)**: pick a column from the dataset to compare model outputs against (typically a ground-truth or existing run-prompt column). Skip it if you don't have a reference output yet; you can still run the experiment, attach evals that don't need a baseline, and add a base column later by editing the experiment.
+
+    **Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/features/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable.
+    ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/7.png)
+
+    For each eval, map its inputs (e.g. `output`, `input`, `expected`) to the model output or to dataset columns. Mapping is required before the experiment can run.
     ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/8.png)
-    Click "Add Evaluation" and pick from [existing eval](/docs/evaluation/builtin) templates or [create a custom eval](/docs/evaluation/features/custom). You can add as many evals as you want.
-    ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/9.png)
   </Step>
 
-  <Step title="Run experiment">
-    After configuring prompts, models, and evals, click "Run" to start the experiment. The system will generate responses for each prompt–model pair, run the evals, and show results and comparisons when complete.
+  <Step title="Run the experiment">
+    Click **Run** to start. The experiment processes every row across every prompt/agent-model configuration in parallel, running the evals on each output as it arrives. The grid streams results live so you can watch progress without waiting for the whole run to finish.
+  </Step>
+
+  <Step title="Stop a running experiment">
+    If you spot a misconfiguration or want to abort, click **Stop** on a running experiment from the Experiments tab. Any in-flight cells are marked as errored, and you can then edit the experiment and rerun without waiting for the full run to complete.
   </Step>
 
-  <Step title="Update and re-run">
-    You can change the experiment at any time: edit the name, base column, prompt templates, models, or evals, then save. Use **Re-run** to run the experiment again with the same or updated config (e.g. after adding rows to the dataset or changing a prompt). Re-run processes all rows again and refreshes the experiment dataset results.
-    ![Update](/screenshot/product/dataset/how-to/experiments-in-dataset/10.png)
+  <Step title="Edit and rerun">
+    Edit the prompts, models, evals, or base column, then use **Rerun Experiment** to run the experiment again. Editing is granular: only the configurations you actually changed are re-executed, and results from untouched configurations are preserved.
+
+    For more targeted reruns:
+
+    - **Rerun a single cell**: hover any output or eval cell in the grid and click the rerun icon. Useful when one row failed transiently or you've tweaked a single configuration.
+    - **Rerun a column**: from the column header, choose **Run all cells in the column** or **Run only failed cells in the column**. Failed-only is the fastest way to recover from API hiccups without redoing successful work.
+    - **Rerun an eval**: re-execute a single eval across all rows after changing its config or mapping, without re-generating any model outputs.
+
+    ![Update](/screenshot/product/dataset/how-to/experiments-in-dataset/9.png)
   </Step>
 
-  <Step title="Compare results">
-    When the experiment has finished, use the **Compare** (or comparison) view to see how each prompt–model combination performed. Set weights for eval scores and metrics (e.g. response time, token usage) to compute an overall ranking. The comparison shows which combination ranks best so you can choose a winner.
+  <Step title="Compare results and choose a winner">
+    Open the **Compare** view to see how every configuration performed. Set weights (0-10) for each eval score and for response time, completion tokens, and total tokens. The system normalizes the metrics, computes an overall rating per configuration, and ranks them so the winner is clear. Adjust the weights to match what matters for your use case (e.g. prioritize quality over cost) and the ranking updates in place.
   </Step>
 </Steps>
 
+## Tips
+
+- **Use published versions**: experiments only run published prompt and agent versions. Publish the version you want to test before importing it.
+- **Mix prompts and agents**: an **LLM** experiment can contain prompts and agents side by side, scored against the same evals. This is useful when you're deciding whether an agent is worth the extra complexity over a prompt. TTS, STT, and Image Generation experiments accept prompts only.
+- **Failed-only rerun**: when transient failures (rate limits, network blips) leave a few cells errored, use the failed-only rerun on the column to recover them without redoing successful rows.
+
 ## Next Steps
 
 <CardGroup cols={2}>