diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png index 73210f02..58e3c804 100644 Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/2.png differ diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png index 19ae84cb..a4cd484e 100644 Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/3.png differ diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png index a5e7602b..c617c25c 100644 Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/4.png differ diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png index 12ddf231..a239538d 100644 Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/5.png differ diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png index c10714ab..2eec4f4b 100644 Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/6.png differ diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png index 72fe6ad0..29510fed 100644 Binary files 
a/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/7.png differ diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png index 1366a68e..c2b950b2 100644 Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/8.png differ diff --git a/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png b/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png index eebc3ab3..9952a0ff 100644 Binary files a/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png and b/public/screenshot/product/dataset/how-to/experiments-in-dataset/9.png differ diff --git a/src/components/docs/ApiExplorer.astro b/src/components/docs/ApiExplorer.astro index ed80aaa3..9f0e3398 100644 --- a/src/components/docs/ApiExplorer.astro +++ b/src/components/docs/ApiExplorer.astro @@ -9,16 +9,27 @@ * target pages and extracts .apg data to update the drawer in-place (no page reload). */ import { tabNavigation } from '../../lib/navigation.ts'; +import { apiNavigation } from '../../lib/api-navigation.ts'; +// Build a href → method lookup from api-navigation.ts (source of truth for HTTP methods) +const methodByHref: Record = {}; +for (const group of apiNavigation) { + for (const item of group.items) { + if (item.href && item.method) { + methodByHref[item.href] = item.method; + } + } +} // Extract the API tab navigation at build time const apiTab = tabNavigation.find(t => t.tab === 'API'); const apiGroups = apiTab ? 
apiTab.groups : []; // Flatten the API groups into a structure suitable for the sidebar: -// { group: string, items: { title: string, href: string, method?: string }[] }[] +// { group: string, items: { title: string, href: string, method: string }[] }[] interface SidebarEndpoint { title: string; href: string; + method: string; } interface SidebarGroup { group: string; @@ -29,7 +40,7 @@ function flattenItems(items: any[]): SidebarEndpoint[] { const result: SidebarEndpoint[] = []; for (const item of items) { if (item.href && item.href !== '/docs/api') { - result.push({ title: item.title, href: item.href }); + result.push({ title: item.title, href: item.href, method: methodByHref[item.href] || 'GET' }); } if (item.items) { result.push(...flattenItems(item.items)); @@ -476,8 +487,8 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); for (var j = 0; j < groupItems.length; j++) { var ep = groupItems[j]; - // Infer method from title - var method = inferMethodFromTitle(ep.title); + // Use method from api-navigation.ts (baked in at build time), fall back to title inference + var method = ep.method || inferMethodFromTitle(ep.title); var isActive = ep.href === window.location.pathname.replace(/\/$/, ''); html += ''; html += '' + shortMethod(method) + ''; @@ -494,7 +505,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); function inferMethodFromTitle(title) { var t = title.toLowerCase(); - if (t.indexOf('create') === 0 || t.indexOf('add') === 0 || t.indexOf('submit') === 0 || t.indexOf('apply') === 0 || t.indexOf('bulk') === 0 || t.indexOf('upload') === 0 || t.indexOf('run') === 0 || t.indexOf('execute') === 0 || t.indexOf('send') === 0 || t.indexOf('generate') === 0 || t.indexOf('clone') === 0) return 'POST'; + if (t.indexOf('create') === 0 || t.indexOf('add') === 0 || t.indexOf('submit') === 0 || t.indexOf('apply') === 0 || t.indexOf('bulk') === 0 || t.indexOf('upload') === 0 || t.indexOf('run') === 0 || t.indexOf('execute') === 0 || t.indexOf('send') === 0 || 
t.indexOf('generate') === 0 || t.indexOf('clone') === 0 || t.indexOf('rerun') === 0 || t.indexOf('cancel') === 0 || t.indexOf('duplicate') === 0 || t.indexOf('restore') === 0 || t.indexOf('pause') === 0 || t.indexOf('unpause') === 0 || t.indexOf('release') === 0 || t.indexOf('assign') === 0 || t.indexOf('complete') === 0 || t.indexOf('skip') === 0 || t.indexOf('fetch') === 0 || t.indexOf('export') === 0) return 'POST'; if (t.indexOf('update') === 0 || t.indexOf('edit') === 0 || t.indexOf('merge') === 0) return 'PATCH'; if (t.indexOf('delete') === 0 || t.indexOf('remove') === 0) return 'DELETE'; return 'GET'; @@ -680,13 +691,29 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); function buildBodyForm(body, method) { if (!bodySection || !bodyFields) return; - if (!body || typeof body !== 'object' || Array.isArray(body) || !/POST|PUT|PATCH/.test(method)) { + if (!/POST|PUT|PATCH|DELETE/.test(method)) { bodySection.style.display = 'none'; bodyFields.innerHTML = ''; if (bodyIntro) bodyIntro.style.display = 'none'; return; } + // Merge ParamField body metadata keys into body so fields documented + // via but missing from requestBody still appear. + body = body && typeof body === 'object' && !Array.isArray(body) ? 
Object.assign({}, body) : {}; + for (var mk in currentBodyMeta) { + if (currentBodyMeta.hasOwnProperty(mk) && mk.indexOf('.') === -1 && !(mk in body)) { + var mt = currentBodyMeta[mk].type || 'string'; + if (/array|list/i.test(mt)) body[mk] = []; + else if (/object/i.test(mt)) body[mk] = {}; + else if (/bool/i.test(mt)) body[mk] = false; + else if (/int|number|double|float/i.test(mt)) body[mk] = 0; + else body[mk] = ''; + } + } + // Keep currentReqBody in sync so optional picker and code gen use merged body + currentReqBody = body; + var keys = Object.keys(body); if (keys.length === 0) { bodySection.style.display = 'none'; @@ -2011,7 +2038,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); } else { lines.push(' -H "Authorization: Bearer ' + authInfo.token + '"'); } - if (body && /POST|PUT|PATCH/.test(method)) { + if (body && /POST|PUT|PATCH|DELETE/.test(method)) { lines[lines.length - 1] += ' \\'; lines.push(' -H "Content-Type: application/json" \\'); lines.push(" -d '" + JSON.stringify(body, null, 2).replace(/'/g, "'\\''") + "'"); @@ -2026,7 +2053,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); if (parts.length > 1 && method === 'GET') sdkMethod = 'get'; var apiKeyVal = isApiKey ? authInfo.apiKey : authInfo.token; var lines = ['from fi.client import FutureAGI', '', 'client = FutureAGI(api_key="' + apiKeyVal + '")', '']; - if (body && typeof body === 'object' && /POST|PUT|PATCH/.test(method)) { + if (body && typeof body === 'object' && /POST|PUT|PATCH|DELETE/.test(method)) { lines.push('response = client.' + resource + '.' + sdkMethod + '('); var bKeys = Object.keys(body); for (var ki = 0; ki < bKeys.length; ki++) { @@ -2047,16 +2074,16 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); headerStr = ' headers={"Authorization": "Bearer ' + authInfo.token + '"},'; } var lines = ['import requests', '', 'response = requests.' 
+ method.toLowerCase() + '(', ' "' + url + '",', headerStr]; - if (body && /POST|PUT|PATCH/.test(method)) lines.push(' json=' + JSON.stringify(body, null, 4) + ','); + if (body && /POST|PUT|PATCH|DELETE/.test(method)) lines.push(' json=' + JSON.stringify(body, null, 4) + ','); lines.push(')'); lines.push('print(response.json())'); return lines.join('\n'); } if (lang === 'go') { var lines = ['package main', '', 'import (', ' "bytes"', ' "fmt"', ' "io"', ' "net/http"']; - if (body && /POST|PUT|PATCH/.test(method)) lines.push(' "encoding/json"'); + if (body && /POST|PUT|PATCH|DELETE/.test(method)) lines.push(' "encoding/json"'); lines.push(')', ''); lines.push('func main() {'); - if (body && /POST|PUT|PATCH/.test(method)) { + if (body && /POST|PUT|PATCH|DELETE/.test(method)) { lines.push(' body, _ := json.Marshal(' + JSON.stringify(body) + ')'); lines.push(' req, _ := http.NewRequest("' + method + '", "' + url + '", bytes.NewBuffer(body))'); } else { @@ -2089,7 +2116,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); builderLines.push(' .header("Authorization", "Bearer ' + authInfo.token + '")'); } builderLines.push(' .header("Content-Type", "application/json")'); - if (body && /POST|PUT|PATCH/.test(method)) { + if (body && /POST|PUT|PATCH|DELETE/.test(method)) { builderLines.push(' .' + method + '(HttpRequest.BodyPublishers.ofString(' + JSON.stringify(JSON.stringify(body)) + '))'); } else if (method === 'DELETE') { builderLines.push(' .DELETE()'); @@ -2105,7 +2132,7 @@ const sidebarDataJson = JSON.stringify(sidebarGroups); } if (lang === 'php') { var lines = [' - - - - Your Future AGI API key used to authenticate requests. You can find and manage your API keys in the [Dashboard](https://app.futureagi.com) under Settings. - - - Your Future AGI secret key, used alongside the API key for request authentication. This is generated when you create an API key in the [Dashboard](https://app.futureagi.com). - - - - - - The call execution ID. 
Must have at least one rerun snapshot. - - - - - Performance metrics for current and previous sessions. - - Current execution metrics. - Previous rerun snapshot metrics. - - Transcripts for current and previous sessions. - - Current session transcript with `role` and `content`. - Previous rerun snapshot transcript. - - Audio recording URLs. Voice only. - - Current execution recording URL. - Previous rerun snapshot recording URL. - - Evaluation results for both sessions. - - Current execution eval results. - Previous rerun snapshot eval results. - - - - - No rerun snapshot available for comparison. - Invalid or missing credentials. - Call execution not found. - Unexpected server error. - diff --git a/src/pages/docs/api/index.mdx b/src/pages/docs/api/index.mdx index 4eb23010..6bbc4508 100644 --- a/src/pages/docs/api/index.mdx +++ b/src/pages/docs/api/index.mdx @@ -50,9 +50,6 @@ Get your API key from the [Future AGI Dashboard](https://app.futureagi.com/setti Test execution tracking and analytics - - Individual call execution details - Dataset creation, modification, and data management diff --git a/src/pages/docs/api/run-tests/deletetestexecutions.mdx b/src/pages/docs/api/run-tests/deletetestexecutions.mdx index 6855a0c9..bd748e69 100644 --- a/src/pages/docs/api/run-tests/deletetestexecutions.mdx +++ b/src/pages/docs/api/run-tests/deletetestexecutions.mdx @@ -10,8 +10,8 @@ description: "Bulk-deletes test executions from a test run." 
parameters={[ {"name": "run_test_id", "in": "path", "required": true, "description": "UUID of the test run from which to delete test executions.", "type": "string"} ]} - requestBody={{"testExecutionIds": ["execution-uuid-1", "execution-uuid-2"], "selectAll": false}} - responseExample={{"message": "Successfully deleted 2 test execution(s).", "runTestId": "run-test-uuid", "deletedCount": 2, "deletedIds": ["execution-uuid-1", "execution-uuid-2"]}} + requestBody={{"test_execution_ids": ["execution-uuid-1", "execution-uuid-2"], "select_all": false}} + responseExample={{"message": "Successfully deleted 2 test execution(s).", "run_test_id": "run-test-uuid", "deleted_count": 2, "deleted_ids": ["execution-uuid-1", "execution-uuid-2"]}} responseStatus={200} responseStatusText="OK" /> @@ -32,19 +32,19 @@ description: "Bulk-deletes test executions from a test run." - - Array of test execution UUIDs to delete. Required when `selectAll` is `false`. Executions in `RUNNING`, `PENDING`, or `CANCELLING` status cannot be deleted. + + Array of test execution UUIDs to delete. Required when `select_all` is `false`. Executions in `RUNNING`, `PENDING`, or `CANCELLING` status cannot be deleted. - - When `true`, deletes all eligible executions, ignoring `testExecutionIds`. Defaults to `false`. + + When `true`, deletes all eligible executions, ignoring `test_execution_ids`. Defaults to `false`. Confirmation message with deletion count. - UUID of the parent test run. - Number of executions deleted. - UUIDs of the deleted executions. + UUID of the parent test run. + Number of executions deleted. + UUIDs of the deleted executions. diff --git a/src/pages/docs/api/test-executions/cancelexecution.mdx b/src/pages/docs/api/test-executions/cancelexecution.mdx index be2f3888..724759b0 100644 --- a/src/pages/docs/api/test-executions/cancelexecution.mdx +++ b/src/pages/docs/api/test-executions/cancelexecution.mdx @@ -10,7 +10,6 @@ description: "Cancels a test execution." 
parameters={[ {"name": "test_execution_id", "in": "path", "required": true, "description": "UUID of the test execution to cancel.", "type": "string"} ]} - requestBody={{}} responseExample={{ success: true, message: "Test execution cancellation initiated", @@ -35,11 +34,6 @@ description: "Cancels a test execution." - - - No body required. Send `{}`. - - Whether the cancellation was accepted. diff --git a/src/pages/docs/api/call-executions/getcallexecutiondetails.mdx b/src/pages/docs/api/test-executions/getcallexecutiondetails.mdx similarity index 75% rename from src/pages/docs/api/call-executions/getcallexecutiondetails.mdx rename to src/pages/docs/api/test-executions/getcallexecutiondetails.mdx index 33e8eea0..3f1632ca 100644 --- a/src/pages/docs/api/call-executions/getcallexecutiondetails.mdx +++ b/src/pages/docs/api/test-executions/getcallexecutiondetails.mdx @@ -35,26 +35,46 @@ description: "Retrieves a specific call execution." customer_name: "Jane Doe", call_summary: "Customer inquired about billing charges.", ended_reason: "customer_hangup", - simulatorAgentName: "Billing Simulator", - simulatorAgentId: "sim-agent-uuid", + simulator_agent_name: "Billing Simulator", + simulator_agent_id: "sim-agent-uuid", agent_definition_used_name: "Support Agent v2", agent_definition_used_id: "agent-def-uuid", tool_outputs: null, rerun_snapshots: [], + provider: "vapi", + phone_number: "+14155550100", + simulation_call_type: "voice", + processing_skipped: false, + processing_skip_reason: null, + is_snapshot: false, + snapshot_timestamp: null, + rerun_type: null, + original_call_execution_id: null, avg_agent_latency: 0.85, user_interruption_count: 1, user_interruption_rate: 0.05, user_wpm: 130, bot_wpm: 145, talk_ratio: 0.55, + agent_talk_percentage: 55.0, ai_interruption_count: 0, ai_interruption_rate: 0.0, avg_stop_time_after_interruption: 0.3, + total_tokens: null, + input_tokens: null, + output_tokens: null, + avg_latency_ms: null, + turn_count: null, + csat_score: null, 
stt_cost: 0.012, llm_cost: 0.045, tts_cost: 0.008, storage_cost: 0.002, - total_cost: 0.067 + total_cost: 0.067, + customer_cost_cents: null, + customer_cost_breakdown: null, + customer_latency_metrics: null, + customer_call_id: null }} responseStatus={200} responseStatusText="OK" @@ -101,12 +121,21 @@ description: "Retrieves a specific call execution." Simulated customer persona name. AI-generated conversation summary. Reason the call ended, e.g. `customer_hangup`, `agent_hangup`, `timeout`, `error`. - Simulator agent name. - UUID of the simulator agent. + Simulator agent name. + UUID of the simulator agent. Agent definition name. UUID of the agent definition. Tool call outputs from the conversation. Snapshots from previous reruns. + Telephony or chat provider used for this call, e.g. `vapi`, `retell`. + Phone number dialed for this call. Voice only. + Simulation mode: `voice` or `text`. + Whether post-call processing was skipped. + Reason processing was skipped, if applicable. + Whether this record is a rerun snapshot rather than the live call. + Timestamp when the snapshot was taken. + Type of the most recent rerun: `eval_only` or `call_and_eval`. `null` if never rerun. + UUID of the original call execution this is a snapshot of. Average agent response latency in seconds. Voice only. User interruption count. Voice only. Proportion of agent turns interrupted by user (0-1). Voice only. @@ -121,7 +150,12 @@ description: "Retrieves a specific call execution." Output tokens generated. Text only. Average response latency in milliseconds. Text only. Total conversation turns. Text only. + Percentage of conversation time the agent was talking (0-100). Voice only. Customer satisfaction score. Text only. + Cost of the call in cents as reported by the customer's telephony provider. + Detailed cost breakdown from the customer's provider. + Latency metrics as reported by the customer's provider. + Call ID assigned by the customer's telephony provider. 
Speech-to-text cost in USD. Voice only. LLM inference cost in USD. Text-to-speech cost in USD. Voice only. diff --git a/src/pages/docs/api/test-executions/getevalexplanationsummary.mdx b/src/pages/docs/api/test-executions/getevalexplanationsummary.mdx deleted file mode 100644 index 21bc87ce..00000000 --- a/src/pages/docs/api/test-executions/getevalexplanationsummary.mdx +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Get eval explanation summary" -description: "Retrieves the eval explanation summary for a test execution." ---- - - - - - - Your Future AGI API key used to authenticate requests. You can find and manage your API keys in the [Dashboard](https://app.futureagi.com) under Settings. - - - Your Future AGI secret key, used alongside the API key for request authentication. This is generated when you create an API key in the [Dashboard](https://app.futureagi.com). - - - - - - The test execution ID. Triggers async generation if not yet available. - - - - - Summary data and generation metadata. - - Summary with performance overview, issues, and recommendations. - ISO 8601 timestamp of last generation. - Generation status: `pending`, `running`, `completed`, or `failed`. - - Whether the request succeeded. - - - - Invalid or missing credentials. - Test execution not found. - Unexpected server error. - diff --git a/src/pages/docs/api/test-executions/getkpis.mdx b/src/pages/docs/api/test-executions/getkpis.mdx index 4e5cfd91..90b06778 100644 --- a/src/pages/docs/api/test-executions/getkpis.mdx +++ b/src/pages/docs/api/test-executions/getkpis.mdx @@ -14,14 +14,14 @@ description: "Retrieves KPI metrics for a test execution." 
total_calls: 50, avg_score: 8.2, avg_response: 1.15, - callsAttempted: 50, - connectedCalls: 47, - callsConnectedPercentage: 94.0, + calls_attempted: 50, + connected_calls: 47, + calls_connected_percentage: 94.0, failed_calls: 3, total_duration: 6250, agent_type: "voice", is_inbound: false, - scenarioGraphs: {}, + scenario_graphs: {}, avg_agent_latency: 0.92, avg_user_interruption_count: 1.4, avg_user_interruption_rate: 0.08, @@ -31,8 +31,8 @@ description: "Retrieves KPI metrics for a test execution." avg_ai_interruption_count: 0.3, avg_ai_interruption_rate: 0.02, avg_stop_time_after_interruption: 0.35, - agentTalkPercentage: 55.0, - customerTalkPercentage: 45.0, + agent_talk_percentage: 55.0, + customer_talk_percentage: 45.0, avg_tone_check: 8.7, avg_accuracy: 7.9 }} @@ -59,14 +59,14 @@ description: "Retrieves KPI metrics for a test execution." Total call executions. Average evaluation score across completed calls. Average response time in seconds. - Total calls initiated. - Calls that connected. - Percentage of calls that connected. + Total calls initiated. + Calls that connected. + Percentage of calls that connected. Calls that failed. Combined duration in seconds. `voice` or `text`. `true` for inbound, `false` for outbound. `null` for text agents. - Per-scenario performance data. + Per-scenario performance data. Average agent latency in seconds. Voice only. Average user interruptions per call. Voice only. Average user interruption rate (0-1). Voice only. @@ -76,8 +76,8 @@ description: "Retrieves KPI metrics for a test execution." Average agent interruptions per call. Voice only. Average agent interruption rate (0-1). Voice only. Average seconds to stop after interruption. Voice only. - Agent talk time percentage (0-100). Voice only. - Customer talk time percentage (0-100). Voice only. + Agent talk time percentage (0-100). Voice only. + Customer talk time percentage (0-100). Voice only. Average total tokens per call. Text only. Average input tokens per call. 
Text only. Average output tokens per call. Text only. diff --git a/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx b/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx index 0df874e3..f914bbbf 100644 --- a/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx +++ b/src/pages/docs/api/test-executions/gettestexecutiondetails.mdx @@ -1,6 +1,6 @@ --- title: "Get test execution details" -description: "Retrieves a test execution with its call executions." +description: "Retrieves a test execution with paginated call executions and column configuration." --- - Filter by scenario name, transcript content, or status. + Filter call executions by phone number or scenario name. Page number. Defaults to `1`. + + Number of call executions per page. Defaults to `30`. + - JSON-encoded filter array, e.g. `[{"colId":"status","filterType":"text","type":"equals","filter":"completed"}]`. + JSON-encoded array of filter objects. Each object must contain a `column_id` and a `filter_config` object. + + **Structure:** + ```json + [ + { + "column_id": "", + "filter_config": { + "filter_type": "", + "filter_op": "", + "filter_value": "" + } + } + ] + ``` + + **`column_id` values:** `status`, `timestamp`, `call_execution_id`, `overall_score`, `response_time`, `call_type`, `scenario`, or an eval config UUID. + + **`filter_type` values:** `text`, `number`, `datetime`, `boolean`, `list`. + + **`filter_op` values:** `equals`, `not_equals`, `contains`, `not_contains`, `greater_than`, `less_than`, `greater_than_or_equal`, `less_than_or_equal`, `between`, `not_in_between`, `in`. + + **`filter_value`:** A string, number, ISO 8601 datetime string, or array (for `between` / `in` operators). 
+ + **Example — filter by status:** + ```json + [{"column_id":"status","filter_config":{"filter_type":"text","filter_op":"equals","filter_value":"completed"}}] + ``` + + **Example — filter by score range:** + ```json + [{"column_id":"overall_score","filter_config":{"filter_type":"number","filter_op":"between","filter_value":[50,90]}}] + ``` JSON-encoded array of column IDs to group by, e.g. `["scenario"]`. @@ -79,23 +121,13 @@ description: "Retrieves a test execution with its call executions." - UUID of the test execution. - UUID of the parent run test. - Parent run test name. - Agent definition name. - Status: `pending`, `running`, `completed`, `failed`, `cancelled`, `cancelling`, or `evaluating`. - Failure reason. - ISO 8601 execution start time. - ISO 8601 completion time. - Number of distinct scenarios. - Total call executions created. - Calls that completed. - Calls that failed. - Execution metadata. - Elapsed time in seconds. - Percentage of calls completed successfully. - Paginated call execution objects. - + Total number of call executions. + URL for the next page, or `null` if on the last page. + URL for the previous page, or `null` if on the first page. + Total number of pages. + Current page number. + Paginated list of call execution objects. + UUID of the call execution. Call status: `pending`, `queued`, `ongoing`, `completed`, `failed`, `analyzing`, or `cancelled`. Duration in seconds. @@ -105,18 +137,25 @@ description: "Retrieves a test execution with its call executions." Scenario name. ISO 8601 creation timestamp. - ISO 8601 creation timestamp. - Scenario UUIDs included. - Simulator agent name. - UUID of the simulator agent. - Agent definition name used. - UUID of the agent definition used. - Total calls initiated. - Percentage of calls that connected. + Column configuration for the test execution grid. + + UUID of the column. + Display name of the column. + Whether the column is visible. + Data type of the column. 
+ `scenario_dataset_column`, `evaluation`, or `tool_evaluation`. + UUID of the associated scenario, if applicable. + UUID of the associated dataset. + Eval configuration details, if applicable. + + Test execution status: `pending`, `running`, `completed`, `failed`, `cancelled`, `cancelling`, or `evaluating`. + List of error message strings, if any. + Agent provider name (e.g. `vapi`, `prompt`). + `voice` or `text`. Invalid or missing credentials. - Test execution not found. + Test execution not found or organization not found. Unexpected server error. diff --git a/src/pages/docs/api/test-executions/reruncalls.mdx b/src/pages/docs/api/test-executions/reruncalls.mdx index 8c0a8ed2..5e79db3c 100644 --- a/src/pages/docs/api/test-executions/reruncalls.mdx +++ b/src/pages/docs/api/test-executions/reruncalls.mdx @@ -10,16 +10,21 @@ description: "Reruns call executions within a test execution." parameters={[ {"name": "test_execution_id", "in": "path", "required": true, "description": "UUID of the test execution.", "type": "string"} ]} - requestBody={{"rerunType": "eval_only", "callExecutionIds": ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"], "selectAll": false}} + requestBody={{"rerun_type": "eval_only", "call_execution_ids": ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"], "select_all": false}} responseExample={{ message: "Rerun initiated successfully", - testExecutionId: "f7a8b9c0-d1e2-3456-789a-bcdef0123456", - rerunType: "eval_only", - totalProcessed: 1, - successfulReruns: ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"], - failedReruns: [], - successCount: 1, - failureCount: 0 + test_execution_id: "f7a8b9c0-d1e2-3456-789a-bcdef0123456", + rerun_type: "eval_only", + total_processed: 2, + successful_reruns: ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"], + failed_reruns: [ + { + call_execution_id: "b2c3d4e5-f6a7-8901-bcde-f01234567891", + error: "Call execution is in an incompatible state for rerun" + } + ], + success_count: 1, + failure_count: 1 }} responseStatus={200} responseStatusText="OK" 
@@ -41,14 +46,14 @@ description: "Reruns call executions within a test execution." - + The type of rerun to perform. Use `eval_only` to re-evaluate existing call data without re-executing the actual calls -- this is useful when you have updated your evaluation configurations and want to see updated scores without the cost of re-running calls. Use `call_and_eval` to fully re-execute the calls and then evaluate the new results -- this produces fresh conversations and is useful when you have modified the agent under test. Note that text agents only support `eval_only` reruns; attempting `call_and_eval` on a text agent will return a 400 error. - - An array of call execution UUIDs to rerun. Required when `selectAll` is `false` or not provided. Each ID must correspond to a valid call execution within the specified test execution. If a provided ID does not exist or does not belong to the test execution, it will appear in the `failedReruns` array of the response. + + An array of call execution UUIDs to rerun. Required when `select_all` is `false` or not provided. Each ID must correspond to a valid call execution within the specified test execution. If a provided ID does not exist or does not belong to the test execution, it will appear in the `failed_reruns` array of the response. - - When set to `true`, all call executions within the test execution will be rerun, and the `callExecutionIds` field is ignored. Defaults to `false`. You must provide either `selectAll: true` or a non-empty `callExecutionIds` array -- the request will fail with a 400 error if neither is specified. + + When set to `true`, all call executions within the test execution will be rerun, and the `call_execution_ids` field is ignored. Defaults to `false`. You must provide either `select_all: true` or a non-empty `call_execution_ids` array -- the request will fail with a 400 error if neither is specified. @@ -56,32 +61,32 @@ description: "Reruns call executions within a test execution." 
A human-readable confirmation message indicating that the rerun has been initiated. The actual rerun processing happens asynchronously after this response is returned. - + The UUID of the test execution that the rerun was initiated for, echoed back for confirmation and reference. - + The type of rerun that was requested, either `eval_only` or `call_and_eval`. Echoed back from the request for confirmation. - + The total number of call executions that were processed by the rerun request. This includes both successful and failed reruns. - + An array of call execution UUIDs that were successfully queued for rerun. These calls will be re-executed or re-evaluated asynchronously. - - An array of objects describing call executions that could not be rerun. Each object contains a `callExecutionId` (the UUID of the failed call) and an `error` (a human-readable description of why the rerun failed, such as the call being in an incompatible state). + + An array of objects describing call executions that could not be rerun. Each object contains a `call_execution_id` (the UUID of the failed call) and an `error` (a human-readable description of why the rerun failed, such as the call being in an incompatible state). - - The number of call executions that were successfully queued for rerun. Equal to the length of the `successfulReruns` array. + + The number of call executions that were successfully queued for rerun. Equal to the length of the `successful_reruns` array. - - The number of call executions that failed to be queued for rerun. Equal to the length of the `failedReruns` array. + + The number of call executions that failed to be queued for rerun. Equal to the length of the `failed_reruns` array. - The rerun request could not be processed. 
This error occurs when: the `rerunType` field is missing or contains an invalid value; neither `callExecutionIds` nor `selectAll` was provided; the test execution is still in an active state (`pending`, `running`, or `cancelling`) and cannot accept reruns; or a `call_and_eval` rerun was requested for a text agent, which only supports `eval_only` reruns. Check the error message in the response body for specific details on which validation failed. + The rerun request could not be processed. This error occurs when: the `rerun_type` field is missing or contains an invalid value; neither `call_execution_ids` nor `select_all` was provided; the test execution is still in an active state (`pending`, `running`, or `cancelling`) and cannot accept reruns; or a `call_and_eval` rerun was requested for a text agent, which only supports `eval_only` reruns. Check the error message in the response body for specific details on which validation failed. The request could not be authenticated. Verify that both `X-Api-Key` and `X-Secret-Key` headers are present and contain valid, non-expired credentials. Ensure the API key has access to the workspace that owns this test execution. diff --git a/src/pages/docs/dataset/features/experiments.mdx b/src/pages/docs/dataset/features/experiments.mdx index 5d81367f..c4387b1c 100644 --- a/src/pages/docs/dataset/features/experiments.mdx +++ b/src/pages/docs/dataset/features/experiments.mdx @@ -1,97 +1,133 @@ --- title: "Experiments in Dataset" -description: "To test, validate, and compare different prompt configurations" +description: "Test, validate, and compare prompt and agent configurations side by side" --- ## About -Experiments give you a structured way to answer questions like: *Which prompt performs better? 
Which model gives the best results for my use case?* You test different prompt and model combinations on the same dataset, score the outputs with evals, and compare results side by side so you can make data-driven decisions instead of guessing. +Experiments give you a structured way to answer questions like: *Which prompt performs better? Which model gives the best results? Does my agent beat my prompt for this task?* You import prompts and agents, run them across multiple model and parameter configurations on the same dataset, score the outputs with evals, and compare results side by side so you can make data-driven decisions instead of guessing. ## When to use -- **Compare prompts**: Run different prompt templates on the same rows and see which produces better answers or scores. -- **Compare models**: Run the same prompt with multiple models (or custom models) and compare quality, speed, or cost. -- **Validate before rollout**: Test prompt and model changes on a dataset before using them in production. -- **Optimize with evals**: Add built-in or custom evals and use scores to rank prompt/model combinations and pick a winner. +- **Compare prompts and agents**: Pull prompts from the [Prompt](/docs/prompt) section and agents from the [Agent Playground](/docs/agent-playground) into the same experiment and see which produces better outputs. +- **Compare models and parameters**: Add the same prompt with multiple models, temperatures, or tool configs to compare quality, latency, and cost across configurations. +- **Validate before rollout**: Test a prompt or agent change on a dataset before promoting it to production. +- **Optimize with evals**: Attach built-in or custom evals and use scores to rank prompt/agent-model combinations and pick a winner. +- **Iterate fast**: Stop a long run, edit a single config, or rerun just the failed cells without restarting the whole experiment. 
## How to -You pick a **base column** (the generated responses you want to compare against), add one or more **prompt templates** (each with one or more models), attach **evals**, and run. The system generates responses for each prompt–model pair, runs the evals, and surfaces scores and comparisons so you can choose the best setup. +Experiment creation is a guided three-step flow: **Basic Info → Configuration → Evaluations**. Each step validates before you can move forward, and you can jump back to any completed step to edit it. - Click the "Experiments" button (e.g. in the top-right on the dataset dashboard) to open experiments for this dataset. + Open the dataset and click the **Experiments** button in the top-right of the dataset dashboard. ![Experiments](/screenshot/product/dataset/how-to/experiments-in-dataset/1.png) - - Give the experiment a name and select the **base column** – the column whose generated responses you want to compare (e.g. an existing run-prompt column). All experiment runs will be evaluated and compared against this baseline. + + Give the experiment a **name** and pick the **experiment type**. + + The name is pre-filled with an auto-suggested name based on your dataset. Accept it as-is or overwrite it with your own. Names must be unique within the dataset. + + Pick the experiment type that matches the task you're testing: + + + + Use **LLM** for text generation. You can import prompts *and* agents in the same experiment. + + + Use **TTS** to generate audio from text. Add prompts with different voices, models, and parameters to compare. + + + Use **STT** to transcribe audio. Each prompt configuration must point at a dataset column containing the input audio. + + + Use **Image Generation** to create images from text (or text + image). Compare image models and prompts side by side. 
+ + + ![Create Experiment](/screenshot/product/dataset/how-to/experiments-in-dataset/2.png) - - In the prompt template section, define the prompts and models for the experiment. You can add multiple prompt templates; each can use one or more models so you compare many combinations. - ![Prompt Template](/screenshot/product/dataset/how-to/experiments-in-dataset/3.png) + + Set up the prompt and model configurations you want to compare. Each configuration becomes a separate column in the experiment grid. + - Choose the model type and model(s) you want for the experiment. You can select multiple models to compare. You can also create a custom model via "Create Custom Model". - Select **LLM** for text generation (chat). Choose one or more chat models to compare prompt performance. - ![LLM](/screenshot/product/dataset/how-to/experiments-in-dataset/4.png) - - Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model. - + For LLM experiments, click **Add Prompt/Agents** to import a prompt or agent. You can mix prompts and agents in the same experiment and score them against the same evals. + + - **Prompts**: pick a prompt from the [Prompt](/docs/prompt) section, select a published version, then attach **one or more models**. Each (prompt, model) pair becomes its own configuration, so adding three models to one prompt creates three columns to compare. For each model you can tune temperature, max tokens, top-p, response format, and tool config. + - **Agents**: pick an agent from the [Agent Playground](/docs/agent-playground) and select a published version. The agent's model, tools, and graph are captured at that version, so the run stays reproducible even if the agent is edited later. You don't pick a model again here. + ![LLM](/screenshot/product/dataset/how-to/experiments-in-dataset/3.png) - Select **Text-to-Speech** to generate audio from text. Choose TTS models to compare voice output across prompts. 
- ![TTS](/screenshot/product/dataset/how-to/experiments-in-dataset/5.png) - - Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model. - + For each prompt, write the instructions inline (use `{{column_name}}` to reference dataset columns) and attach one or more **TTS models** (with voice and format settings). Click **+ Add Prompt** to add more prompt entries. Each (prompt, model) pair becomes its own column. Output format is fixed to Audio. + ![TTS](/screenshot/product/dataset/how-to/experiments-in-dataset/4.png) - Select **Speech-to-Text** to transcribe audio into text. Choose STT models to compare transcription quality. - ![STT](/screenshot/product/dataset/how-to/experiments-in-dataset/6.png) - - Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model. - + For each prompt, write the instructions inline (use `{{column_name}}` to reference dataset columns), pick the dataset column containing the input audio, and attach one or more **STT models**. Click **+ Add Prompt** to add more entries to compare transcription quality. + ![STT](/screenshot/product/dataset/how-to/experiments-in-dataset/5.png) - Select **Image Generation** to create images from text (or image + text). Choose image models to compare output quality. - ![Image Generation](/screenshot/product/dataset/how-to/experiments-in-dataset/7.png) + For each prompt, write the instructions inline (use `{{column_name}}` to reference dataset columns) and attach one or more **image models**. Click **+ Add Prompt** to add more entries and compare output quality across models and parameters. + ![Image Generation](/screenshot/product/dataset/how-to/experiments-in-dataset/6.png) + + + Models you've added through Custom Models show up in the model picker for prompt configurations across all experiment types. - Click [here](/docs/evaluation/features/custom-models) to learn how to create a custom model. 
+ See [Custom Models](/docs/evaluation/features/custom-models) for how to register a custom or self-hosted model. - Use an existing prompt template or create a new one. You can add as many prompt templates as you need. - - Click [here](/docs/prompt-workbench) to learn more about prompts. - + For prompts, you can also configure **tool calling** with **Auto**, **Required**, or **None**, and add tool definitions the model can invoke. - - Experiments compare prompt–model performance using evals. Add the evals you want to run on the generated responses. + + The final step has two parts: an optional **base column** and the **evals** you want to score outputs with. + + **Compare against baseline (optional)**: pick a column from the dataset to compare model outputs against (typically a ground-truth or existing run-prompt column). Skip it if you don't have a reference output yet; you can still run the experiment, attach evals that don't need a baseline, and add a base column later by editing the experiment. + + **Add evaluations**: click **Add Evaluation** and pick from the [built-in eval](/docs/evaluation/builtin) catalog or [create a custom eval](/docs/evaluation/features/custom). Add as many as you need. Every eval runs on every configuration so the results are directly comparable. + ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/7.png) + + For each eval, map its inputs (e.g. `output`, `input`, `expected`) to the model output or to dataset columns. Mapping is required before the experiment can run. ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/8.png) - Click "Add Evaluation" and pick from [existing eval](/docs/evaluation/builtin) templates or [create a custom eval](/docs/evaluation/features/custom). You can add as many evals as you want. - ![Choosing Evals](/screenshot/product/dataset/how-to/experiments-in-dataset/9.png) - - After configuring prompts, models, and evals, click "Run" to start the experiment. 
The system will generate responses for each prompt–model pair, run the evals, and show results and comparisons when complete. + + Click **Run** to start. The experiment processes every row across every prompt/agent-model configuration in parallel, running the evals on each output as it arrives. The grid streams results live so you can watch progress without waiting for the whole run to finish. + + + + If you spot a misconfiguration or want to abort, click **Stop** on a running experiment from the Experiments tab. Any in-flight cells are marked as errored, and you can then edit the experiment and rerun without waiting for the full run to complete. - - You can change the experiment at any time: edit the name, base column, prompt templates, models, or evals, then save. Use **Re-run** to run the experiment again with the same or updated config (e.g. after adding rows to the dataset or changing a prompt). Re-run processes all rows again and refreshes the experiment dataset results. - ![Update](/screenshot/product/dataset/how-to/experiments-in-dataset/10.png) + + Use **Rerun Experiment** to re-execute the entire experiment after editing prompts, models, evals, or the base column. Editing is granular: only the configurations you actually changed are re-executed, and results from untouched configurations are preserved. + + For more targeted reruns: + + - **Rerun a single cell**: hover any output or eval cell in the grid and click the rerun icon. Useful when one row failed transiently or you've tweaked a single configuration. + - **Rerun a column**: from the column header, choose **Run all cells in the column** or **Run only failed cells in the column**. Failed-only is the fastest way to recover from API hiccups without redoing successful work. + - **Rerun an eval**: re-execute a single eval across all rows after changing its config or mapping, without re-generating any model outputs. 
+ + ![Update](/screenshot/product/dataset/how-to/experiments-in-dataset/9.png) - - When the experiment has finished, use the **Compare** (or comparison) view to see how each prompt–model combination performed. Set weights for eval scores and metrics (e.g. response time, token usage) to compute an overall ranking. The comparison shows which combination ranks best so you can choose a winner. + + Open the **Compare** view to see how every configuration performed. Set weights (0-10) for each eval score and for response time, completion tokens, and total tokens. The system normalizes the metrics, computes an overall rating per configuration, and ranks them so the winner is clear. Adjust the weights to match what matters for your use case (e.g. prioritize quality over cost) and the ranking updates in place. +## Tips + +- **Use published versions**: experiments only run published prompt and agent versions. Publish the version you want to test before importing it. +- **Mix prompts and agents**: an **LLM** experiment can contain prompts and agents side by side, scored against the same evals. Useful when you're deciding whether an agent is worth the extra complexity over a prompt. TTS, STT, and Image experiments accept prompts only. +- **Failed-only rerun**: when transient failures (rate limits, network blips) leave a few cells errored, use the failed-only rerun on the column to recover them without redoing successful rows. + ## Next Steps