diff --git a/.release-please-manifest.json b/.release-please-manifest.json index f471069..c412e97 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0-alpha.28" + ".": "0.1.0-alpha.29" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index 5b7840a..4739e91 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,3 +1,3 @@ -configured_endpoints: 54 -openapi_spec_hash: 43ecb34eaf8efd3fe94b23f2c859fe05 -config_hash: 04312af86542d1127f09d3f3cbe5bb50 +configured_endpoints: 55 +openapi_spec_hash: b54b36ebcaf88c1ddb6d51d24da75420 +config_hash: 48c3812186c899cdef23cc8de76bd2aa diff --git a/CHANGELOG.md b/CHANGELOG.md index 86e2b46..4a3238c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## 0.1.0-alpha.29 (2025-10-06) + +Full Changelog: [v0.1.0-alpha.28...v0.1.0-alpha.29](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.28...v0.1.0-alpha.29) + +### Features + +* **api:** add /detect to accessible routes ([73dd089](https://github.com/cleanlab/codex-python/commit/73dd08944de0337c7fca6d131931b1d89b75be26)) + + +### Chores + +* do not install brew dependencies in ./scripts/bootstrap by default ([a8f1181](https://github.com/cleanlab/codex-python/commit/a8f1181ff0440c76fcef8f7ed4ea56468b77ee44)) + ## 0.1.0-alpha.28 (2025-09-19) Full Changelog: [v0.1.0-alpha.27...v0.1.0-alpha.28](https://github.com/cleanlab/codex-python/compare/v0.1.0-alpha.27...v0.1.0-alpha.28) diff --git a/api.md b/api.md index dc60687..5d40e15 100644 --- a/api.md +++ b/api.md @@ -139,6 +139,7 @@ from codex.types import ( ProjectReturnSchema, ProjectRetrieveResponse, ProjectListResponse, + ProjectDetectResponse, ProjectInviteSmeResponse, ProjectRetrieveAnalyticsResponse, ProjectValidateResponse, @@ -152,6 +153,7 @@ Methods: - client.projects.update(project_id, \*\*params) -> ProjectReturnSchema - client.projects.list(\*\*params) -> ProjectListResponse - client.projects.delete(project_id) -> None +- 
client.projects.detect(project_id, \*\*params) -> ProjectDetectResponse - client.projects.export(project_id) -> object - client.projects.invite_sme(project_id, \*\*params) -> ProjectInviteSmeResponse - client.projects.retrieve_analytics(project_id, \*\*params) -> ProjectRetrieveAnalyticsResponse diff --git a/pyproject.toml b/pyproject.toml index a93fd5c..1a7d218 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "codex-sdk" -version = "0.1.0-alpha.28" +version = "0.1.0-alpha.29" description = "Internal SDK used within cleanlab-codex package. Refer to https://pypi.org/project/cleanlab-codex/ instead." dynamic = ["readme"] license = "MIT" diff --git a/scripts/bootstrap b/scripts/bootstrap index e84fe62..b430fee 100755 --- a/scripts/bootstrap +++ b/scripts/bootstrap @@ -4,10 +4,18 @@ set -e cd "$(dirname "$0")/.." -if ! command -v rye >/dev/null 2>&1 && [ -f "Brewfile" ] && [ "$(uname -s)" = "Darwin" ]; then +if [ -f "Brewfile" ] && [ "$(uname -s)" = "Darwin" ] && [ "$SKIP_BREW" != "1" ] && [ -t 0 ]; then brew bundle check >/dev/null 2>&1 || { - echo "==> Installing Homebrew dependencies…" - brew bundle + echo -n "==> Install Homebrew dependencies? (y/N): " + read -r response + case "$response" in + [yY][eE][sS]|[yY]) + brew bundle + ;; + *) + ;; + esac + echo } fi diff --git a/src/codex/_version.py b/src/codex/_version.py index 82d84e8..77a43df 100644 --- a/src/codex/_version.py +++ b/src/codex/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
__title__ = "codex" -__version__ = "0.1.0-alpha.28" # x-release-please-version +__version__ = "0.1.0-alpha.29" # x-release-please-version diff --git a/src/codex/resources/projects/projects.py b/src/codex/resources/projects/projects.py index 2ed4dd2..4575e8a 100644 --- a/src/codex/resources/projects/projects.py +++ b/src/codex/resources/projects/projects.py @@ -18,6 +18,7 @@ from ...types import ( project_list_params, project_create_params, + project_detect_params, project_update_params, project_validate_params, project_invite_sme_params, @@ -60,6 +61,7 @@ from ..._base_client import make_request_options from ...types.project_list_response import ProjectListResponse from ...types.project_return_schema import ProjectReturnSchema +from ...types.project_detect_response import ProjectDetectResponse from ...types.project_retrieve_response import ProjectRetrieveResponse from ...types.project_validate_response import ProjectValidateResponse from ...types.project_invite_sme_response import ProjectInviteSmeResponse @@ -314,6 +316,171 @@ def delete( cast_to=NoneType, ) + def detect( + self, + project_id: str, + *, + context: str, + query: str, + response: project_detect_params.Response, + constrain_outputs: Optional[SequenceNotStr[str]] | Omit = omit, + eval_config: project_detect_params.EvalConfig | Omit = omit, + messages: Iterable[project_detect_params.Message] | Omit = omit, + options: Optional[project_detect_params.Options] | Omit = omit, + quality_preset: Literal["best", "high", "medium", "low", "base"] | Omit = omit, + rewritten_question: Optional[str] | Omit = omit, + task: Optional[str] | Omit = omit, + tools: Optional[Iterable[project_detect_params.Tool]] | Omit = omit, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = not_given, + ) -> ProjectDetectResponse: + """ + Detect whether a response, given the provided query and context, is potentially + bad. No query is logged in the project for this API route. Optionally, users can + add custom evals for each request, or swap in different settings for the current + project's evals. + + Args: + eval_config: All of the evals that should be used for this query + + messages: Message history to provide conversation context for the query. Messages contain + up to and including the latest user prompt to the LLM. + + options: Typed dict of advanced configuration options for the Trustworthy Language Model. + Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the + quality preset. + + For all options described below, higher settings will lead to longer runtimes + and may consume more tokens internally. You may not be able to run long prompts + (or prompts with long responses) in your account, unless your token/rate limits + are increased. If you hit token limit issues, try lower/less expensive + TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to + increase your limits. + + The default values corresponding to each quality preset are: + + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. 
+ - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. + + By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base + `model`, and `max_tokens` is set to 512. You can set custom values for these + arguments regardless of the quality preset specified. + + Args: model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", + "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", + "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", + "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use + (better models yield better results, faster models yield faster results). - + Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", + "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", + "claude-3.5-haiku". - Recommended models for accuracy: "gpt-5", "gpt-4.1", + "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models + for low latency/costs: "gpt-4.1-nano", "nova-micro". + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. + + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring. 
+ If you experience token/rate-limit errors, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. + + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Reduce this value to reduce runtimes. Higher values may improve trust scoring. + + num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring. + The maximum number currently supported is 3. Lower values can reduce runtimes. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + This parameter has no effect when `disable_trustworthiness` is True. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring. + Must be between 0 and 20. Lower values can reduce runtimes. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. + This parameter has no effect when `disable_trustworthiness` is True. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. 
+ Supported similarity measures include - "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes. + This parameter has no effect when `num_consistency_samples = 0`. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + You can auto-improve responses by increasing this parameter, but at higher runtimes/costs. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + This parameter has no effect when `disable_trustworthiness` is True. + + disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores, + useful if you only want to compute custom evaluation criteria. + + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. + + rewritten_question: The re-written query if it was provided by the client to Codex from a user to be + used instead of the original query. + + tools: Tools to use for the LLM call. If not provided, it is assumed no tools were + provided to the LLM. 
+ + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not project_id: + raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}") + return self._post( + f"/api/projects/{project_id}/detect", + body=maybe_transform( + { + "context": context, + "query": query, + "response": response, + "constrain_outputs": constrain_outputs, + "eval_config": eval_config, + "messages": messages, + "options": options, + "quality_preset": quality_preset, + "rewritten_question": rewritten_question, + "task": task, + "tools": tools, + }, + project_detect_params.ProjectDetectParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=ProjectDetectResponse, + ) + def export( self, project_id: str, @@ -886,6 +1053,171 @@ async def delete( cast_to=NoneType, ) + async def detect( + self, + project_id: str, + *, + context: str, + query: str, + response: project_detect_params.Response, + constrain_outputs: Optional[SequenceNotStr[str]] | Omit = omit, + eval_config: project_detect_params.EvalConfig | Omit = omit, + messages: Iterable[project_detect_params.Message] | Omit = omit, + options: Optional[project_detect_params.Options] | Omit = omit, + quality_preset: Literal["best", "high", "medium", "low", "base"] | Omit = omit, + rewritten_question: Optional[str] | Omit = omit, + task: Optional[str] | Omit = omit, + tools: Optional[Iterable[project_detect_params.Tool]] | Omit = omit, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = not_given, + ) -> ProjectDetectResponse: + """ + Detect whether a response, given the provided query and context, is potentially + bad. No query is logged in the project for this API route. Optionally, users can + add custom evals for each request, or swap in different settings for the current + project's evals. + + Args: + eval_config: All of the evals that should be used for this query + + messages: Message history to provide conversation context for the query. Messages contain + up to and including the latest user prompt to the LLM. + + options: Typed dict of advanced configuration options for the Trustworthy Language Model. + Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the + quality preset. + + For all options described below, higher settings will lead to longer runtimes + and may consume more tokens internally. You may not be able to run long prompts + (or prompts with long responses) in your account, unless your token/rate limits + are increased. If you hit token limit issues, try lower/less expensive + TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to + increase your limits. + + The default values corresponding to each quality preset are: + + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. 
+ - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. + + By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base + `model`, and `max_tokens` is set to 512. You can set custom values for these + arguments regardless of the quality preset specified. + + Args: model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", + "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", + "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", + "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use + (better models yield better results, faster models yield faster results). - + Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", + "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", + "claude-3.5-haiku". - Recommended models for accuracy: "gpt-5", "gpt-4.1", + "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models + for low latency/costs: "gpt-4.1-nano", "nova-micro". + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. + + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring. 
+ If you experience token/rate-limit errors, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. + + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Reduce this value to reduce runtimes. Higher values may improve trust scoring. + + num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring. + The maximum number currently supported is 3. Lower values can reduce runtimes. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + This parameter has no effect when `disable_trustworthiness` is True. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring. + Must be between 0 and 20. Lower values can reduce runtimes. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. + This parameter has no effect when `disable_trustworthiness` is True. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. 
+ Supported similarity measures include - "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes. + This parameter has no effect when `num_consistency_samples = 0`. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + You can auto-improve responses by increasing this parameter, but at higher runtimes/costs. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + This parameter has no effect when `disable_trustworthiness` is True. + + disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores, + useful if you only want to compute custom evaluation criteria. + + quality_preset: The quality preset to use for the TLM or Trustworthy RAG API. + + rewritten_question: The re-written query if it was provided by the client to Codex from a user to be + used instead of the original query. + + tools: Tools to use for the LLM call. If not provided, it is assumed no tools were + provided to the LLM. 
+ + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not project_id: + raise ValueError(f"Expected a non-empty value for `project_id` but received {project_id!r}") + return await self._post( + f"/api/projects/{project_id}/detect", + body=await async_maybe_transform( + { + "context": context, + "query": query, + "response": response, + "constrain_outputs": constrain_outputs, + "eval_config": eval_config, + "messages": messages, + "options": options, + "quality_preset": quality_preset, + "rewritten_question": rewritten_question, + "task": task, + "tools": tools, + }, + project_detect_params.ProjectDetectParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=ProjectDetectResponse, + ) + async def export( self, project_id: str, @@ -1231,6 +1563,9 @@ def __init__(self, projects: ProjectsResource) -> None: self.delete = to_raw_response_wrapper( projects.delete, ) + self.detect = to_raw_response_wrapper( + projects.detect, + ) self.export = to_raw_response_wrapper( projects.export, ) @@ -1280,6 +1615,9 @@ def __init__(self, projects: AsyncProjectsResource) -> None: self.delete = async_to_raw_response_wrapper( projects.delete, ) + self.detect = async_to_raw_response_wrapper( + projects.detect, + ) self.export = async_to_raw_response_wrapper( projects.export, ) @@ -1329,6 +1667,9 @@ def __init__(self, projects: ProjectsResource) -> None: self.delete = to_streamed_response_wrapper( projects.delete, ) + self.detect = to_streamed_response_wrapper( + projects.detect, + ) self.export = to_streamed_response_wrapper( projects.export, ) @@ -1378,6 +1719,9 @@ def __init__(self, projects: AsyncProjectsResource) -> None: self.delete = async_to_streamed_response_wrapper( 
projects.delete, ) + self.detect = async_to_streamed_response_wrapper( + projects.detect, + ) self.export = async_to_streamed_response_wrapper( projects.export, ) diff --git a/src/codex/types/__init__.py b/src/codex/types/__init__.py index 322b513..ca9129a 100644 --- a/src/codex/types/__init__.py +++ b/src/codex/types/__init__.py @@ -5,9 +5,11 @@ from .project_list_params import ProjectListParams as ProjectListParams from .health_check_response import HealthCheckResponse as HealthCheckResponse from .project_create_params import ProjectCreateParams as ProjectCreateParams +from .project_detect_params import ProjectDetectParams as ProjectDetectParams from .project_list_response import ProjectListResponse as ProjectListResponse from .project_return_schema import ProjectReturnSchema as ProjectReturnSchema from .project_update_params import ProjectUpdateParams as ProjectUpdateParams +from .project_detect_response import ProjectDetectResponse as ProjectDetectResponse from .project_validate_params import ProjectValidateParams as ProjectValidateParams from .project_invite_sme_params import ProjectInviteSmeParams as ProjectInviteSmeParams from .project_retrieve_response import ProjectRetrieveResponse as ProjectRetrieveResponse diff --git a/src/codex/types/project_detect_params.py b/src/codex/types/project_detect_params.py new file mode 100644 index 0000000..f29d3e0 --- /dev/null +++ b/src/codex/types/project_detect_params.py @@ -0,0 +1,992 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from __future__ import annotations + +import builtins +from typing import Dict, Union, Iterable, Optional +from typing_extensions import Literal, Required, TypeAlias, TypedDict + +from .._types import SequenceNotStr + +__all__ = [ + "ProjectDetectParams", + "Response", + "ResponseChatCompletion", + "ResponseChatCompletionChoice", + "ResponseChatCompletionChoiceMessage", + "ResponseChatCompletionChoiceMessageAnnotation", + "ResponseChatCompletionChoiceMessageAnnotationURLCitation", + "ResponseChatCompletionChoiceMessageAudio", + "ResponseChatCompletionChoiceMessageFunctionCall", + "ResponseChatCompletionChoiceMessageToolCall", + "ResponseChatCompletionChoiceMessageToolCallFunction", + "ResponseChatCompletionChoiceLogprobs", + "ResponseChatCompletionChoiceLogprobsContent", + "ResponseChatCompletionChoiceLogprobsContentTopLogprob", + "ResponseChatCompletionChoiceLogprobsRefusal", + "ResponseChatCompletionChoiceLogprobsRefusalTopLogprob", + "ResponseChatCompletionUsage", + "ResponseChatCompletionUsageCompletionTokensDetails", + "ResponseChatCompletionUsagePromptTokensDetails", + "EvalConfig", + "EvalConfigCustomEvals", + "EvalConfigCustomEvalsEvals", + "EvalConfigDefaultEvals", + "EvalConfigDefaultEvalsContextSufficiency", + "EvalConfigDefaultEvalsQueryEase", + "EvalConfigDefaultEvalsResponseGroundedness", + "EvalConfigDefaultEvalsResponseHelpfulness", + "EvalConfigDefaultEvalsTrustworthiness", + "Message", + "MessageChatCompletionAssistantMessageParamInput", + "MessageChatCompletionAssistantMessageParamInputAudio", + "MessageChatCompletionAssistantMessageParamInputContentUnionMember1", + "MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam", + "MessageChatCompletionAssistantMessageParamInputFunctionCall", + "MessageChatCompletionAssistantMessageParamInputToolCall", + 
"MessageChatCompletionAssistantMessageParamInputToolCallFunction", + "MessageChatCompletionToolMessageParam", + "MessageChatCompletionToolMessageParamContentUnionMember1", + "MessageChatCompletionUserMessageParamInput", + "MessageChatCompletionUserMessageParamInputContentUnionMember1", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam", + "MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio", + "MessageChatCompletionUserMessageParamInputContentUnionMember1File", + "MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile", + "MessageChatCompletionSystemMessageParam", + "MessageChatCompletionSystemMessageParamContentUnionMember1", + "MessageChatCompletionFunctionMessageParam", + "MessageChatCompletionDeveloperMessageParam", + "MessageChatCompletionDeveloperMessageParamContentUnionMember1", + "Options", + "Tool", + "ToolFunction", +] + + +class ProjectDetectParams(TypedDict, total=False): + context: Required[str] + + query: Required[str] + + response: Required[Response] + + constrain_outputs: Optional[SequenceNotStr[str]] + + eval_config: EvalConfig + """All of the evals that should be used for this query""" + + messages: Iterable[Message] + """Message history to provide conversation context for the query. + + Messages contain up to and including the latest user prompt to the LLM. + """ + + options: Optional[Options] + """ + Typed dict of advanced configuration options for the Trustworthy Language Model. 
+ Many of these configurations are determined by the quality preset selected + (learn about quality presets in the TLM [initialization method](./#class-tlm)). + Specifying TLMOptions values directly overrides any default values set from the + quality preset. + + For all options described below, higher settings will lead to longer runtimes + and may consume more tokens internally. You may not be able to run long prompts + (or prompts with long responses) in your account, unless your token/rate limits + are increased. If you hit token limit issues, try lower/less expensive + TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to + increase your limits. + + The default values corresponding to each quality preset are: + + - **best:** `num_consistency_samples` = 8, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **high:** `num_consistency_samples` = 4, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **medium:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"high"`. + - **low:** `num_consistency_samples` = 0, `num_self_reflections` = 3, + `reasoning_effort` = `"none"`. + - **base:** `num_consistency_samples` = 0, `num_self_reflections` = 1, + `reasoning_effort` = `"none"`. + + By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base + `model`, and `max_tokens` is set to 512. You can set custom values for these + arguments regardless of the quality preset specified. 
+ + Args: model ({"gpt-5", "gpt-5-mini", "gpt-5-nano", "gpt-4.1", "gpt-4.1-mini", + "gpt-4.1-nano", "o4-mini", "o3", "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", + "o3-mini", "o1", "o1-mini", "gpt-4", "gpt-3.5-turbo-16k", "claude-opus-4-0", + "claude-sonnet-4-0", "claude-3.7-sonnet", "claude-3.5-sonnet-v2", + "claude-3.5-sonnet", "claude-3.5-haiku", "claude-3-haiku", "nova-micro", + "nova-lite", "nova-pro"}, default = "gpt-4.1-mini"): Underlying base LLM to use + (better models yield better results, faster models yield faster results). - + Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", + "gpt-4.5-preview", "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet", + "claude-3.5-haiku". - Recommended models for accuracy: "gpt-5", "gpt-4.1", + "o4-mini", "o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models + for low latency/costs: "gpt-4.1-nano", "nova-micro". + + log (list[str], default = []): optionally specify additional logs or metadata that TLM should return. + For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness. + + custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring. + The expected input format is a list of dictionaries, where each dictionary has the following keys: + - name: Name of the evaluation criteria. + - criteria: Instructions specifying the evaluation criteria. + + max_tokens (int, default = 512): the maximum number of tokens that can be generated in the response from `TLM.prompt()` as well as during internal trustworthiness scoring. + If you experience token/rate-limit errors, try lowering this number. + For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512. 
+ + reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens) + when generating alternative possible responses and reflecting on responses during trustworthiness scoring. + Reduce this value to reduce runtimes. Higher values may improve trust scoring. + + num_self_reflections (int, default = 3): the number of different evaluations to perform where the LLM reflects on the response, a factor affecting trust scoring. + The maximum number currently supported is 3. Lower values can reduce runtimes. + Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis. + This parameter has no effect when `disable_trustworthiness` is True. + + num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trust scoring. + Must be between 0 and 20. Lower values can reduce runtimes. + Measuring consistency helps quantify the epistemic uncertainty associated with + strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response. + TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible. + This parameter has no effect when `disable_trustworthiness` is True. + + similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the + trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model. 
+ Supported similarity measures include - "semantic" (based on natural language inference), + "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model), + "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies), + and "string" (based on character/word overlap). Set this to "string" for minimal runtimes. + This parameter has no effect when `num_consistency_samples = 0`. + + num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`. + `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one. + You can auto-improve responses by increasing this parameter, but at higher runtimes/costs. + This parameter must be between 1 and 20. It has no effect on `TLM.score()`. + When this parameter is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it. + This parameter has no effect when `disable_trustworthiness` is True. + + disable_trustworthiness (bool, default = False): if True, TLM will not compute trust scores, + useful if you only want to compute custom evaluation criteria. + """ + + quality_preset: Literal["best", "high", "medium", "low", "base"] + """The quality preset to use for the TLM or Trustworthy RAG API.""" + + rewritten_question: Optional[str] + """ + The re-written query if it was provided by the client to Codex from a user to be + used instead of the original query. + """ + + task: Optional[str] + + tools: Optional[Iterable[Tool]] + """Tools to use for the LLM call. + + If not provided, it is assumed no tools were provided to the LLM. 
+ """ + + +class ResponseChatCompletionChoiceMessageAnnotationURLCitationTyped(TypedDict, total=False): + end_index: Required[int] + + start_index: Required[int] + + title: Required[str] + + url: Required[str] + + +ResponseChatCompletionChoiceMessageAnnotationURLCitation: TypeAlias = Union[ + ResponseChatCompletionChoiceMessageAnnotationURLCitationTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceMessageAnnotationTyped(TypedDict, total=False): + type: Required[Literal["url_citation"]] + + url_citation: Required[ResponseChatCompletionChoiceMessageAnnotationURLCitation] + + +ResponseChatCompletionChoiceMessageAnnotation: TypeAlias = Union[ + ResponseChatCompletionChoiceMessageAnnotationTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceMessageAudioTyped(TypedDict, total=False): + id: Required[str] + + data: Required[str] + + expires_at: Required[int] + + transcript: Required[str] + + +ResponseChatCompletionChoiceMessageAudio: TypeAlias = Union[ + ResponseChatCompletionChoiceMessageAudioTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceMessageFunctionCallTyped(TypedDict, total=False): + arguments: Required[str] + + name: Required[str] + + +ResponseChatCompletionChoiceMessageFunctionCall: TypeAlias = Union[ + ResponseChatCompletionChoiceMessageFunctionCallTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceMessageToolCallFunctionTyped(TypedDict, total=False): + arguments: Required[str] + + name: Required[str] + + +ResponseChatCompletionChoiceMessageToolCallFunction: TypeAlias = Union[ + ResponseChatCompletionChoiceMessageToolCallFunctionTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceMessageToolCallTyped(TypedDict, total=False): + id: Required[str] + + function: Required[ResponseChatCompletionChoiceMessageToolCallFunction] + + type: Required[Literal["function"]] + + +ResponseChatCompletionChoiceMessageToolCall: TypeAlias = Union[ + ResponseChatCompletionChoiceMessageToolCallTyped, 
Dict[str, object] +] + + +class ResponseChatCompletionChoiceMessageTyped(TypedDict, total=False): + role: Required[Literal["assistant"]] + + annotations: Optional[Iterable[ResponseChatCompletionChoiceMessageAnnotation]] + + audio: Optional[ResponseChatCompletionChoiceMessageAudio] + + content: Optional[str] + + function_call: Optional[ResponseChatCompletionChoiceMessageFunctionCall] + + refusal: Optional[str] + + tool_calls: Optional[Iterable[ResponseChatCompletionChoiceMessageToolCall]] + + +ResponseChatCompletionChoiceMessage: TypeAlias = Union[ResponseChatCompletionChoiceMessageTyped, Dict[str, object]] + + +class ResponseChatCompletionChoiceLogprobsContentTopLogprobTyped(TypedDict, total=False): + token: Required[str] + + logprob: Required[float] + + bytes: Optional[Iterable[int]] + + +ResponseChatCompletionChoiceLogprobsContentTopLogprob: TypeAlias = Union[ + ResponseChatCompletionChoiceLogprobsContentTopLogprobTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceLogprobsContentTyped(TypedDict, total=False): + token: Required[str] + + logprob: Required[float] + + top_logprobs: Required[Iterable[ResponseChatCompletionChoiceLogprobsContentTopLogprob]] + + bytes: Optional[Iterable[int]] + + +ResponseChatCompletionChoiceLogprobsContent: TypeAlias = Union[ + ResponseChatCompletionChoiceLogprobsContentTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceLogprobsRefusalTopLogprobTyped(TypedDict, total=False): + token: Required[str] + + logprob: Required[float] + + bytes: Optional[Iterable[int]] + + +ResponseChatCompletionChoiceLogprobsRefusalTopLogprob: TypeAlias = Union[ + ResponseChatCompletionChoiceLogprobsRefusalTopLogprobTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceLogprobsRefusalTyped(TypedDict, total=False): + token: Required[str] + + logprob: Required[float] + + top_logprobs: Required[Iterable[ResponseChatCompletionChoiceLogprobsRefusalTopLogprob]] + + bytes: Optional[Iterable[int]] + + 
+ResponseChatCompletionChoiceLogprobsRefusal: TypeAlias = Union[ + ResponseChatCompletionChoiceLogprobsRefusalTyped, Dict[str, object] +] + + +class ResponseChatCompletionChoiceLogprobsTyped(TypedDict, total=False): + content: Optional[Iterable[ResponseChatCompletionChoiceLogprobsContent]] + + refusal: Optional[Iterable[ResponseChatCompletionChoiceLogprobsRefusal]] + + +ResponseChatCompletionChoiceLogprobs: TypeAlias = Union[ResponseChatCompletionChoiceLogprobsTyped, Dict[str, object]] + + +class ResponseChatCompletionChoiceTyped(TypedDict, total=False): + finish_reason: Required[Literal["stop", "length", "tool_calls", "content_filter", "function_call"]] + + index: Required[int] + + message: Required[ResponseChatCompletionChoiceMessage] + + logprobs: Optional[ResponseChatCompletionChoiceLogprobs] + + +ResponseChatCompletionChoice: TypeAlias = Union[ResponseChatCompletionChoiceTyped, Dict[str, object]] + + +class ResponseChatCompletionUsageCompletionTokensDetailsTyped(TypedDict, total=False): + accepted_prediction_tokens: Optional[int] + + audio_tokens: Optional[int] + + reasoning_tokens: Optional[int] + + rejected_prediction_tokens: Optional[int] + + +ResponseChatCompletionUsageCompletionTokensDetails: TypeAlias = Union[ + ResponseChatCompletionUsageCompletionTokensDetailsTyped, Dict[str, object] +] + + +class ResponseChatCompletionUsagePromptTokensDetailsTyped(TypedDict, total=False): + audio_tokens: Optional[int] + + cached_tokens: Optional[int] + + +ResponseChatCompletionUsagePromptTokensDetails: TypeAlias = Union[ + ResponseChatCompletionUsagePromptTokensDetailsTyped, Dict[str, object] +] + + +class ResponseChatCompletionUsageTyped(TypedDict, total=False): + completion_tokens: Required[int] + + prompt_tokens: Required[int] + + total_tokens: Required[int] + + completion_tokens_details: Optional[ResponseChatCompletionUsageCompletionTokensDetails] + + prompt_tokens_details: Optional[ResponseChatCompletionUsagePromptTokensDetails] + + +ResponseChatCompletionUsage: 
TypeAlias = Union[ResponseChatCompletionUsageTyped, Dict[str, object]] + + +class ResponseChatCompletionTyped(TypedDict, total=False): + id: Required[str] + + choices: Required[Iterable[ResponseChatCompletionChoice]] + + created: Required[int] + + model: Required[str] + + object: Required[Literal["chat.completion"]] + + service_tier: Optional[Literal["scale", "default"]] + + system_fingerprint: Optional[str] + + usage: Optional[ResponseChatCompletionUsage] + + +ResponseChatCompletion: TypeAlias = Union[ResponseChatCompletionTyped, Dict[str, builtins.object]] + +Response: TypeAlias = Union[str, ResponseChatCompletion] + + +class EvalConfigCustomEvalsEvals(TypedDict, total=False): + criteria: Required[str] + """ + The evaluation criteria text that describes what aspect is being evaluated and + how + """ + + eval_key: Required[str] + """ + Unique key for eval metric - currently maps to the TrustworthyRAG name property + and eval_scores dictionary key to check against threshold + """ + + name: Required[str] + """Display name/label for the evaluation metric""" + + context_identifier: Optional[str] + """ + The exact string used in your evaluation criteria to reference the retrieved + context. + """ + + enabled: bool + """Allows the evaluation to be disabled without removing it""" + + guardrailed_fallback_message: Optional[str] + """ + Fallback message to use if this eval fails and causes the response to be + guardrailed + """ + + is_default: bool + """Whether the eval is a default, built-in eval or a custom eval""" + + priority: Optional[int] + """ + Priority order for evals (lower number = higher priority) to determine primary + eval issue to surface + """ + + query_identifier: Optional[str] + """ + The exact string used in your evaluation criteria to reference the user's query. + """ + + response_identifier: Optional[str] + """ + The exact string used in your evaluation criteria to reference the RAG/LLM + response. 
+ """ + + should_escalate: bool + """ + If true, failing this eval means the question should be escalated to Codex for + an SME to review + """ + + should_guardrail: bool + """If true, failing this eval means the response should be guardrailed""" + + threshold: float + """Threshold value that determines if the evaluation fails""" + + threshold_direction: Literal["above", "below"] + """Whether the evaluation fails when score is above or below the threshold""" + + +class EvalConfigCustomEvals(TypedDict, total=False): + evals: Dict[str, EvalConfigCustomEvalsEvals] + + +class EvalConfigDefaultEvalsContextSufficiency(TypedDict, total=False): + eval_key: Required[str] + """ + Unique key for eval metric - currently maps to the TrustworthyRAG name property + and eval_scores dictionary key to check against threshold + """ + + name: Required[str] + """Display name/label for the evaluation metric""" + + enabled: bool + """Allows the evaluation to be disabled without removing it""" + + guardrailed_fallback_message: Optional[str] + """ + Fallback message to use if this eval fails and causes the response to be + guardrailed + """ + + priority: Optional[int] + """ + Priority order for evals (lower number = higher priority) to determine primary + eval issue to surface + """ + + should_escalate: bool + """ + If true, failing this eval means the question should be escalated to Codex for + an SME to review + """ + + should_guardrail: bool + """If true, failing this eval means the response should be guardrailed""" + + threshold: float + """Threshold value that determines if the evaluation fails""" + + threshold_direction: Literal["above", "below"] + """Whether the evaluation fails when score is above or below the threshold""" + + +class EvalConfigDefaultEvalsQueryEase(TypedDict, total=False): + eval_key: Required[str] + """ + Unique key for eval metric - currently maps to the TrustworthyRAG name property + and eval_scores dictionary key to check against threshold + """ + + name: 
Required[str] + """Display name/label for the evaluation metric""" + + enabled: bool + """Allows the evaluation to be disabled without removing it""" + + guardrailed_fallback_message: Optional[str] + """ + Fallback message to use if this eval fails and causes the response to be + guardrailed + """ + + priority: Optional[int] + """ + Priority order for evals (lower number = higher priority) to determine primary + eval issue to surface + """ + + should_escalate: bool + """ + If true, failing this eval means the question should be escalated to Codex for + an SME to review + """ + + should_guardrail: bool + """If true, failing this eval means the response should be guardrailed""" + + threshold: float + """Threshold value that determines if the evaluation fails""" + + threshold_direction: Literal["above", "below"] + """Whether the evaluation fails when score is above or below the threshold""" + + +class EvalConfigDefaultEvalsResponseGroundedness(TypedDict, total=False): + eval_key: Required[str] + """ + Unique key for eval metric - currently maps to the TrustworthyRAG name property + and eval_scores dictionary key to check against threshold + """ + + name: Required[str] + """Display name/label for the evaluation metric""" + + enabled: bool + """Allows the evaluation to be disabled without removing it""" + + guardrailed_fallback_message: Optional[str] + """ + Fallback message to use if this eval fails and causes the response to be + guardrailed + """ + + priority: Optional[int] + """ + Priority order for evals (lower number = higher priority) to determine primary + eval issue to surface + """ + + should_escalate: bool + """ + If true, failing this eval means the question should be escalated to Codex for + an SME to review + """ + + should_guardrail: bool + """If true, failing this eval means the response should be guardrailed""" + + threshold: float + """Threshold value that determines if the evaluation fails""" + + threshold_direction: Literal["above", "below"] + 
"""Whether the evaluation fails when score is above or below the threshold""" + + +class EvalConfigDefaultEvalsResponseHelpfulness(TypedDict, total=False): + eval_key: Required[str] + """ + Unique key for eval metric - currently maps to the TrustworthyRAG name property + and eval_scores dictionary key to check against threshold + """ + + name: Required[str] + """Display name/label for the evaluation metric""" + + enabled: bool + """Allows the evaluation to be disabled without removing it""" + + guardrailed_fallback_message: Optional[str] + """ + Fallback message to use if this eval fails and causes the response to be + guardrailed + """ + + priority: Optional[int] + """ + Priority order for evals (lower number = higher priority) to determine primary + eval issue to surface + """ + + should_escalate: bool + """ + If true, failing this eval means the question should be escalated to Codex for + an SME to review + """ + + should_guardrail: bool + """If true, failing this eval means the response should be guardrailed""" + + threshold: float + """Threshold value that determines if the evaluation fails""" + + threshold_direction: Literal["above", "below"] + """Whether the evaluation fails when score is above or below the threshold""" + + +class EvalConfigDefaultEvalsTrustworthiness(TypedDict, total=False): + eval_key: Required[str] + """ + Unique key for eval metric - currently maps to the TrustworthyRAG name property + and eval_scores dictionary key to check against threshold + """ + + name: Required[str] + """Display name/label for the evaluation metric""" + + enabled: bool + """Allows the evaluation to be disabled without removing it""" + + guardrailed_fallback_message: Optional[str] + """ + Fallback message to use if this eval fails and causes the response to be + guardrailed + """ + + priority: Optional[int] + """ + Priority order for evals (lower number = higher priority) to determine primary + eval issue to surface + """ + + should_escalate: bool + """ + If true, 
failing this eval means the question should be escalated to Codex for + an SME to review + """ + + should_guardrail: bool + """If true, failing this eval means the response should be guardrailed""" + + threshold: float + """Threshold value that determines if the evaluation fails""" + + threshold_direction: Literal["above", "below"] + """Whether the evaluation fails when score is above or below the threshold""" + + +class EvalConfigDefaultEvals(TypedDict, total=False): + context_sufficiency: EvalConfigDefaultEvalsContextSufficiency + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, while + other properties like thresholds and priorities can be configured. + """ + + query_ease: EvalConfigDefaultEvalsQueryEase + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, while + other properties like thresholds and priorities can be configured. + """ + + response_groundedness: EvalConfigDefaultEvalsResponseGroundedness + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, while + other properties like thresholds and priorities can be configured. + """ + + response_helpfulness: EvalConfigDefaultEvalsResponseHelpfulness + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. + + The evaluation criteria and identifiers are immutable and system-managed, while + other properties like thresholds and priorities can be configured. + """ + + trustworthiness: EvalConfigDefaultEvalsTrustworthiness + """A pre-configured evaluation metric from TrustworthyRAG or built into the system. 
+ + The evaluation criteria and identifiers are immutable and system-managed, while + other properties like thresholds and priorities can be configured. + """ + + +class EvalConfig(TypedDict, total=False): + custom_evals: EvalConfigCustomEvals + """Configuration for custom evaluation metrics.""" + + default_evals: EvalConfigDefaultEvals + """Configuration for default evaluation metrics.""" + + +class MessageChatCompletionAssistantMessageParamInputAudio(TypedDict, total=False): + id: Required[str] + + +class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam( + TypedDict, total=False +): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam( + TypedDict, total=False +): + refusal: Required[str] + + type: Required[Literal["refusal"]] + + +MessageChatCompletionAssistantMessageParamInputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionAssistantMessageParamInputContentUnionMember1ChatCompletionContentPartRefusalParam, +] + + +class MessageChatCompletionAssistantMessageParamInputFunctionCall(TypedDict, total=False): + arguments: Required[str] + + name: Required[str] + + +class MessageChatCompletionAssistantMessageParamInputToolCallFunction(TypedDict, total=False): + arguments: Required[str] + + name: Required[str] + + +class MessageChatCompletionAssistantMessageParamInputToolCall(TypedDict, total=False): + id: Required[str] + + function: Required[MessageChatCompletionAssistantMessageParamInputToolCallFunction] + + type: Required[Literal["function"]] + + +class MessageChatCompletionAssistantMessageParamInput(TypedDict, total=False): + role: Required[Literal["assistant"]] + + audio: Optional[MessageChatCompletionAssistantMessageParamInputAudio] + + content: Union[str, 
Iterable[MessageChatCompletionAssistantMessageParamInputContentUnionMember1], None] + + function_call: Optional[MessageChatCompletionAssistantMessageParamInputFunctionCall] + + name: str + + refusal: Optional[str] + + tool_calls: Iterable[MessageChatCompletionAssistantMessageParamInputToolCall] + + +class MessageChatCompletionToolMessageParamContentUnionMember1(TypedDict, total=False): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageChatCompletionToolMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionToolMessageParamContentUnionMember1]]] + + role: Required[Literal["tool"]] + + tool_call_id: Required[str] + + +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam( + TypedDict, total=False +): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL( + TypedDict, total=False +): + url: Required[str] + + detail: Literal["auto", "low", "high"] + + +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam( + TypedDict, total=False +): + image_url: Required[ + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParamImageURL + ] + + type: Required[Literal["image_url"]] + + +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio( + TypedDict, total=False +): + data: Required[str] + + format: Required[Literal["wav", "mp3"]] + + +class MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam( + TypedDict, total=False +): + input_audio: Required[ + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParamInputAudio + ] + + type: Required[Literal["input_audio"]] + + +class 
MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile(TypedDict, total=False): + file_data: str + + file_id: str + + filename: str + + +class MessageChatCompletionUserMessageParamInputContentUnionMember1File(TypedDict, total=False): + file: Required[MessageChatCompletionUserMessageParamInputContentUnionMember1FileFile] + + type: Required[Literal["file"]] + + +MessageChatCompletionUserMessageParamInputContentUnionMember1: TypeAlias = Union[ + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartTextParam, + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartImageParam, + MessageChatCompletionUserMessageParamInputContentUnionMember1ChatCompletionContentPartInputAudioParam, + MessageChatCompletionUserMessageParamInputContentUnionMember1File, +] + + +class MessageChatCompletionUserMessageParamInput(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionUserMessageParamInputContentUnionMember1]]] + + role: Required[Literal["user"]] + + name: str + + +class MessageChatCompletionSystemMessageParamContentUnionMember1(TypedDict, total=False): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageChatCompletionSystemMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionSystemMessageParamContentUnionMember1]]] + + role: Required[Literal["system"]] + + name: str + + +class MessageChatCompletionFunctionMessageParam(TypedDict, total=False): + content: Required[Optional[str]] + + name: Required[str] + + role: Required[Literal["function"]] + + +class MessageChatCompletionDeveloperMessageParamContentUnionMember1(TypedDict, total=False): + text: Required[str] + + type: Required[Literal["text"]] + + +class MessageChatCompletionDeveloperMessageParam(TypedDict, total=False): + content: Required[Union[str, Iterable[MessageChatCompletionDeveloperMessageParamContentUnionMember1]]] + + role: 
Required[Literal["developer"]] + + name: str + + +Message: TypeAlias = Union[ + MessageChatCompletionAssistantMessageParamInput, + MessageChatCompletionToolMessageParam, + MessageChatCompletionUserMessageParamInput, + MessageChatCompletionSystemMessageParam, + MessageChatCompletionFunctionMessageParam, + MessageChatCompletionDeveloperMessageParam, +] + + +class Options(TypedDict, total=False): + custom_eval_criteria: Iterable[object] + + disable_persistence: bool + + disable_trustworthiness: bool + + log: SequenceNotStr[str] + + max_tokens: int + + model: str + + num_candidate_responses: int + + num_consistency_samples: int + + num_self_reflections: int + + reasoning_effort: str + + similarity_measure: str + + use_self_reflection: bool + + +class ToolFunction(TypedDict, total=False): + name: Required[str] + + description: str + + parameters: object + + strict: Optional[bool] + + +class Tool(TypedDict, total=False): + function: Required[ToolFunction] + + type: Required[Literal["function"]] diff --git a/src/codex/types/project_detect_response.py b/src/codex/types/project_detect_response.py new file mode 100644 index 0000000..27044c1 --- /dev/null +++ b/src/codex/types/project_detect_response.py @@ -0,0 +1,66 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from typing import Dict, List, Optional + +from .._models import BaseModel + +__all__ = ["ProjectDetectResponse", "DeterministicGuardrailsResults", "EvalScores"] + + +class DeterministicGuardrailsResults(BaseModel): + guardrail_name: str + + should_guardrail: bool + + fallback_message: Optional[str] = None + + matches: Optional[List[str]] = None + + +class EvalScores(BaseModel): + guardrailed_fallback_message: Optional[str] = None + + score: Optional[float] = None + + triggered: bool + + triggered_escalation: bool + + triggered_guardrail: bool + + failed: Optional[bool] = None + + log: Optional[object] = None + + +class ProjectDetectResponse(BaseModel): + deterministic_guardrails_results: Optional[Dict[str, DeterministicGuardrailsResults]] = None + """Results from deterministic guardrails applied to the response.""" + + escalated_to_sme: bool + """ + True if the question should be escalated to Codex for an SME to review, False + otherwise. When True, a lookup is performed, which logs this query in the + project for SMEs to answer, if it does not already exist. + """ + + eval_scores: Dict[str, EvalScores] + """ + Evaluation scores for the original response along with a boolean flag, `failed`, + indicating whether the score is below the threshold. + """ + + expert_answer: Optional[str] = None + """ + Alternate SME-provided answer from Codex if a relevant answer was found in the + Codex Project, or None otherwise. + """ + + expert_review_guardrail_explanation: Optional[str] = None + """Explanation from a similar bad query log that caused this to be guardrailed""" + + should_guardrail: bool + """ + True if the response should be guardrailed by the AI system, False if the + response is okay to return to the user. 
+ """ diff --git a/tests/api_resources/test_projects.py b/tests/api_resources/test_projects.py index 84ad6cd..cfbfe41 100644 --- a/tests/api_resources/test_projects.py +++ b/tests/api_resources/test_projects.py @@ -11,6 +11,7 @@ from codex.types import ( ProjectListResponse, ProjectReturnSchema, + ProjectDetectResponse, ProjectRetrieveResponse, ProjectValidateResponse, ProjectInviteSmeResponse, @@ -436,6 +437,202 @@ def test_path_params_delete(self, client: Codex) -> None: "", ) + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + def test_method_detect(self, client: Codex) -> None: + project = client.projects.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + ) + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + def test_method_detect_with_all_params(self, client: Codex) -> None: + project = client.projects.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + constrain_outputs=["string"], + eval_config={ + "custom_evals": { + "evals": { + "foo": { + "criteria": "criteria", + "eval_key": "eval_key", + "name": "name", + "context_identifier": "context_identifier", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "is_default": True, + "priority": 0, + "query_identifier": "query_identifier", + "response_identifier": "response_identifier", + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + } + } + }, + "default_evals": { + "context_sufficiency": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "query_ease": { + "eval_key": 
"eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "response_groundedness": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "response_helpfulness": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "trustworthiness": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + }, + }, + messages=[ + { + "role": "assistant", + "audio": {"id": "id"}, + "content": "string", + "function_call": { + "arguments": "arguments", + "name": "name", + }, + "name": "name", + "refusal": "refusal", + "tool_calls": [ + { + "id": "id", + "function": { + "arguments": "arguments", + "name": "name", + }, + "type": "function", + } + ], + } + ], + options={ + "custom_eval_criteria": [{}], + "disable_persistence": True, + "disable_trustworthiness": True, + "log": ["string"], + "max_tokens": 0, + "model": "model", + "num_candidate_responses": 0, + "num_consistency_samples": 0, + "num_self_reflections": 0, + "reasoning_effort": "reasoning_effort", + "similarity_measure": "similarity_measure", + "use_self_reflection": True, + }, + quality_preset="best", + rewritten_question="rewritten_question", + task="task", + tools=[ + { + "function": { + "name": "name", + "description": 
"description", + "parameters": {}, + "strict": True, + }, + "type": "function", + } + ], + ) + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + def test_raw_response_detect(self, client: Codex) -> None: + response = client.projects.with_raw_response.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + project = response.parse() + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + def test_streaming_response_detect(self, client: Codex) -> None: + with client.projects.with_streaming_response.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + project = response.parse() + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + def test_path_params_detect(self, client: Codex) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): + client.projects.with_raw_response.detect( + project_id="", + context="context", + query="x", + response="string", + ) + @pytest.mark.skip(reason="Prism tests are disabled") @parametrize def test_method_export(self, client: Codex) -> None: @@ -1127,6 +1324,202 @@ async def test_path_params_delete(self, async_client: AsyncCodex) -> None: "", ) + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + async def test_method_detect(self, async_client: AsyncCodex) -> None: + project = 
await async_client.projects.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + ) + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + async def test_method_detect_with_all_params(self, async_client: AsyncCodex) -> None: + project = await async_client.projects.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + constrain_outputs=["string"], + eval_config={ + "custom_evals": { + "evals": { + "foo": { + "criteria": "criteria", + "eval_key": "eval_key", + "name": "name", + "context_identifier": "context_identifier", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "is_default": True, + "priority": 0, + "query_identifier": "query_identifier", + "response_identifier": "response_identifier", + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + } + } + }, + "default_evals": { + "context_sufficiency": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "query_ease": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "response_groundedness": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "response_helpfulness": { + "eval_key": 
"eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + "trustworthiness": { + "eval_key": "eval_key", + "name": "name", + "enabled": True, + "guardrailed_fallback_message": "guardrailed_fallback_message", + "priority": 0, + "should_escalate": True, + "should_guardrail": True, + "threshold": 0, + "threshold_direction": "above", + }, + }, + }, + messages=[ + { + "role": "assistant", + "audio": {"id": "id"}, + "content": "string", + "function_call": { + "arguments": "arguments", + "name": "name", + }, + "name": "name", + "refusal": "refusal", + "tool_calls": [ + { + "id": "id", + "function": { + "arguments": "arguments", + "name": "name", + }, + "type": "function", + } + ], + } + ], + options={ + "custom_eval_criteria": [{}], + "disable_persistence": True, + "disable_trustworthiness": True, + "log": ["string"], + "max_tokens": 0, + "model": "model", + "num_candidate_responses": 0, + "num_consistency_samples": 0, + "num_self_reflections": 0, + "reasoning_effort": "reasoning_effort", + "similarity_measure": "similarity_measure", + "use_self_reflection": True, + }, + quality_preset="best", + rewritten_question="rewritten_question", + task="task", + tools=[ + { + "function": { + "name": "name", + "description": "description", + "parameters": {}, + "strict": True, + }, + "type": "function", + } + ], + ) + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + async def test_raw_response_detect(self, async_client: AsyncCodex) -> None: + response = await async_client.projects.with_raw_response.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + ) + + assert response.is_closed is True + assert 
response.http_request.headers.get("X-Stainless-Lang") == "python" + project = await response.parse() + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + async def test_streaming_response_detect(self, async_client: AsyncCodex) -> None: + async with async_client.projects.with_streaming_response.detect( + project_id="182bd5e5-6e1a-4fe4-a799-aa6d9a6ab26e", + context="context", + query="x", + response="string", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + project = await response.parse() + assert_matches_type(ProjectDetectResponse, project, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @pytest.mark.skip(reason="Prism tests are disabled") + @parametrize + async def test_path_params_detect(self, async_client: AsyncCodex) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `project_id` but received ''"): + await async_client.projects.with_raw_response.detect( + project_id="", + context="context", + query="x", + response="string", + ) + @pytest.mark.skip(reason="Prism tests are disabled") @parametrize async def test_method_export(self, async_client: AsyncCodex) -> None: