From e0c210a0ffde24bd2c5877689f8ab222288cc597 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:58:18 +0000 Subject: [PATCH 1/2] feat(api): manual updates add streaming to chat completions --- .stats.yml | 2 +- api.md | 4 +- .../resources/agents/chat/completions.py | 18 +- src/gradientai/resources/chat/completions.py | 537 +++++++++++++++++- src/gradientai/types/agents/chat/__init__.py | 2 +- .../chat/agent_chat_completion_chunk.py | 93 +++ src/gradientai/types/chat/__init__.py | 1 + .../chat/chat_completion_chunk.py | 4 +- .../types/chat/completion_create_params.py | 31 +- tests/api_resources/chat/test_completions.py | 186 +++++- 10 files changed, 840 insertions(+), 38 deletions(-) create mode 100644 src/gradientai/types/agents/chat/agent_chat_completion_chunk.py rename src/gradientai/types/{agents => }/chat/chat_completion_chunk.py (96%) diff --git a/.stats.yml b/.stats.yml index 8eb4144d..fcc630a7 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 76 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/digitalocean%2Fgradientai-e8b3cbc80e18e4f7f277010349f25e1319156704f359911dc464cc21a0d077a6.yml openapi_spec_hash: c773d792724f5647ae25a5ae4ccec208 -config_hash: e1b3d85ba9ae21d729a914c789422ba7 +config_hash: 0bc3af28d4abd9be8bcc81f615bc832d diff --git a/api.md b/api.md index b1ac8b43..52551f1f 100644 --- a/api.md +++ b/api.md @@ -65,7 +65,7 @@ Methods: Types: ```python -from gradientai.types.agents.chat import ChatCompletionChunk, CompletionCreateResponse +from gradientai.types.agents.chat import AgentChatCompletionChunk, CompletionCreateResponse ``` Methods: @@ -396,7 +396,7 @@ Methods: Types: ```python -from gradientai.types.chat import CompletionCreateResponse +from gradientai.types.chat import ChatCompletionChunk, CompletionCreateResponse ``` Methods: diff --git a/src/gradientai/resources/agents/chat/completions.py b/src/gradientai/resources/agents/chat/completions.py index 92431cdf..7acba243 100644 --- a/src/gradientai/resources/agents/chat/completions.py +++ b/src/gradientai/resources/agents/chat/completions.py @@ -20,8 +20,8 @@ from ...._streaming import Stream, AsyncStream from ...._base_client import make_request_options from ....types.agents.chat import completion_create_params -from ....types.agents.chat.chat_completion_chunk import ChatCompletionChunk from ....types.agents.chat.completion_create_response import CompletionCreateResponse +from ....types.agents.chat.agent_chat_completion_chunk import AgentChatCompletionChunk __all__ = ["CompletionsResource", "AsyncCompletionsResource"] @@ -186,7 +186,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> Stream[ChatCompletionChunk]: + ) -> Stream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. @@ -299,7 +299,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | Stream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. 
@@ -412,7 +412,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | Stream[AgentChatCompletionChunk]: return self._post( "/chat/completions" if self._client._base_url_overridden @@ -446,7 +446,7 @@ def create( ), cast_to=CompletionCreateResponse, stream=stream or False, - stream_cls=Stream[ChatCompletionChunk], + stream_cls=Stream[AgentChatCompletionChunk], ) @@ -610,7 +610,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> AsyncStream[ChatCompletionChunk]: + ) -> AsyncStream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. @@ -723,7 +723,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | AsyncStream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. @@ -836,7 +836,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | AsyncStream[AgentChatCompletionChunk]: return await self._post( "/chat/completions" if self._client._base_url_overridden @@ -870,7 +870,7 @@ async def create( ), cast_to=CompletionCreateResponse, stream=stream or False, - stream_cls=AsyncStream[ChatCompletionChunk], + stream_cls=AsyncStream[AgentChatCompletionChunk], ) diff --git a/src/gradientai/resources/chat/completions.py b/src/gradientai/resources/chat/completions.py index a3a47429..7993f757 100644 --- a/src/gradientai/resources/chat/completions.py +++ b/src/gradientai/resources/chat/completions.py @@ -3,11 +3,12 @@ from __future__ import annotations from typing import Dict, List, Union, Iterable, Optional +from typing_extensions import Literal, overload import httpx from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from ..._utils import maybe_transform, async_maybe_transform +from ..._utils import required_args, maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( @@ -16,8 +17,10 @@ async_to_raw_response_wrapper, async_to_streamed_response_wrapper, ) +from ..._streaming import Stream, AsyncStream from ...types.chat import completion_create_params from ..._base_client import make_request_options +from ...types.chat.chat_completion_chunk import ChatCompletionChunk from ...types.chat.completion_create_response import CompletionCreateResponse __all__ = ["CompletionsResource", "AsyncCompletionsResource"] @@ -43,6 +46,7 @@ def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse: """ return CompletionsResourceWithStreamingResponse(self) + @overload def create( self, *, @@ -57,7 +61,7 @@ def create( n: Optional[int] | NotGiven = NOT_GIVEN, presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, - stream: Optional[bool] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, stream_options: 
Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, temperature: Optional[float] | NotGiven = NOT_GIVEN, top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, @@ -154,6 +158,263 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ + ... + + @overload + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Stream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. 
This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: bool, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. 
+ + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["messages", "model"], ["messages", "model", "stream"]) + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + # This method requires an inference_key to be set via client argument or environment variable if not self._client.inference_key: raise TypeError( @@ -186,12 +447,16 @@ def create( "top_p": top_p, "user": user, }, - completion_create_params.CompletionCreateParams, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, ), options=make_request_options( extra_headers=headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), cast_to=CompletionCreateResponse, + stream=stream or False, + stream_cls=Stream[ChatCompletionChunk], ) @@ -215,6 +480,7 @@ def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingRespon """ return AsyncCompletionsResourceWithStreamingResponse(self) + @overload async def create( self, *, @@ -229,7 +495,7 @@ async def create( n: Optional[int] | NotGiven = NOT_GIVEN, presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, - stream: Optional[bool] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, temperature: Optional[float] | NotGiven = NOT_GIVEN, top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, @@ -326,6 +592,263 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ + ... + + @overload + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
+ + @overload + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: bool, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. 
Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["messages", "model"], ["messages", "model", "stream"]) + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + # This method requires an inference_key to be set via client argument or environment variable if not hasattr(self._client, "inference_key") or not self._client.inference_key: raise TypeError( @@ -358,12 +881,16 @@ async def create( "top_p": top_p, "user": user, }, - completion_create_params.CompletionCreateParams, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, ), options=make_request_options( extra_headers=headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), cast_to=CompletionCreateResponse, + stream=stream or False, + stream_cls=AsyncStream[ChatCompletionChunk], ) diff --git a/src/gradientai/types/agents/chat/__init__.py b/src/gradientai/types/agents/chat/__init__.py index f0243162..305ba0af 100644 --- a/src/gradientai/types/agents/chat/__init__.py +++ b/src/gradientai/types/agents/chat/__init__.py @@ -2,6 +2,6 @@ from __future__ import annotations -from .chat_completion_chunk import ChatCompletionChunk as ChatCompletionChunk from .completion_create_params import CompletionCreateParams as CompletionCreateParams from .completion_create_response import CompletionCreateResponse as CompletionCreateResponse +from .agent_chat_completion_chunk import AgentChatCompletionChunk as AgentChatCompletionChunk diff --git a/src/gradientai/types/agents/chat/agent_chat_completion_chunk.py b/src/gradientai/types/agents/chat/agent_chat_completion_chunk.py new file mode 100644 index 00000000..36ee3d9e --- /dev/null +++ b/src/gradientai/types/agents/chat/agent_chat_completion_chunk.py @@ -0,0 +1,93 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ...._models import BaseModel +from ...shared.chat_completion_token_logprob import ChatCompletionTokenLogprob + +__all__ = ["AgentChatCompletionChunk", "Choice", "ChoiceDelta", "ChoiceLogprobs", "Usage"] + + +class ChoiceDelta(BaseModel): + content: Optional[str] = None + """The contents of the chunk message.""" + + refusal: Optional[str] = None + """The refusal message generated by the model.""" + + role: Optional[Literal["developer", "user", "assistant"]] = None + """The role of the author of this message.""" + + +class ChoiceLogprobs(BaseModel): + content: Optional[List[ChatCompletionTokenLogprob]] = None + """A list of message content tokens with log probability information.""" + + refusal: Optional[List[ChatCompletionTokenLogprob]] = None + """A list of message refusal tokens with log probability information.""" + + +class Choice(BaseModel): + delta: ChoiceDelta + """A chat completion delta generated by streamed model responses.""" + + finish_reason: Optional[Literal["stop", "length"]] = None + """The reason the model stopped generating tokens. 
+ + This will be `stop` if the model hit a natural stop point or a provided stop + sequence, or `length` if the maximum number of tokens specified in the request + was reached + """ + + index: int + """The index of the choice in the list of choices.""" + + logprobs: Optional[ChoiceLogprobs] = None + """Log probability information for the choice.""" + + +class Usage(BaseModel): + completion_tokens: int + """Number of tokens in the generated completion.""" + + prompt_tokens: int + """Number of tokens in the prompt.""" + + total_tokens: int + """Total number of tokens used in the request (prompt + completion).""" + + +class AgentChatCompletionChunk(BaseModel): + id: str + """A unique identifier for the chat completion. Each chunk has the same ID.""" + + choices: List[Choice] + """A list of chat completion choices. + + Can contain more than one elements if `n` is greater than 1. Can also be empty + for the last chunk if you set `stream_options: {"include_usage": true}`. + """ + + created: int + """The Unix timestamp (in seconds) of when the chat completion was created. + + Each chunk has the same timestamp. + """ + + model: str + """The model to generate the completion.""" + + object: Literal["chat.completion.chunk"] + """The object type, which is always `chat.completion.chunk`.""" + + usage: Optional[Usage] = None + """ + An optional field that will only be present when you set + `stream_options: {"include_usage": true}` in your request. When present, it + contains a null value **except for the last chunk** which contains the token + usage statistics for the entire request. + + **NOTE:** If the stream is interrupted or cancelled, you may not receive the + final usage chunk which contains the total token usage for the request. + """ diff --git a/src/gradientai/types/chat/__init__.py b/src/gradientai/types/chat/__init__.py index 9384ac14..f0243162 100644 --- a/src/gradientai/types/chat/__init__.py +++ b/src/gradientai/types/chat/__init__.py @@ -2,5 +2,6 @@ from __future__ import annotations +from .chat_completion_chunk import ChatCompletionChunk as ChatCompletionChunk from .completion_create_params import CompletionCreateParams as CompletionCreateParams from .completion_create_response import CompletionCreateResponse as CompletionCreateResponse diff --git a/src/gradientai/types/agents/chat/chat_completion_chunk.py b/src/gradientai/types/chat/chat_completion_chunk.py similarity index 96% rename from src/gradientai/types/agents/chat/chat_completion_chunk.py rename to src/gradientai/types/chat/chat_completion_chunk.py index b81aef72..4adcc63d 100644 --- a/src/gradientai/types/agents/chat/chat_completion_chunk.py +++ b/src/gradientai/types/chat/chat_completion_chunk.py @@ -3,8 +3,8 @@ from typing import List, Optional from typing_extensions import Literal -from ...._models import BaseModel -from ...shared.chat_completion_token_logprob import ChatCompletionTokenLogprob +from ..._models import BaseModel +from ..shared.chat_completion_token_logprob import ChatCompletionTokenLogprob __all__ = ["ChatCompletionChunk", "Choice", "ChoiceDelta", "ChoiceLogprobs", "Usage"] diff --git a/src/gradientai/types/chat/completion_create_params.py b/src/gradientai/types/chat/completion_create_params.py index 11d032ff..ec5c6b70 100644 --- a/src/gradientai/types/chat/completion_create_params.py +++ b/src/gradientai/types/chat/completion_create_params.py @@ -6,17 +6,19 @@ from typing_extensions import Literal, Required, TypeAlias, TypedDict __all__ = [ - "CompletionCreateParams", + "CompletionCreateParamsBase", 
"Message", "MessageChatCompletionRequestSystemMessage", "MessageChatCompletionRequestDeveloperMessage", "MessageChatCompletionRequestUserMessage", "MessageChatCompletionRequestAssistantMessage", "StreamOptions", + "CompletionCreateParamsNonStreaming", + "CompletionCreateParamsStreaming", ] -class CompletionCreateParams(TypedDict, total=False): +class CompletionCreateParamsBase(TypedDict, total=False): messages: Required[Iterable[Message]] """A list of messages comprising the conversation so far.""" @@ -92,12 +94,6 @@ class CompletionCreateParams(TypedDict, total=False): The returned text will not contain the stop sequence. """ - stream: Optional[bool] - """ - If set to true, the model response data will be streamed to the client as it is - generated using server-sent events. - """ - stream_options: Optional[StreamOptions] """Options for streaming response. Only set this when you set `stream: true`.""" @@ -183,3 +179,22 @@ class StreamOptions(TypedDict, total=False): **NOTE:** If the stream is interrupted, you may not receive the final usage chunk which contains the total token usage for the request. """ + + +class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase, total=False): + stream: Optional[Literal[False]] + """ + If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + """ + + +class CompletionCreateParamsStreaming(CompletionCreateParamsBase): + stream: Required[Literal[True]] + """ + If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + """ + + +CompletionCreateParams = Union[CompletionCreateParamsNonStreaming, CompletionCreateParamsStreaming] diff --git a/tests/api_resources/chat/test_completions.py b/tests/api_resources/chat/test_completions.py index b4c09579..25b8419a 100644 --- a/tests/api_resources/chat/test_completions.py +++ b/tests/api_resources/chat/test_completions.py @@ -19,7 +19,7 @@ class TestCompletions: @pytest.mark.skip() @parametrize - def test_method_create(self, client: GradientAI) -> None: + def test_method_create_overload_1(self, client: GradientAI) -> None: completion = client.chat.completions.create( messages=[ { @@ -33,7 +33,7 @@ def test_method_create(self, client: GradientAI) -> None: @pytest.mark.skip() @parametrize - def test_method_create_with_all_params(self, client: GradientAI) -> None: + def test_method_create_with_all_params_overload_1(self, client: GradientAI) -> None: completion = client.chat.completions.create( messages=[ { @@ -51,7 +51,7 @@ def test_method_create_with_all_params(self, client: GradientAI) -> None: n=1, presence_penalty=-2, stop="\n", - stream=True, + stream=False, stream_options={"include_usage": True}, temperature=1, top_logprobs=0, @@ -62,7 +62,7 @@ def test_method_create_with_all_params(self, client: GradientAI) -> None: @pytest.mark.skip() @parametrize - def test_raw_response_create(self, client: GradientAI) -> None: + def test_raw_response_create_overload_1(self, client: GradientAI) -> None: response = client.chat.completions.with_raw_response.create( messages=[ { @@ -80,7 +80,7 @@ def test_raw_response_create(self, client: GradientAI) -> None: @pytest.mark.skip() @parametrize - def test_streaming_response_create(self, client: GradientAI) -> None: + def test_streaming_response_create_overload_1(self, client: GradientAI) -> None: with client.chat.completions.with_streaming_response.create( messages=[ { @@ -98,6 +98,89 @@ def test_streaming_response_create(self, client: GradientAI) -> 
None: assert cast(Any, response.is_closed) is True + @pytest.mark.skip() + @parametrize + def test_method_create_overload_2(self, client: GradientAI) -> None: + completion_stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + completion_stream.response.close() + + @pytest.mark.skip() + @parametrize + def test_method_create_with_all_params_overload_2(self, client: GradientAI) -> None: + completion_stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + frequency_penalty=-2, + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=256, + max_tokens=0, + metadata={"foo": "string"}, + n=1, + presence_penalty=-2, + stop="\n", + stream_options={"include_usage": True}, + temperature=1, + top_logprobs=0, + top_p=1, + user="user-1234", + ) + completion_stream.response.close() + + @pytest.mark.skip() + @parametrize + def test_raw_response_create_overload_2(self, client: GradientAI) -> None: + response = client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = response.parse() + stream.close() + + @pytest.mark.skip() + @parametrize + def test_streaming_response_create_overload_2(self, client: GradientAI) -> None: + with client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = response.parse() + stream.close() + + assert cast(Any, response.is_closed) is True + class TestAsyncCompletions: parametrize = pytest.mark.parametrize( @@ -106,7 +189,7 @@ class TestAsyncCompletions: @pytest.mark.skip() @parametrize - async def test_method_create(self, async_client: AsyncGradientAI) -> None: + async def test_method_create_overload_1(self, async_client: AsyncGradientAI) -> None: completion = await async_client.chat.completions.create( messages=[ { @@ -120,7 +203,7 @@ async def test_method_create(self, async_client: AsyncGradientAI) -> None: @pytest.mark.skip() @parametrize - async def test_method_create_with_all_params(self, async_client: AsyncGradientAI) -> None: + async def test_method_create_with_all_params_overload_1(self, async_client: AsyncGradientAI) -> None: completion = await async_client.chat.completions.create( messages=[ { @@ -138,7 +221,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncGradientAI n=1, presence_penalty=-2, stop="\n", - stream=True, + stream=False, stream_options={"include_usage": True}, temperature=1, top_logprobs=0, @@ -149,7 +232,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncGradientAI @pytest.mark.skip() @parametrize - async def test_raw_response_create(self, async_client: AsyncGradientAI) -> None: + async def test_raw_response_create_overload_1(self, async_client: AsyncGradientAI) -> None: response = await async_client.chat.completions.with_raw_response.create( messages=[ { @@ -167,7 +250,7 @@ async def test_raw_response_create(self, async_client: AsyncGradientAI) -> None: @pytest.mark.skip() @parametrize - async def test_streaming_response_create(self, async_client: 
AsyncGradientAI) -> None: + async def test_streaming_response_create_overload_1(self, async_client: AsyncGradientAI) -> None: async with async_client.chat.completions.with_streaming_response.create( messages=[ { @@ -184,3 +267,86 @@ async def test_streaming_response_create(self, async_client: AsyncGradientAI) -> assert_matches_type(CompletionCreateResponse, completion, path=["response"]) assert cast(Any, response.is_closed) is True + + @pytest.mark.skip() + @parametrize + async def test_method_create_overload_2(self, async_client: AsyncGradientAI) -> None: + completion_stream = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + await completion_stream.response.aclose() + + @pytest.mark.skip() + @parametrize + async def test_method_create_with_all_params_overload_2(self, async_client: AsyncGradientAI) -> None: + completion_stream = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + frequency_penalty=-2, + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=256, + max_tokens=0, + metadata={"foo": "string"}, + n=1, + presence_penalty=-2, + stop="\n", + stream_options={"include_usage": True}, + temperature=1, + top_logprobs=0, + top_p=1, + user="user-1234", + ) + await completion_stream.response.aclose() + + @pytest.mark.skip() + @parametrize + async def test_raw_response_create_overload_2(self, async_client: AsyncGradientAI) -> None: + response = await async_client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = await response.parse() + await stream.close() + + @pytest.mark.skip() + @parametrize + async def test_streaming_response_create_overload_2(self, async_client: AsyncGradientAI) -> None: + async with async_client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = await response.parse() + await stream.close() + + assert cast(Any, response.is_closed) is True From 593e13b1077d38a81a80a2e473031f197143374c Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:58:35 +0000 Subject: [PATCH 2/2] release: 0.1.0-alpha.9 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ pyproject.toml | 2 +- src/gradientai/_version.py | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index c373724d..46b9b6b2 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0-alpha.8" + ".": "0.1.0-alpha.9" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index b26d4058..4d5ad5ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.1.0-alpha.9 (2025-06-28) + +Full Changelog: [v0.1.0-alpha.8...v0.1.0-alpha.9](https://github.com/digitalocean/gradientai-python/compare/v0.1.0-alpha.8...v0.1.0-alpha.9) + +### Features + +* **api:** manual updates 
([e0c210a](https://github.com/digitalocean/gradientai-python/commit/e0c210a0ffde24bd2c5877689f8ab222288cc597)) + ## 0.1.0-alpha.8 (2025-06-27) Full Changelog: [v0.1.0-alpha.7...v0.1.0-alpha.8](https://github.com/digitalocean/gradientai-python/compare/v0.1.0-alpha.7...v0.1.0-alpha.8) diff --git a/pyproject.toml b/pyproject.toml index 60a58f89..87c4aeeb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "c63a5cfe-b235-4fbe-8bbb-82a9e02a482a-python" -version = "0.1.0-alpha.8" +version = "0.1.0-alpha.9" description = "The official Python library for GradientAI" dynamic = ["readme"] license = "Apache-2.0" diff --git a/src/gradientai/_version.py b/src/gradientai/_version.py index 8c8f2b63..5cd8ca49 100644 --- a/src/gradientai/_version.py +++ b/src/gradientai/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "gradientai" -__version__ = "0.1.0-alpha.8" # x-release-please-version +__version__ = "0.1.0-alpha.9" # x-release-please-version
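
Reviewer note: below is a minimal usage sketch of the streaming surface this patch adds. It is illustrative only, not part of the patch — the model ID is a placeholder, the client is assumed to pick up its inference key from its defaults or environment (the resource raises a TypeError when `inference_key` is missing), and the returned `Stream`/`AsyncStream` objects are assumed to be iterable per the generated client's conventions.

# Synchronous streaming: passing stream=True selects the Literal[True] overload
# and returns a Stream[ChatCompletionChunk] instead of a CompletionCreateResponse.
from gradientai import GradientAI

client = GradientAI()  # assumes the inference key is configured; otherwise pass inference_key=... (hypothetical kwarg)

stream = client.chat.completions.create(
    messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
    model="llama3-8b-instruct",  # placeholder model ID
    stream=True,
)
for chunk in stream:
    # Each ChatCompletionChunk carries a delta; content may be None on some chunks.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)

# Asynchronous streaming mirrors the sync call and yields the same chunk type
# via AsyncStream[ChatCompletionChunk].
import asyncio
from gradientai import AsyncGradientAI

async def main() -> None:
    async_client = AsyncGradientAI()
    async_stream = await async_client.chat.completions.create(
        messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
        model="llama3-8b-instruct",  # placeholder model ID
        stream=True,
    )
    async for chunk in async_stream:
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)

asyncio.run(main())

The agent-scoped endpoint (`client.agents.chat.completions.create(..., stream=True)`) follows the same pattern but, per this patch, yields `AgentChatCompletionChunk` items instead.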