From e0c210a0ffde24bd2c5877689f8ab222288cc597 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:58:18 +0000 Subject: [PATCH 1/2] feat(api): manual updates add streaming to chat completions --- .stats.yml | 2 +- api.md | 4 +- .../resources/agents/chat/completions.py | 18 +- src/gradientai/resources/chat/completions.py | 537 +++++++++++++++++- src/gradientai/types/agents/chat/__init__.py | 2 +- .../chat/agent_chat_completion_chunk.py | 93 +++ src/gradientai/types/chat/__init__.py | 1 + .../chat/chat_completion_chunk.py | 4 +- .../types/chat/completion_create_params.py | 31 +- tests/api_resources/chat/test_completions.py | 186 +++++- 10 files changed, 840 insertions(+), 38 deletions(-) create mode 100644 src/gradientai/types/agents/chat/agent_chat_completion_chunk.py rename src/gradientai/types/{agents => }/chat/chat_completion_chunk.py (96%) diff --git a/.stats.yml b/.stats.yml index 8eb4144d..fcc630a7 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 76 openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/digitalocean%2Fgradientai-e8b3cbc80e18e4f7f277010349f25e1319156704f359911dc464cc21a0d077a6.yml openapi_spec_hash: c773d792724f5647ae25a5ae4ccec208 -config_hash: e1b3d85ba9ae21d729a914c789422ba7 +config_hash: 0bc3af28d4abd9be8bcc81f615bc832d diff --git a/api.md b/api.md index b1ac8b43..52551f1f 100644 --- a/api.md +++ b/api.md @@ -65,7 +65,7 @@ Methods: Types: ```python -from gradientai.types.agents.chat import ChatCompletionChunk, CompletionCreateResponse +from gradientai.types.agents.chat import AgentChatCompletionChunk, CompletionCreateResponse ``` Methods: @@ -396,7 +396,7 @@ Methods: Types: ```python -from gradientai.types.chat import CompletionCreateResponse +from gradientai.types.chat import ChatCompletionChunk, CompletionCreateResponse ``` Methods: diff --git a/src/gradientai/resources/agents/chat/completions.py b/src/gradientai/resources/agents/chat/completions.py index 92431cdf..7acba243 100644 --- a/src/gradientai/resources/agents/chat/completions.py +++ b/src/gradientai/resources/agents/chat/completions.py @@ -20,8 +20,8 @@ from ...._streaming import Stream, AsyncStream from ...._base_client import make_request_options from ....types.agents.chat import completion_create_params -from ....types.agents.chat.chat_completion_chunk import ChatCompletionChunk from ....types.agents.chat.completion_create_response import CompletionCreateResponse +from ....types.agents.chat.agent_chat_completion_chunk import AgentChatCompletionChunk __all__ = ["CompletionsResource", "AsyncCompletionsResource"] @@ -186,7 +186,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> Stream[ChatCompletionChunk]: + ) -> Stream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. @@ -299,7 +299,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | Stream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. 
@@ -412,7 +412,7 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | Stream[AgentChatCompletionChunk]: return self._post( "/chat/completions" if self._client._base_url_overridden @@ -446,7 +446,7 @@ def create( ), cast_to=CompletionCreateResponse, stream=stream or False, - stream_cls=Stream[ChatCompletionChunk], + stream_cls=Stream[AgentChatCompletionChunk], ) @@ -610,7 +610,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> AsyncStream[ChatCompletionChunk]: + ) -> AsyncStream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. @@ -723,7 +723,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | AsyncStream[AgentChatCompletionChunk]: """ Creates a model response for the given chat conversation. @@ -836,7 +836,7 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + ) -> CompletionCreateResponse | AsyncStream[AgentChatCompletionChunk]: return await self._post( "/chat/completions" if self._client._base_url_overridden @@ -870,7 +870,7 @@ async def create( ), cast_to=CompletionCreateResponse, stream=stream or False, - stream_cls=AsyncStream[ChatCompletionChunk], + stream_cls=AsyncStream[AgentChatCompletionChunk], ) diff --git a/src/gradientai/resources/chat/completions.py b/src/gradientai/resources/chat/completions.py index a3a47429..7993f757 100644 --- a/src/gradientai/resources/chat/completions.py +++ b/src/gradientai/resources/chat/completions.py @@ -3,11 +3,12 @@ from __future__ import annotations from typing import Dict, List, Union, Iterable, Optional +from typing_extensions import Literal, overload import httpx from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven -from ..._utils import maybe_transform, async_maybe_transform +from ..._utils import required_args, maybe_transform, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource from ..._response import ( @@ -16,8 +17,10 @@ async_to_raw_response_wrapper, async_to_streamed_response_wrapper, ) +from ..._streaming import Stream, AsyncStream from ...types.chat import completion_create_params from ..._base_client import make_request_options +from ...types.chat.chat_completion_chunk import ChatCompletionChunk from ...types.chat.completion_create_response import CompletionCreateResponse __all__ = ["CompletionsResource", "AsyncCompletionsResource"] @@ -43,6 +46,7 @@ def with_streaming_response(self) -> CompletionsResourceWithStreamingResponse: """ return CompletionsResourceWithStreamingResponse(self) + @overload def create( self, *, @@ -57,7 +61,7 @@ def create( n: Optional[int] | NotGiven = NOT_GIVEN, presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, - stream: Optional[bool] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, stream_options: 
Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, temperature: Optional[float] | NotGiven = NOT_GIVEN, top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, @@ -154,6 +158,263 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ + ... + + @overload + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Stream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. 
This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @overload + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: bool, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. 
+ + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["messages", "model"], ["messages", "model", "stream"]) + def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: + # This method requires an inference_key to be set via client argument or environment variable if not self._client.inference_key: raise TypeError( @@ -186,12 +447,16 @@ def create( "top_p": top_p, "user": user, }, - completion_create_params.CompletionCreateParams, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, ), options=make_request_options( extra_headers=headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), cast_to=CompletionCreateResponse, + stream=stream or False, + stream_cls=Stream[ChatCompletionChunk], ) @@ -215,6 +480,7 @@ def with_streaming_response(self) -> AsyncCompletionsResourceWithStreamingRespon """ return AsyncCompletionsResourceWithStreamingResponse(self) + @overload async def create( self, *, @@ -229,7 +495,7 @@ async def create( n: Optional[int] | NotGiven = NOT_GIVEN, presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, - stream: Optional[bool] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | NotGiven = NOT_GIVEN, stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, temperature: Optional[float] | NotGiven = NOT_GIVEN, top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, @@ -326,6 +592,263 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ + ... + + @overload + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: Literal[True], + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> AsyncStream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... 
+ + @overload + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + stream: bool, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + """ + Creates a model response for the given chat conversation. + + Args: + messages: A list of messages comprising the conversation so far. + + model: Model ID used to generate the response. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + + frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to + repeat the same line verbatim. + + logit_bias: Modify the likelihood of specified tokens appearing in the completion. + + Accepts a JSON object that maps tokens (specified by their token ID in the + tokenizer) to an associated bias value from -100 to 100. Mathematically, the + bias is added to the logits generated by the model prior to sampling. The exact + effect will vary per model, but values between -1 and 1 should decrease or + increase likelihood of selection; values like -100 or 100 should result in a ban + or exclusive selection of the relevant token. + + logprobs: Whether to return log probabilities of the output tokens or not. If true, + returns the log probabilities of each output token returned in the `content` of + `message`. + + max_completion_tokens: The maximum number of completion tokens that may be used over the course of the + run. The run will make a best effort to use only the number of completion tokens + specified, across multiple turns of the run. + + max_tokens: The maximum number of tokens that can be generated in the completion. + + The token count of your prompt plus `max_tokens` cannot exceed the model's + context length. + + metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful + for storing additional information about the object in a structured format, and + querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + + n: How many chat completion choices to generate for each input message. 
Note that + you will be charged based on the number of generated tokens across all of the + choices. Keep `n` as `1` to minimize costs. + + presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on + whether they appear in the text so far, increasing the model's likelihood to + talk about new topics. + + stop: Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + + stream_options: Options for streaming response. Only set this when you set `stream: true`. + + temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more + focused and deterministic. We generally recommend altering this or `top_p` but + not both. + + top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to + return at each token position, each with an associated log probability. + `logprobs` must be set to `true` if this parameter is used. + + top_p: An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 + means only the tokens comprising the top 10% probability mass are considered. + + We generally recommend altering this or `temperature` but not both. + + user: A unique identifier representing your end-user, which can help DigitalOcean to + monitor and detect abuse. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + ... + + @required_args(["messages", "model"], ["messages", "model", "stream"]) + async def create( + self, + *, + messages: Iterable[completion_create_params.Message], + model: str, + frequency_penalty: Optional[float] | NotGiven = NOT_GIVEN, + logit_bias: Optional[Dict[str, int]] | NotGiven = NOT_GIVEN, + logprobs: Optional[bool] | NotGiven = NOT_GIVEN, + max_completion_tokens: Optional[int] | NotGiven = NOT_GIVEN, + max_tokens: Optional[int] | NotGiven = NOT_GIVEN, + metadata: Optional[Dict[str, str]] | NotGiven = NOT_GIVEN, + n: Optional[int] | NotGiven = NOT_GIVEN, + presence_penalty: Optional[float] | NotGiven = NOT_GIVEN, + stop: Union[Optional[str], List[str], None] | NotGiven = NOT_GIVEN, + stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN, + stream_options: Optional[completion_create_params.StreamOptions] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, + top_logprobs: Optional[int] | NotGiven = NOT_GIVEN, + top_p: Optional[float] | NotGiven = NOT_GIVEN, + user: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: + # This method requires an inference_key to be set via client argument or environment variable if not hasattr(self._client, "inference_key") or not self._client.inference_key: raise TypeError( @@ -358,12 +881,16 @@ async def create( "top_p": top_p, "user": user, }, - completion_create_params.CompletionCreateParams, + completion_create_params.CompletionCreateParamsStreaming + if stream + else completion_create_params.CompletionCreateParamsNonStreaming, ), options=make_request_options( extra_headers=headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), cast_to=CompletionCreateResponse, + stream=stream or False, + stream_cls=AsyncStream[ChatCompletionChunk], ) diff --git a/src/gradientai/types/agents/chat/__init__.py b/src/gradientai/types/agents/chat/__init__.py index f0243162..305ba0af 100644 --- a/src/gradientai/types/agents/chat/__init__.py +++ b/src/gradientai/types/agents/chat/__init__.py @@ -2,6 +2,6 @@ from __future__ import annotations -from .chat_completion_chunk import ChatCompletionChunk as ChatCompletionChunk from .completion_create_params import CompletionCreateParams as CompletionCreateParams from .completion_create_response import CompletionCreateResponse as CompletionCreateResponse +from .agent_chat_completion_chunk import AgentChatCompletionChunk as AgentChatCompletionChunk diff --git a/src/gradientai/types/agents/chat/agent_chat_completion_chunk.py b/src/gradientai/types/agents/chat/agent_chat_completion_chunk.py new file mode 100644 index 00000000..36ee3d9e --- /dev/null +++ b/src/gradientai/types/agents/chat/agent_chat_completion_chunk.py @@ -0,0 +1,93 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ...._models import BaseModel +from ...shared.chat_completion_token_logprob import ChatCompletionTokenLogprob + +__all__ = ["AgentChatCompletionChunk", "Choice", "ChoiceDelta", "ChoiceLogprobs", "Usage"] + + +class ChoiceDelta(BaseModel): + content: Optional[str] = None + """The contents of the chunk message.""" + + refusal: Optional[str] = None + """The refusal message generated by the model.""" + + role: Optional[Literal["developer", "user", "assistant"]] = None + """The role of the author of this message.""" + + +class ChoiceLogprobs(BaseModel): + content: Optional[List[ChatCompletionTokenLogprob]] = None + """A list of message content tokens with log probability information.""" + + refusal: Optional[List[ChatCompletionTokenLogprob]] = None + """A list of message refusal tokens with log probability information.""" + + +class Choice(BaseModel): + delta: ChoiceDelta + """A chat completion delta generated by streamed model responses.""" + + finish_reason: Optional[Literal["stop", "length"]] = None + """The reason the model stopped generating tokens. 
+ + This will be `stop` if the model hit a natural stop point or a provided stop + sequence, or `length` if the maximum number of tokens specified in the request + was reached + """ + + index: int + """The index of the choice in the list of choices.""" + + logprobs: Optional[ChoiceLogprobs] = None + """Log probability information for the choice.""" + + +class Usage(BaseModel): + completion_tokens: int + """Number of tokens in the generated completion.""" + + prompt_tokens: int + """Number of tokens in the prompt.""" + + total_tokens: int + """Total number of tokens used in the request (prompt + completion).""" + + +class AgentChatCompletionChunk(BaseModel): + id: str + """A unique identifier for the chat completion. Each chunk has the same ID.""" + + choices: List[Choice] + """A list of chat completion choices. + + Can contain more than one elements if `n` is greater than 1. Can also be empty + for the last chunk if you set `stream_options: {"include_usage": true}`. + """ + + created: int + """The Unix timestamp (in seconds) of when the chat completion was created. + + Each chunk has the same timestamp. + """ + + model: str + """The model to generate the completion.""" + + object: Literal["chat.completion.chunk"] + """The object type, which is always `chat.completion.chunk`.""" + + usage: Optional[Usage] = None + """ + An optional field that will only be present when you set + `stream_options: {"include_usage": true}` in your request. When present, it + contains a null value **except for the last chunk** which contains the token + usage statistics for the entire request. + + **NOTE:** If the stream is interrupted or cancelled, you may not receive the + final usage chunk which contains the total token usage for the request. + """ diff --git a/src/gradientai/types/chat/__init__.py b/src/gradientai/types/chat/__init__.py index 9384ac14..f0243162 100644 --- a/src/gradientai/types/chat/__init__.py +++ b/src/gradientai/types/chat/__init__.py @@ -2,5 +2,6 @@ from __future__ import annotations +from .chat_completion_chunk import ChatCompletionChunk as ChatCompletionChunk from .completion_create_params import CompletionCreateParams as CompletionCreateParams from .completion_create_response import CompletionCreateResponse as CompletionCreateResponse diff --git a/src/gradientai/types/agents/chat/chat_completion_chunk.py b/src/gradientai/types/chat/chat_completion_chunk.py similarity index 96% rename from src/gradientai/types/agents/chat/chat_completion_chunk.py rename to src/gradientai/types/chat/chat_completion_chunk.py index b81aef72..4adcc63d 100644 --- a/src/gradientai/types/agents/chat/chat_completion_chunk.py +++ b/src/gradientai/types/chat/chat_completion_chunk.py @@ -3,8 +3,8 @@ from typing import List, Optional from typing_extensions import Literal -from ...._models import BaseModel -from ...shared.chat_completion_token_logprob import ChatCompletionTokenLogprob +from ..._models import BaseModel +from ..shared.chat_completion_token_logprob import ChatCompletionTokenLogprob __all__ = ["ChatCompletionChunk", "Choice", "ChoiceDelta", "ChoiceLogprobs", "Usage"] diff --git a/src/gradientai/types/chat/completion_create_params.py b/src/gradientai/types/chat/completion_create_params.py index 11d032ff..ec5c6b70 100644 --- a/src/gradientai/types/chat/completion_create_params.py +++ b/src/gradientai/types/chat/completion_create_params.py @@ -6,17 +6,19 @@ from typing_extensions import Literal, Required, TypeAlias, TypedDict __all__ = [ - "CompletionCreateParams", + "CompletionCreateParamsBase", 
"Message", "MessageChatCompletionRequestSystemMessage", "MessageChatCompletionRequestDeveloperMessage", "MessageChatCompletionRequestUserMessage", "MessageChatCompletionRequestAssistantMessage", "StreamOptions", + "CompletionCreateParamsNonStreaming", + "CompletionCreateParamsStreaming", ] -class CompletionCreateParams(TypedDict, total=False): +class CompletionCreateParamsBase(TypedDict, total=False): messages: Required[Iterable[Message]] """A list of messages comprising the conversation so far.""" @@ -92,12 +94,6 @@ class CompletionCreateParams(TypedDict, total=False): The returned text will not contain the stop sequence. """ - stream: Optional[bool] - """ - If set to true, the model response data will be streamed to the client as it is - generated using server-sent events. - """ - stream_options: Optional[StreamOptions] """Options for streaming response. Only set this when you set `stream: true`.""" @@ -183,3 +179,22 @@ class StreamOptions(TypedDict, total=False): **NOTE:** If the stream is interrupted, you may not receive the final usage chunk which contains the total token usage for the request. """ + + +class CompletionCreateParamsNonStreaming(CompletionCreateParamsBase, total=False): + stream: Optional[Literal[False]] + """ + If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + """ + + +class CompletionCreateParamsStreaming(CompletionCreateParamsBase): + stream: Required[Literal[True]] + """ + If set to true, the model response data will be streamed to the client as it is + generated using server-sent events. + """ + + +CompletionCreateParams = Union[CompletionCreateParamsNonStreaming, CompletionCreateParamsStreaming] diff --git a/tests/api_resources/chat/test_completions.py b/tests/api_resources/chat/test_completions.py index b4c09579..25b8419a 100644 --- a/tests/api_resources/chat/test_completions.py +++ b/tests/api_resources/chat/test_completions.py @@ -19,7 +19,7 @@ class TestCompletions: @pytest.mark.skip() @parametrize - def test_method_create(self, client: GradientAI) -> None: + def test_method_create_overload_1(self, client: GradientAI) -> None: completion = client.chat.completions.create( messages=[ { @@ -33,7 +33,7 @@ def test_method_create(self, client: GradientAI) -> None: @pytest.mark.skip() @parametrize - def test_method_create_with_all_params(self, client: GradientAI) -> None: + def test_method_create_with_all_params_overload_1(self, client: GradientAI) -> None: completion = client.chat.completions.create( messages=[ { @@ -51,7 +51,7 @@ def test_method_create_with_all_params(self, client: GradientAI) -> None: n=1, presence_penalty=-2, stop="\n", - stream=True, + stream=False, stream_options={"include_usage": True}, temperature=1, top_logprobs=0, @@ -62,7 +62,7 @@ def test_method_create_with_all_params(self, client: GradientAI) -> None: @pytest.mark.skip() @parametrize - def test_raw_response_create(self, client: GradientAI) -> None: + def test_raw_response_create_overload_1(self, client: GradientAI) -> None: response = client.chat.completions.with_raw_response.create( messages=[ { @@ -80,7 +80,7 @@ def test_raw_response_create(self, client: GradientAI) -> None: @pytest.mark.skip() @parametrize - def test_streaming_response_create(self, client: GradientAI) -> None: + def test_streaming_response_create_overload_1(self, client: GradientAI) -> None: with client.chat.completions.with_streaming_response.create( messages=[ { @@ -98,6 +98,89 @@ def test_streaming_response_create(self, client: GradientAI) -> 
None: assert cast(Any, response.is_closed) is True + @pytest.mark.skip() + @parametrize + def test_method_create_overload_2(self, client: GradientAI) -> None: + completion_stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + completion_stream.response.close() + + @pytest.mark.skip() + @parametrize + def test_method_create_with_all_params_overload_2(self, client: GradientAI) -> None: + completion_stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + frequency_penalty=-2, + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=256, + max_tokens=0, + metadata={"foo": "string"}, + n=1, + presence_penalty=-2, + stop="\n", + stream_options={"include_usage": True}, + temperature=1, + top_logprobs=0, + top_p=1, + user="user-1234", + ) + completion_stream.response.close() + + @pytest.mark.skip() + @parametrize + def test_raw_response_create_overload_2(self, client: GradientAI) -> None: + response = client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = response.parse() + stream.close() + + @pytest.mark.skip() + @parametrize + def test_streaming_response_create_overload_2(self, client: GradientAI) -> None: + with client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = response.parse() + stream.close() + + assert cast(Any, response.is_closed) is True + class TestAsyncCompletions: parametrize = pytest.mark.parametrize( @@ -106,7 +189,7 @@ class TestAsyncCompletions: @pytest.mark.skip() @parametrize - async def test_method_create(self, async_client: AsyncGradientAI) -> None: + async def test_method_create_overload_1(self, async_client: AsyncGradientAI) -> None: completion = await async_client.chat.completions.create( messages=[ { @@ -120,7 +203,7 @@ async def test_method_create(self, async_client: AsyncGradientAI) -> None: @pytest.mark.skip() @parametrize - async def test_method_create_with_all_params(self, async_client: AsyncGradientAI) -> None: + async def test_method_create_with_all_params_overload_1(self, async_client: AsyncGradientAI) -> None: completion = await async_client.chat.completions.create( messages=[ { @@ -138,7 +221,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncGradientAI n=1, presence_penalty=-2, stop="\n", - stream=True, + stream=False, stream_options={"include_usage": True}, temperature=1, top_logprobs=0, @@ -149,7 +232,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncGradientAI @pytest.mark.skip() @parametrize - async def test_raw_response_create(self, async_client: AsyncGradientAI) -> None: + async def test_raw_response_create_overload_1(self, async_client: AsyncGradientAI) -> None: response = await async_client.chat.completions.with_raw_response.create( messages=[ { @@ -167,7 +250,7 @@ async def test_raw_response_create(self, async_client: AsyncGradientAI) -> None: @pytest.mark.skip() @parametrize - async def test_streaming_response_create(self, async_client: 
AsyncGradientAI) -> None: + async def test_streaming_response_create_overload_1(self, async_client: AsyncGradientAI) -> None: async with async_client.chat.completions.with_streaming_response.create( messages=[ { @@ -184,3 +267,86 @@ async def test_streaming_response_create(self, async_client: AsyncGradientAI) -> assert_matches_type(CompletionCreateResponse, completion, path=["response"]) assert cast(Any, response.is_closed) is True + + @pytest.mark.skip() + @parametrize + async def test_method_create_overload_2(self, async_client: AsyncGradientAI) -> None: + completion_stream = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + await completion_stream.response.aclose() + + @pytest.mark.skip() + @parametrize + async def test_method_create_with_all_params_overload_2(self, async_client: AsyncGradientAI) -> None: + completion_stream = await async_client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + frequency_penalty=-2, + logit_bias={"foo": 0}, + logprobs=True, + max_completion_tokens=256, + max_tokens=0, + metadata={"foo": "string"}, + n=1, + presence_penalty=-2, + stop="\n", + stream_options={"include_usage": True}, + temperature=1, + top_logprobs=0, + top_p=1, + user="user-1234", + ) + await completion_stream.response.aclose() + + @pytest.mark.skip() + @parametrize + async def test_raw_response_create_overload_2(self, async_client: AsyncGradientAI) -> None: + response = await async_client.chat.completions.with_raw_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) + + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + stream = await response.parse() + await stream.close() + + @pytest.mark.skip() + @parametrize + async def test_streaming_response_create_overload_2(self, async_client: AsyncGradientAI) -> None: + async with async_client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "system", + } + ], + model="llama3-8b-instruct", + stream=True, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + stream = await response.parse() + await stream.close() + + assert cast(Any, response.is_closed) is True From 593e13b1077d38a81a80a2e473031f197143374c Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:58:35 +0000 Subject: [PATCH 2/2] release: 0.1.0-alpha.9 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ pyproject.toml | 2 +- src/gradientai/_version.py | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index c373724d..46b9b6b2 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0-alpha.8" + ".": "0.1.0-alpha.9" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index b26d4058..4d5ad5ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.1.0-alpha.9 (2025-06-28) + +Full Changelog: [v0.1.0-alpha.8...v0.1.0-alpha.9](https://github.com/digitalocean/gradientai-python/compare/v0.1.0-alpha.8...v0.1.0-alpha.9) + +### Features + +* **api:** manual updates 
([e0c210a](https://github.com/digitalocean/gradientai-python/commit/e0c210a0ffde24bd2c5877689f8ab222288cc597)) + ## 0.1.0-alpha.8 (2025-06-27) Full Changelog: [v0.1.0-alpha.7...v0.1.0-alpha.8](https://github.com/digitalocean/gradientai-python/compare/v0.1.0-alpha.7...v0.1.0-alpha.8) diff --git a/pyproject.toml b/pyproject.toml index 60a58f89..87c4aeeb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "c63a5cfe-b235-4fbe-8bbb-82a9e02a482a-python" -version = "0.1.0-alpha.8" +version = "0.1.0-alpha.9" description = "The official Python library for GradientAI" dynamic = ["readme"] license = "Apache-2.0" diff --git a/src/gradientai/_version.py b/src/gradientai/_version.py index 8c8f2b63..5cd8ca49 100644 --- a/src/gradientai/_version.py +++ b/src/gradientai/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "gradientai" -__version__ = "0.1.0-alpha.8" # x-release-please-version +__version__ = "0.1.0-alpha.9" # x-release-please-version
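
Reviewer note: below is a minimal usage sketch of the streaming surface this patch adds. It is illustrative only, not part of the patch — the model ID is a placeholder, the client is assumed to pick up its inference key from its defaults or environment (the resource raises a TypeError when `inference_key` is missing), and the returned `Stream`/`AsyncStream` objects are assumed to be iterable per the generated client's conventions.

# Synchronous streaming: passing stream=True selects the Literal[True] overload
# and returns a Stream[ChatCompletionChunk] instead of a CompletionCreateResponse.
from gradientai import GradientAI

client = GradientAI()  # assumes the inference key is configured; otherwise pass inference_key=... (hypothetical kwarg)

stream = client.chat.completions.create(
    messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
    model="llama3-8b-instruct",  # placeholder model ID
    stream=True,
)
for chunk in stream:
    # Each ChatCompletionChunk carries a delta; content may be None on some chunks.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)

# Asynchronous streaming mirrors the sync call and yields the same chunk type
# via AsyncStream[ChatCompletionChunk].
import asyncio
from gradientai import AsyncGradientAI

async def main() -> None:
    async_client = AsyncGradientAI()
    async_stream = await async_client.chat.completions.create(
        messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
        model="llama3-8b-instruct",  # placeholder model ID
        stream=True,
    )
    async for chunk in async_stream:
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)

asyncio.run(main())

The agent-scoped endpoint (`client.agents.chat.completions.create(..., stream=True)`) follows the same pattern but, per this patch, yields `AgentChatCompletionChunk` items instead.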