cloudflare · mchenco · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025
@@ -8,6 +8,8 @@ import google from "../../assets/images/workers-ai/google.svg";
 import deepseek from "../../assets/images/workers-ai/deepseek.svg";
 import qwen from "../../assets/images/workers-ai/qwen.svg";
 import blackforestlabs from "../../assets/images/workers-ai/blackforestlabs.svg";
+import deepgram from "../../assets/images/workers-ai/deepgram.svg";
+import leonardo from "../../assets/images/workers-ai/leonardo.svg";
 
 export const authorData: Record<string, { name: string; logo: string }> = {
 	openai: {
@@ -54,4 +56,12 @@ export const authorData: Record<string, { name: string; logo: string }> = {
 		name: "Black Forest Labs",
 		logo: blackforestlabs.src,
 	},
+	deepgram: {
+		name: "Deepgram",
+		logo: deepgram.src,
+	},
+	leonardo: {
+		name: "Leonardo",
+		logo: leonardo.src,
+	},
 };
@@ -0,0 +1,57 @@
+---
+title: Deepgram and Leonardo partner models now available on Workers AI
+description: State-of-the-art TTS, STT and image generation models, hosted on Workers AI infrastructure
+products:
+  - workers-ai
+date: 2025-08-27
+---
+
+New state-of-the-art models have landed on Workers AI! This time, we're introducing new **partner models** trained by our friends at [Deepgram](https://deepgram.com) and [Leonardo](https://leonardo.ai), hosted on Workers AI infrastructure.
+
+We're also introducing a new turn detection model that enables you to detect when someone is done speaking — useful for building voice agents!
+
+Read the [blog](https://blog.cloudflare.com/workers-ai-partner-models) for more details and check out some of the new models on our platform:
+- [`@cf/deepgram/aura-1`](/workers-ai/models/aura-1) is a text-to-speech model that allows you to input text and have it come to life in a customizable voice
+- [`@cf/deepgram/nova-3`](/workers-ai/models/nova-3) is a speech-to-text model that transcribes multilingual audio at a blazingly fast speed
+- [`@cf/pipecat-ai/smart-turn-v2`](/workers-ai/models/smart-turn-v2) helps you detect when someone is done speaking
+- [`@cf/leonardo/lucid-origin`](/workers-ai/models/lucid-origin) is a text-to-image model that generates images with sharp graphic design, stunning full-HD renders, or highly specific creative direction
+- [`@cf/leonardo/phoenix-1.0`](/workers-ai/models/phoenix-1.0) is a text-to-image model with exceptional prompt adherence and coherent text
+
+You can filter for the new partner models using the `Partner` capability on our [Models](/workers-ai/models) page.
+
+We're also introducing WebSocket support for some of our audio models, which you can filter for through the `Realtime` capability on our [Models](/workers-ai/models) page. WebSockets allow you to create a bi-directional connection to our inference server with low latency — perfect for those who are building voice agents.
+
+Here is an example Python snippet showing how to use WebSockets with our new Aura model:
+
+```python
+import json
+import os
+import asyncio
+import websockets
+
+uri = f"wss://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/@cf/deepgram/aura-1"
+
+input = [
+    "Line one, out of three lines that will be provided to the aura model.",
+    "Line two, out of three lines that will be provided to the aura model.",
+    "Line three, out of three lines that will be provided to the aura model. This is a last line.",
+]
+
+
+async def text_to_speech():
+    async with websockets.connect(uri, additional_headers={"Authorization": os.getenv("CF_TOKEN")}) as websocket:
+        print("connection established")
+        for line in input:
+            print(f"sending `{line}`")
+            await websocket.send(json.dumps({"type": "Speak", "text": line}))
+
+            print("line was sent, flushing")
+            await websocket.send(json.dumps({"type": "Flush"}))
+            print("flushed, recving")
+            resp = await websocket.recv()
+            print(f"response received {resp}")
+
+
+if __name__ == "__main__":
+    asyncio.run(text_to_speech())
+```
@@ -66,15 +66,30 @@ The Price in Tokens column is equivalent to the Price in Neurons column - the di
 | @cf/baai/bge-large-en-v1.5 | $0.204 per M input tokens | 18582 neurons per M input tokens |
 | @cf/baai/bge-m3            | $0.012 per M input tokens | 1075 neurons per M input tokens  |
 
-## Other model pricing
+## Image model pricing
 
 | Model                                 | Price in Tokens                                            | Price in Neurons                                                         |
 | ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ |
 | @cf/black-forest-labs/flux-1-schnell  | $0.0000528 per 512x512 tile <br/> $0.0001056 per step      | 4.80 neurons per 512x512 tile <br/> 9.60 neurons per step                |
-| @cf/huggingface/distilbert-sst-2-int8 | $0.026 per M input tokens                                  | 2394 neurons per M input tokens                                          |
-| @cf/baai/bge-reranker-base            | $0.003 per M input tokens                                  | 283 neurons per M input tokens                                           |
-| @cf/meta/m2m100-1.2b                  | $0.342 per M input tokens <br/> $0.342 per M output tokens | 31050 neurons per M input tokens <br/> 31050 neurons per M output tokens |
-| @cf/microsoft/resnet-50               | $2.51 per M images                                         | 228055 neurons per M images                                              |
+| @cf/leonardo/lucid-origin    | $0.006996 per 512x512 tile <br/> $0.000132 per step | 636.00 neurons per 512x512 tile <br/> 12.00 neurons per step |
+| @cf/leonardo/phoenix-1.0     | $0.005830 per 512x512 tile <br/> $0.000110 per step | 530.00 neurons per 512x512 tile <br/> 10.00 neurons per step |
+
+## Audio model pricing
+
+| Model                                 | Price in Tokens                                            | Price in Neurons                                                         |
+| ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ |
 | @cf/openai/whisper                    | $0.0005 per audio minute                                   | 41.14 neurons per audio minute                                           |
 | @cf/openai/whisper-large-v3-turbo     | $0.0005 per audio minute                                   | 46.63 neurons per audio minute                                           |
 | @cf/myshell-ai/melotts                | $0.0002 per audio minute                                   | 18.63 neurons per audio minute                                           |
+| @cf/deepgram/aura-1          | $0.015 per 1k characters input                      | 1363.64 neurons per 1k characters input                      |
+| @cf/deepgram/nova-3          | $0.0052 per audio minute output                     | 472.73 neurons per audio minute output                       |
+| @cf/pipecat-ai/smart-turn-v2 | $0.00033795 per audio minute input                  | 30.72 neurons per audio minute input                         |
+
+## Other model pricing
+
+| Model                                 | Price in Tokens                                            | Price in Neurons                                                         |
+| ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ |
+| @cf/huggingface/distilbert-sst-2-int8 | $0.026 per M input tokens                                  | 2394 neurons per M input tokens                                          |
+| @cf/baai/bge-reranker-base            | $0.003 per M input tokens                                  | 283 neurons per M input tokens                                           |
+| @cf/meta/m2m100-1.2b                  | $0.342 per M input tokens <br/> $0.342 per M output tokens | 31050 neurons per M input tokens <br/> 31050 neurons per M output tokens |
+| @cf/microsoft/resnet-50               | $2.51 per M images                                         | 228055 neurons per M images                                              |
@@ -3,6 +3,21 @@ link: "/workers-ai/changelog/"
 productName: Workers AI
 productLink: "/workers-ai/"
 entries:
+  - publish_date: "2025-08-27"
+    title: Introducing Partner models to the Workers AI catalog
+    description: |-
+      - Read the [blog](https://blog.cloudflare.com/workers-ai-partner-models) for more details
+      - [`@cf/deepgram/aura-1`](/workers-ai/models/aura-1) is a text-to-speech model that allows you to input text and have it come to life in a customizable voice
+      - [`@cf/deepgram/nova-3`](/workers-ai/models/nova-3) is a speech-to-text model that transcribes multilingual audio at a blazingly fast speed
+      - [`@cf/pipecat-ai/smart-turn-v2`](/workers-ai/models/smart-turn-v2) helps you detect when someone is done speaking
+      - [`@cf/leonardo/lucid-origin`](/workers-ai/models/lucid-origin) is a text-to-image model that generates images with sharp graphic design, stunning full-HD renders, or highly specific creative direction
+      - [`@cf/leonardo/phoenix-1.0`](/workers-ai/models/phoenix-1.0) is a text-to-image model with exceptional prompt adherence and coherent text
+      - WebSocket support added for audio models like `@cf/deepgram/aura-1`, `@cf/deepgram/nova-3`, `@cf/pipecat-ai/smart-turn-v2`
+  - publish_date: "2025-08-05"
+    title: Adding gpt-oss models to our catalog
+    description: |-
+      - Check out the [blog](https://blog.cloudflare.com/openai-gpt-oss-on-workers-ai) for more details about the new models
+      - Take a look at the [`gpt-oss-120b`](/workers-ai/models/gpt-oss-120b) and [`gpt-oss-20b`](/workers-ai/models/gpt-oss-20b) model pages for more information about schemas, pricing, and context windows
   - publish_date: "2025-04-09"
     title: Pricing correction for @cf/myshell-ai/melotts
     description: |-

@@ -0,0 +1,106 @@
+{
+    "id": "1f55679f-009e-4456-aa4f-049a62b4b6a0",
+    "source": 1,
+    "name": "@cf/deepgram/aura-1",
+    "description": "Aura is a context-aware text-to-speech (TTS) model that applies natural pacing, expressiveness, and fillers based on the context of the provided text. The quality of your text input directly impacts the naturalness of the audio output.",
+    "task": {
+        "id": "b52660a1-9a95-4ab2-8b1d-f232be34604a",
+        "name": "Text-to-Speech",
+        "description": "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages."
+    },
+    "created_at": "2025-08-27 01:18:18.880",
+    "tags": [],
+    "properties": [
+        {
+            "property_id": "async_queue",
+            "value": "true"
+        },
+        {
+            "property_id": "partner",
+            "value": "true"
+        },
+        {
+            "property_id": "realtime",
+            "value": "true"
+        },
+        {
+            "property_id": "price",
+            "value": [
+                {
+                    "unit": "per 1k characters",
+                    "price": 0.0150,
+                    "currency": "USD"
+                }
+            ]
+        }
+    ],
+    "schema": {
+        "input": {
+            "type": "object",
+            "properties": {
+                "speaker": {
+                    "type": "string",
+                    "enum": [
+                        "angus",
+                        "asteria",
+                        "arcas",
+                        "orion",
+                        "orpheus",
+                        "athena",
+                        "luna",
+                        "zeus",
+                        "perseus",
+                        "helios",
+                        "hera",
+                        "stella"
+                    ],
+                    "default": "angus",
+                    "description": "Speaker used to produce the audio."
+                },
+                "encoding": {
+                    "type": "string",
+                    "enum": [
+                        "linear16",
+                        "flac",
+                        "mulaw",
+                        "alaw",
+                        "mp3",
+                        "opus",
+                        "aac"
+                    ],
+                    "description": "Encoding of the output audio."
+                },
+                "container": {
+                    "type": "string",
+                    "enum": [
+                        "none",
+                        "wav",
+                        "ogg"
+                    ],
+                    "description": "Container specifies the file format wrapper for the output audio. The available options depend on the encoding type."
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text content to be converted to speech"
+                },
+                "sample_rate": {
+                    "type": "number",
+                    "description": "Sample Rate specifies the sample rate for the output audio. Based on the encoding, different sample rates are supported. For some encodings, the sample rate is not configurable"
+                },
+                "bit_rate": {
+                    "type": "number",
+                    "description": "The bitrate of the audio in bits per second. Choose from predefined ranges or specific values based on the encoding type."
+                }
+            },
+            "required": [
+                "text"
+            ]
+        },
+        "output": {
+            "type": "string",
+            "contentType": "audio/mpeg",
+            "format": "binary",
+            "description": "The generated audio in MP3 format"
+        }
+    }
+}
@@ -0,0 +1,92 @@
+{
+    "id": "0e372c11-8720-46c9-a02d-666188a22dae",
+    "source": 1,
+    "name": "@cf/leonardo/lucid-origin",
+    "description": "Lucid Origin from Leonardo.AI is their most adaptable and prompt-responsive model to date. Whether you're generating images with sharp graphic design, stunning full-HD renders, or highly specific creative direction, it adheres closely to your prompts, renders text with accuracy, and supports a wide array of visual styles and aesthetics – from stylized concept art to crisp product mockups.\n",
+    "task": {
+        "id": "3d6e1f35-341b-4915-a6c8-9a7142a9033a",
+        "name": "Text-to-Image",
+        "description": "Generates images from input text. These models can be used to generate and modify images based on text prompts."
+    },
+    "created_at": "2025-08-25 19:21:28.770",
+    "tags": [],
+    "properties": [
+        {
+            "property_id": "partner",
+            "value": "true"
+        },
+        {
+            "property_id": "price",
+            "value": [
+                {
+                    "unit": "per 512 by 512 tile",
+                    "price": 0.007,
+                    "currency": "USD"
+                },
+                {
+                    "unit": "per step",
+                    "price": 0.00013,
+                    "currency": "USD"
+                }
+            ]
+        }
+    ],
+    "schema": {
+        "input": {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "minLength": 1,
+                    "description": "A text description of the image you want to generate."
+                },
+                "guidance": {
+                    "type": "number",
+                    "default": 4.5,
+                    "minimum": 0,
+                    "maximum": 10,
+                    "description": "Controls how closely the generated image should adhere to the prompt; higher values make the image more aligned with the prompt"
+                },
+                "seed": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "description": "Random seed for reproducibility of the image generation"
+                },
+                "height": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "maximum": 2500,
+                    "default": 1120,
+                    "description": "The height of the generated image in pixels"
+                },
+                "width": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "maximum": 2500,
+                    "default": 1120,
+                    "description": "The width of the generated image in pixels"
+                },
+                "num_steps": {
+                    "type": "integer",
+                    "default": 4,
+                    "minimum": 1,
+                    "maximum": 40,
+                    "description": "The number of diffusion steps; higher values can improve quality but take longer"
+                }
+            },
+            "required": [
+                "prompt"
+            ]
+        },
+        "output": {
+            "type": "object",
+            "contentType": "application/json",
+            "properties": {
+                "image": {
+                    "type": "string",
+                    "description": "The generated image in Base64 format."
+                }
+            }
+        }
+    }
+}