From b5bd75650cb2079ba8c11e8930edc5e7526023db Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 01:41:05 -0400 Subject: [PATCH 1/8] model pages --- src/content/workers-ai-models/aura-1.json | 106 ++++++ .../workers-ai-models/lucid-origin.json | 92 ++++++ src/content/workers-ai-models/nova-3.json | 308 ++++++++++++++++++ .../workers-ai-models/phoenix-1.0.json | 93 ++++++ 4 files changed, 599 insertions(+) create mode 100644 src/content/workers-ai-models/aura-1.json create mode 100644 src/content/workers-ai-models/lucid-origin.json create mode 100644 src/content/workers-ai-models/nova-3.json create mode 100644 src/content/workers-ai-models/phoenix-1.0.json diff --git a/src/content/workers-ai-models/aura-1.json b/src/content/workers-ai-models/aura-1.json new file mode 100644 index 00000000000000..a83c566ed831be --- /dev/null +++ b/src/content/workers-ai-models/aura-1.json @@ -0,0 +1,106 @@ +{ + "id": "1f55679f-009e-4456-aa4f-049a62b4b6a0", + "source": 1, + "name": "@cf/deepgram/aura-1", + "description": "Aura is a context-aware text-to-speech (TTS) model that applies natural pacing, expressiveness, and fillers based on the context of the provided text. The quality of your text input directly impacts the naturalness of the audio output.", + "task": { + "id": "b52660a1-9a95-4ab2-8b1d-f232be34604a", + "name": "Text-to-Speech", + "description": "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages." 
+ }, + "created_at": "2025-08-27 01:18:18.880", + "tags": [], + "properties": [ + { + "property_id": "async_queue", + "value": "true" + }, + { + "property_id": "partner", + "value": "true" + }, + { + "property_id": "realtime", + "value": "true" + }, + { + "property_id": "price", + "value": [ + { + "unit": "per 1k characters", + "price": 0.0150, + "currency": "USD" + } + ] + } + ], + "schema": { + "input": { + "type": "object", + "properties": { + "speaker": { + "type": "string", + "enum": [ + "angus", + "asteria", + "arcas", + "orion", + "orpheus", + "athena", + "luna", + "zeus", + "perseus", + "helios", + "hera", + "stella" + ], + "default": "angus", + "description": "Speaker used to produce the audio." + }, + "encoding": { + "type": "string", + "enum": [ + "linear16", + "flac", + "mulaw", + "alaw", + "mp3", + "opus", + "aac" + ], + "description": "Encoding of the output audio." + }, + "container": { + "type": "string", + "enum": [ + "none", + "wav", + "ogg" + ], + "description": "Container specifies the file format wrapper for the output audio. The available options depend on the encoding type.." + }, + "text": { + "type": "string", + "description": "The text content to be converted to speech" + }, + "sample_rate": { + "type": "number", + "description": "Sample Rate specifies the sample rate for the output audio. Based on the encoding, different sample rates are supported. For some encodings, the sample rate is not configurable" + }, + "bit_rate": { + "type": "number", + "description": "The bitrate of the audio in bits per second. Choose from predefined ranges or specific values based on the encoding type." 
+ } + }, + "required": [ + "text" + ] + }, + "output": { + "type": "string", + "contentType": "audio/mpeg", + "format": "binary", + "description": "The generated audio in MP3 format" + } + } +} \ No newline at end of file diff --git a/src/content/workers-ai-models/lucid-origin.json b/src/content/workers-ai-models/lucid-origin.json new file mode 100644 index 00000000000000..a2b1780a5f3b6e --- /dev/null +++ b/src/content/workers-ai-models/lucid-origin.json @@ -0,0 +1,92 @@ +{ + "id": "0e372c11-8720-46c9-a02d-666188a22dae", + "source": 1, + "name": "@cf/leonardo/lucid-origin", + "description": "Lucid Origin from Leonardo.AI is their most adaptable and prompt-responsive model to date. Whether you're generating images with sharp graphic design, stunning full-HD renders, or highly specific creative direction, it adheres closely to your prompts, renders text with accuracy, and supports a wide array of visual styles and aesthetics – from stylized concept art to crisp product mockups.\n", + "task": { + "id": "3d6e1f35-341b-4915-a6c8-9a7142a9033a", + "name": "Text-to-Image", + "description": "Generates images from input text. These models can be used to generate and modify images based on text prompts." + }, + "created_at": "2025-08-25 19:21:28.770", + "tags": [], + "properties": [ + { + "property_id": "partner", + "value": "true" + }, + { + "property_id": "price", + "value": [ + { + "unit": "per 512 by 512 tile", + "price": 0.007, + "currency": "USD" + }, + { + "unit": "per step", + "price": 0.00013, + "currency": "USD" + } + ] + } + ], + "schema": { + "input": { + "type": "object", + "properties": { + "prompt": { + "type": "string", + "minLength": 1, + "description": "A text description of the image you want to generate." 
+ }, + "guidance": { + "type": "number", + "default": 4.5, + "minimum": 0, + "maximum": 10, + "description": "Controls how closely the generated image should adhere to the prompt; higher values make the image more aligned with the prompt" + }, + "seed": { + "type": "integer", + "minimum": 0, + "description": "Random seed for reproducibility of the image generation" + }, + "height": { + "type": "integer", + "minimum": 0, + "maximum": 2500, + "default": 1120, + "description": "The height of the generated image in pixels" + }, + "width": { + "type": "integer", + "minimum": 0, + "maximum": 2500, + "default": 1120, + "description": "The width of the generated image in pixels" + }, + "num_steps": { + "type": "integer", + "default": 4, + "minimum": 1, + "maximum": 40, + "description": "The number of diffusion steps; higher values can improve quality but take longer" + } + }, + "required": [ + "prompt" + ] + }, + "output": { + "type": "object", + "contentType": "application/json", + "properties": { + "image": { + "type": "string", + "description": "The generated image in Base64 format." + } + } + } + } +} \ No newline at end of file diff --git a/src/content/workers-ai-models/nova-3.json b/src/content/workers-ai-models/nova-3.json new file mode 100644 index 00000000000000..653516920e6acb --- /dev/null +++ b/src/content/workers-ai-models/nova-3.json @@ -0,0 +1,308 @@ +{ + "id": "a226909f-eef8-4265-a3a0-90db0422762e", + "source": 1, + "name": "@cf/deepgram/nova-3", + "description": "Transcribe audio using Deepgram’s speech-to-text model", + "task": { + "id": "dfce1c48-2a81-462e-a7fd-de97ce985207", + "name": "Automatic Speech Recognition", + "description": "Automatic speech recognition (ASR) models convert a speech signal, typically an audio input, to text." 
+ }, + "created_at": "2025-06-05 16:05:15.199", + "tags": [], + "properties": [ + { + "property_id": "async_queue", + "value": "true" + }, + { + "property_id": "price", + "value": [ + { + "unit": "per audio minute", + "price": 0.0052, + "currency": "USD" + } + ] + }, + { + "property_id": "partner", + "value": "true" + }, + { + "property_id": "realtime", + "value": "true" + } + ], + "schema": { + "input": { + "type": "object", + "properties": { + "audio": { + "type": "object", + "properties": { + "body": { + "type": "object" + }, + "contentType": { + "type": "string" + } + }, + "required": [ + "body", + "contentType" + ] + }, + "custom_topic_mode": { + "type": "string", + "enum": [ + "extended", + "strict" + ], + "description": "Sets how the model will interpret strings submitted to the custom_topic param. When strict, the model will only return topics submitted using the custom_topic param. When extended, the model will return its own detected topics in addition to those submitted using the custom_topic param." + }, + "custom_topic": { + "type": "string", + "description": "Custom topics you want the model to detect within your input audio or text if present Submit up to 100" + }, + "custom_intent_mode": { + "type": "string", + "description": "Sets how the model will interpret intents submitted to the custom_intent param. When strict, the model will only return intents submitted using the custom_intent param. 
When extended, the model will return its own detected intents in addition those submitted using the custom_intents param", + "enum": [ + "extended", + "strict" + ] + }, + "custom_intent": { + "type": "string", + "description": "Custom intents you want the model to detect within your input audio if present" + }, + "detect_entities": { + "type": "boolean", + "description": "Identifies and extracts key entities from content in submitted audio" + }, + "detect_language": { + "type": "boolean", + "description": "Identifies the dominant language spoken in submitted audio" + }, + "diarize": { + "type": "boolean", + "description": "Recognize speaker changes. Each word in the transcript will be assigned a speaker number starting at 0" + }, + "dictation": { + "type": "boolean", + "description": "Identify and extract key entities from content in submitted audio" + }, + "encoding": { + "type": "string", + "description": "Specify the expected encoding of your submitted audio", + "enum": [ + "linear16", + "flac", + "mulaw", + "amr-nb", + "amr-wb", + "opus", + "speex", + "g729" + ] + }, + "extra": { + "type": "string", + "description": "Arbitrary key-value pairs that are attached to the API response for usage in downstream processing" + }, + "filter_words": { + "type": "boolean", + "description": "Filler Words can help transcribe interruptions in your audio, like 'uh' and 'um'" + }, + "keyterm": { + "type": "string", + "description": "Key term prompting can boost or suppress specialized terminology and brands." + }, + "keywords": { + "type": "string", + "description": "Keywords can boost or suppress specialized terminology and brands." + }, + "language": { + "type": "string", + "description": "The BCP-47 language tag that hints at the primary spoken language. Depending on the Model and API endpoint you choose only certain languages are available." 
+ }, + "measurements": { + "type": "boolean", + "description": "Spoken measurements will be converted to their corresponding abbreviations." + }, + "mip_opt_out": { + "type": "boolean", + "description": "Opts out requests from the Deepgram Model Improvement Program. Refer to our Docs for pricing impacts before setting this to true. https://dpgr.am/deepgram-mip." + }, + "mode": { + "type": "string", + "description": "Mode of operation for the model representing broad area of topic that will be talked about in the supplied audio", + "enum": [ + "general", + "medical", + "finance" + ] + }, + "multichannel": { + "type": "boolean", + "description": "Transcribe each audio channel independently." + }, + "numerals": { + "type": "boolean", + "description": "Numerals converts numbers from written format to numerical format." + }, + "paragraphs": { + "type": "boolean", + "description": "Splits audio into paragraphs to improve transcript readability." + }, + "profanity_filter": { + "type": "boolean", + "description": "Profanity Filter looks for recognized profanity and converts it to the nearest recognized non-profane word or removes it from the transcript completely." + }, + "punctuate": { + "type": "boolean", + "description": "Add punctuation and capitalization to the transcript." + }, + "redact": { + "type": "string", + "description": "Redaction removes sensitive information from your transcripts." + }, + "replace": { + "type": "string", + "description": "Search for terms or phrases in submitted audio and replaces them." + }, + "search": { + "type": "string", + "description": "Search for terms or phrases in submitted audio." + }, + "sentiment": { + "type": "boolean", + "description": "Recognizes the sentiment throughout a transcript or text." + }, + "smart_format": { + "type": "boolean", + "description": "Apply formatting to transcript output. When set to true, additional formatting will be applied to transcripts to improve readability." 
+ }, + "topics": { + "type": "boolean", + "description": "Detect topics throughout a transcript or text." + }, + "utterances": { + "type": "boolean", + "description": "Segments speech into meaningful semantic units." + }, + "utt_split": { + "type": "number", + "description": "Seconds to wait before detecting a pause between words in submitted audio." + } + }, + "required": [ + "audio" + ] + }, + "output": { + "type": "object", + "contentType": "application/json", + "properties": { + "results": { + "type": "object", + "properties": { + "channels": { + "type": "array", + "items": { + "type": "object", + "properties": { + "alternatives": { + "type": "array", + "items": { + "type": "object", + "properties": { + "confidence": { + "type": "number" + }, + "transcript": { + "type": "string" + }, + "words": { + "type": "array", + "items": { + "type": "object", + "properties": { + "confidence": { + "type": "number" + }, + "end": { + "type": "number" + }, + "start": { + "type": "number" + }, + "word": { + "type": "string" + } + } + } + } + } + } + } + } + } + }, + "summary": { + "type": "object", + "properties": { + "result": { + "type": "string" + }, + "short": { + "type": "string" + } + } + }, + "sentiments": { + "type": "object", + "properties": { + "segments": { + "type": "array", + "items": { + "type": "object", + "properties": { + "text": { + "type": "string" + }, + "start_word": { + "type": "number" + }, + "end_word": { + "type": "number" + }, + "sentiment": { + "type": "string" + }, + "sentiment_score": { + "type": "number" + } + } + } + }, + "average": { + "type": "object", + "properties": { + "sentiment": { + "type": "string" + }, + "sentiment_score": { + "type": "number" + } + } + } + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/src/content/workers-ai-models/phoenix-1.0.json b/src/content/workers-ai-models/phoenix-1.0.json new file mode 100644 index 00000000000000..58e02d81e70a71 --- /dev/null +++ 
b/src/content/workers-ai-models/phoenix-1.0.json @@ -0,0 +1,93 @@ +{ + "id": "724608fa-983e-495d-b95c-340d6b7e78be", + "source": 1, + "name": "@cf/leonardo/phoenix-1.0", + "description": "Phoenix 1.0 is a model by Leonardo.Ai that generates images with exceptional prompt adherence and coherent text.", + "task": { + "id": "3d6e1f35-341b-4915-a6c8-9a7142a9033a", + "name": "Text-to-Image", + "description": "Generates images from input text. These models can be used to generate and modify images based on text prompts." + }, + "created_at": "2025-08-25 18:12:18.073", + "tags": [], + "properties": [ + { + "property_id": "price", + "value": [ + { + "unit": "per 512 by 512 tile", + "price": 0.0058, + "currency": "USD" + }, + { + "unit": "per step", + "price": 0.00011, + "currency": "USD" + } + ] + }, + { + "property_id": "partner", + "value": "true" + }, + ], + "schema": { + "input": { + "type": "object", + "properties": { + "prompt": { + "type": "string", + "minLength": 1, + "description": "A text description of the image you want to generate." 
+ }, + "guidance": { + "type": "number", + "default": 2, + "minimum": 2, + "maximum": 10, + "description": "Controls how closely the generated image should adhere to the prompt; higher values make the image more aligned with the prompt" + }, + "seed": { + "type": "integer", + "minimum": 0, + "description": "Random seed for reproducibility of the image generation" + }, + "height": { + "type": "integer", + "minimum": 0, + "maximum": 2048, + "default": 1024, + "description": "The height of the generated image in pixels" + }, + "width": { + "type": "integer", + "minimum": 0, + "maximum": 2048, + "default": 1024, + "description": "The width of the generated image in pixels" + }, + "num_steps": { + "type": "integer", + "default": 25, + "minimum": 1, + "maximum": 50, + "description": "The number of diffusion steps; higher values can improve quality but take longer" + }, + "negative_prompt": { + "type": "string", + "minLength": 1, + "description": "Specify what to exclude from the generated images" + } + }, + "required": [ + "prompt" + ] + }, + "output": { + "type": "string", + "contentType": "image/jpeg", + "format": "binary", + "description": "The generated image in JPEG format" + } + } +} \ No newline at end of file From 8b7759b840621857deb6576d1c8386a253ef2c6d Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 02:00:02 -0400 Subject: [PATCH 2/8] added pricing to page --- .../docs/workers-ai/platform/pricing.mdx | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/content/docs/workers-ai/platform/pricing.mdx b/src/content/docs/workers-ai/platform/pricing.mdx index 513728353f95de..03d2e7d48e214c 100644 --- a/src/content/docs/workers-ai/platform/pricing.mdx +++ b/src/content/docs/workers-ai/platform/pricing.mdx @@ -66,15 +66,30 @@ The Price in Tokens column is equivalent to the Price in Neurons column - the di | @cf/baai/bge-large-en-v1.5 | $0.204 per M input tokens | 18582 neurons per M input tokens | | @cf/baai/bge-m3 | 
$0.012 per M input tokens | 1075 neurons per M input tokens | -## Other model pricing +## Image model pricing | Model | Price in Tokens | Price in Neurons | | ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ | | @cf/black-forest-labs/flux-1-schnell | $0.0000528 per 512x512 tile
$0.0001056 per step | 4.80 neurons per 512x512 tile
9.60 neurons per step | -| @cf/huggingface/distilbert-sst-2-int8 | $0.026 per M input tokens | 2394 neurons per M input tokens | -| @cf/baai/bge-reranker-base | $0.003 per M input tokens | 283 neurons per M input tokens | -| @cf/meta/m2m100-1.2b | $0.342 per M input tokens
$0.342 per M output tokens | 31050 neurons per M input tokens
31050 neurons per M output tokens | -| @cf/microsoft/resnet-50 | $2.51 per M images | 228055 neurons per M images | +| @cf/leonardo/lucid-origin | $0.006996 per 512x512 tile
$0.000132 per step | 636.00 neurons per 512x512 tile
12.00 neurons per step | +| @cf/leonardo/phoenix-1.0 | $0.005830 per 512x512 tile
$0.000110 per step | 530.00 neurons per 512x512 tile
10.00 neurons per step | + +## Audio model pricing + +| Model | Price in Tokens | Price in Neurons | +| ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ | | @cf/openai/whisper | $0.0005 per audio minute | 41.14 neurons per audio minute | | @cf/openai/whisper-large-v3-turbo | $0.0005 per audio minute | 46.63 neurons per audio minute | | @cf/myshell-ai/melotts | $0.0002 per audio minute | 18.63 neurons per audio minute | +| @cf/deepgram/aura-1 | $0.015 per 1k characters input
| 1.36 neurons per 1k characters input
| +| @cf/deepgram/nova-3 | $0.0052 per audio minute output
| 7.88 neurons per audio minute output
| +| @cf/pipecat-ai/smart-turn-v2 | $0.00033795 per audio minute input
| 0.51 neurons per audio minute input
| + +## Other model pricing + +| Model | Price in Tokens | Price in Neurons | +| ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ | +| @cf/huggingface/distilbert-sst-2-int8 | $0.026 per M input tokens | 2394 neurons per M input tokens | +| @cf/baai/bge-reranker-base | $0.003 per M input tokens | 283 neurons per M input tokens | +| @cf/meta/m2m100-1.2b | $0.342 per M input tokens
$0.342 per M output tokens | 31050 neurons per M input tokens
31050 neurons per M output tokens | +| @cf/microsoft/resnet-50 | $2.51 per M images | 228055 neurons per M images | \ No newline at end of file From 9343934ebd9dc39d1c9a312db403ab69d9449553 Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 02:38:56 -0400 Subject: [PATCH 3/8] changelogs --- .../workers-ai/2025-08-27-partner-models.mdx | 22 +++++++++++++++++++ src/content/release-notes/workers-ai.yaml | 15 +++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 src/content/changelog/workers-ai/2025-08-27-partner-models.mdx diff --git a/src/content/changelog/workers-ai/2025-08-27-partner-models.mdx b/src/content/changelog/workers-ai/2025-08-27-partner-models.mdx new file mode 100644 index 00000000000000..9da20dc2e03ad9 --- /dev/null +++ b/src/content/changelog/workers-ai/2025-08-27-partner-models.mdx @@ -0,0 +1,22 @@ +--- +title: Deepgram and Leonardo partner models now available on Workers AI +description: State-of-the-art TTS, STT and image generation models, hosted on Workers AI infrastructure +products: + - workers-ai +date: 2025-08-27 +--- + +New state-of-the-art models have landed on Workers AI! This time, we're introducing new **partner models** trained by our friends at [Deepgram](https://deepgram.com) and [Leonardo](https://leonardo.ai), hosted on Workers AI infrastructure. + +As well, we're introuding a new turn detection model that enables you to detect when someone is done speaking — useful for building voice agents! 
+ +Read the [blog](https://blog.cloudflare.com/workers-ai-partner-models) for more details and check out some of the new models on our platform: +- [`@cf/deepgram/aura-1`](/workers-ai/models/aura-1) is a text-to-speech model that allows you to input text and have it come to life in a customizable voice +- [`@cf/deepgram/nova-3`](/workers-ai/models/nova-3) is speech-to-text model that transcribes multilingual audio at a blazingly fast speed +- [`@cf/pipecat-ai/smart-turn-v2`](/workers-ai/models/smart-turn-v2) helps you detect when someone is done speaking +- [`@cf/leonardo/lucid-origin`](/workers-ai/models/lucid-origin) is a text-to-image model that generates images with sharp graphic design, stunning full-HD renders, or highly specific creative direction +- [`@cf/leonardo/phoenix-1.0`](/workers-ai/models/phoenix-1.0) is a text-to-image model with exceptional prompt adherence and coherent text + +You can filter out new partner models with the `Partner` capability on our [Models](/workers-ai/models) page. + +As well, we're introducing WebSocket support for some of our audio models, which you can filter though the `Realtime` capability on our [Models](/workers-ai/models) page. WebSockets allows you to create a bi-directional connection to our inference server with low latency — perfect for those that are building voice agents. 
\ No newline at end of file diff --git a/src/content/release-notes/workers-ai.yaml b/src/content/release-notes/workers-ai.yaml index 03cd725e75c1b0..e627cef9bd5a56 100644 --- a/src/content/release-notes/workers-ai.yaml +++ b/src/content/release-notes/workers-ai.yaml @@ -3,6 +3,21 @@ link: "/workers-ai/changelog/" productName: Workers AI productLink: "/workers-ai/" entries: + - publish_date: "2025-08-27" + title: Introducing Partner models to the Workers AI catalog + description: |- + - Read the [blog](https://blog.cloudflare.com/workers-ai-partner-models) for more details + - [`@cf/deepgram/aura-1`](/workers-ai/models/aura-1) is a text-to-speech model that allows you to input text and have it come to life in a customizable voice + - [`@cf/deepgram/nova-3`](/workers-ai/models/nova-3) is speech-to-text model that transcribes multilingual audio at a blazingly fast speed + - [`@cf/pipecat-ai/smart-turn-v2`](/workers-ai/models/smart-turn-v2) helps you detect when someone is done speaking + - [`@cf/leonardo/lucid-origin`](/workers-ai/models/lucid-origin) is a text-to-image model that generates images with sharp graphic design, stunning full-HD renders, or highly specific creative direction + - [`@cf/leonardo/phoenix-1.0`](/workers-ai/models/phoenix-1.0) is a text-to-image model with exceptional prompt adherence and coherent text + - WebSocket support added for audio models like `@cf/deepgram/aura-1`, `@cf/deepgram/nova-3`, `@cf/pipecat-ai/smart-turn-v2` + - publish_date: "2025-08-05" + title: Adding gpt-oss models to our catalog + description: |- + - Check out the [blog](https://blog.cloudflare.com/openai-gpt-oss-on-workers-ai) for more details about the new models + - Take a look at the [`gpt-oss-120b`](/workers-ai/models/gpt-oss-120b) and [`gpt-oss-20b`](/workers-ai/models/gpt-oss-20b) model pages for more information about schemas, pricing, and context windows - publish_date: "2025-04-09" title: Pricing correction for @cf/myshell-ai/melotts description: |- From 
09e7701051f605fb6b1fda6496aea5969481055c Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 02:43:10 -0400 Subject: [PATCH 4/8] code example --- .../workers-ai/2025-08-27-partner-models.mdx | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/content/changelog/workers-ai/2025-08-27-partner-models.mdx b/src/content/changelog/workers-ai/2025-08-27-partner-models.mdx index 9da20dc2e03ad9..21359bd2975b68 100644 --- a/src/content/changelog/workers-ai/2025-08-27-partner-models.mdx +++ b/src/content/changelog/workers-ai/2025-08-27-partner-models.mdx @@ -19,4 +19,39 @@ Read the [blog](https://blog.cloudflare.com/workers-ai-partner-models) for more You can filter out new partner models with the `Partner` capability on our [Models](/workers-ai/models) page. -As well, we're introducing WebSocket support for some of our audio models, which you can filter though the `Realtime` capability on our [Models](/workers-ai/models) page. WebSockets allows you to create a bi-directional connection to our inference server with low latency — perfect for those that are building voice agents. \ No newline at end of file +As well, we're introducing WebSocket support for some of our audio models, which you can filter though the `Realtime` capability on our [Models](/workers-ai/models) page. WebSockets allows you to create a bi-directional connection to our inference server with low latency — perfect for those that are building voice agents. + +An example python snippet on how to use WebSockets with our new Aura model: + +``` +import json +import os +import asyncio +import websockets + +uri = f"wss://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/@cf/deepgram/aura-1" + +input = [ + "Line one, out of three lines that will be provided to the aura model.", + "Line two, out of three lines that will be provided to the aura model.", + "Line three, out of three lines that will be provided to the aura model. 
This is a last line.", +] + + +async def text_to_speech(): + async with websockets.connect(uri, additional_headers={"Authorization": os.getenv("CF_TOKEN")}) as websocket: + print("connection established") + for line in input: + print(f"sending `{line}`") + await websocket.send(json.dumps({"type": "Speak", "text": line})) + + print("line was sent, flushing") + await websocket.send(json.dumps({"type": "Flush"})) + print("flushed, recving") + resp = await websocket.recv() + print(f"response received {resp}") + + +if __name__ == "__main__": + asyncio.run(text_to_speech()) +``` \ No newline at end of file From 2f7dc731cd1b8c081519727d44451430c735c65d Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 02:45:33 -0400 Subject: [PATCH 5/8] trailing comma --- src/content/workers-ai-models/phoenix-1.0.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/content/workers-ai-models/phoenix-1.0.json b/src/content/workers-ai-models/phoenix-1.0.json index 58e02d81e70a71..78888d44125450 100644 --- a/src/content/workers-ai-models/phoenix-1.0.json +++ b/src/content/workers-ai-models/phoenix-1.0.json @@ -29,7 +29,7 @@ { "property_id": "partner", "value": "true" - }, + } ], "schema": { "input": { From 5b9393f0360045d1623d642707a0b3852233643e Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 07:35:19 -0400 Subject: [PATCH 6/8] svg logos --- src/assets/images/workers-ai/deepgram.svg | 1 + src/assets/images/workers-ai/leonardo.svg | 1 + src/components/models/data.ts | 10 ++++++++++ 3 files changed, 12 insertions(+) create mode 100644 src/assets/images/workers-ai/deepgram.svg create mode 100644 src/assets/images/workers-ai/leonardo.svg diff --git a/src/assets/images/workers-ai/deepgram.svg b/src/assets/images/workers-ai/deepgram.svg new file mode 100644 index 00000000000000..06348cc153e5bd --- /dev/null +++ b/src/assets/images/workers-ai/deepgram.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/assets/images/workers-ai/leonardo.svg 
b/src/assets/images/workers-ai/leonardo.svg new file mode 100644 index 00000000000000..7a067ab4d9957a --- /dev/null +++ b/src/assets/images/workers-ai/leonardo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/components/models/data.ts b/src/components/models/data.ts index d3c85fdcce153e..8290245bdb916c 100644 --- a/src/components/models/data.ts +++ b/src/components/models/data.ts @@ -8,6 +8,8 @@ import google from "../../assets/images/workers-ai/google.svg"; import deepseek from "../../assets/images/workers-ai/deepseek.svg"; import qwen from "../../assets/images/workers-ai/qwen.svg"; import blackforestlabs from "../../assets/images/workers-ai/blackforestlabs.svg"; +import deepgram from "../../assets/images/workers-ai/deepgram.svg"; +import leonardo from "../../assets/images/workers-ai/leonardo.svg"; export const authorData: Record = { openai: { @@ -54,4 +56,12 @@ export const authorData: Record = { name: "Black Forest Labs", logo: blackforestlabs.src, }, + "deepgram": { + name: "Deepgram", + logo: deepgram.src, + }, + "leonardo": { + name: "Leonardo", + logo: leonardo.src, + }, }; From 0a00eaf600359db5641942e48839b8d6149c88b7 Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 08:16:15 -0400 Subject: [PATCH 7/8] adding pipecat --- .../workers-ai-models/smart-turn-v2.json | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 src/content/workers-ai-models/smart-turn-v2.json diff --git a/src/content/workers-ai-models/smart-turn-v2.json b/src/content/workers-ai-models/smart-turn-v2.json new file mode 100644 index 00000000000000..fe90b53f827fb9 --- /dev/null +++ b/src/content/workers-ai-models/smart-turn-v2.json @@ -0,0 +1,102 @@ +{ + "id": "fe8904cf-e20e-4884-b829-ed7cec0a01cb", + "source": 1, + "name": "@cf/pipecat-ai/smart-turn-v2", + "description": "An open source, community-driven, native audio turn detection model in 2nd version", + "task": { + "id": "ccb1ca5a-043d-41a7-8a3b-61017b2796fd", + "name": "Voice Activity 
Detection", + "description": "Detecting the presence or absence of human speech, used in speech processing." + }, + "created_at": "2025-08-04 10:08:04.219", + "tags": [], + "properties": [ + { + "property_id": "async_queue", + "value": "true" + }, + { + "property_id": "price", + "value": [ + { + "unit": "per audio minute", + "price": 0.00034, + "currency": "USD" + } + ] + } + ], + "schema": { + "input": { + "type": "object", + "oneOf": [ + { + "properties": { + "audio": { + "type": "object", + "description": "readable stream with audio data and content-type specified for that data", + "properties": { + "body": { + "type": "object" + }, + "contentType": { + "type": "string" + } + }, + "required": [ + "body", + "contentType" + ] + }, + "dtype": { + "type": "string", + "description": "type of data PCM data that's sent to the inference server as raw array", + "enum": [ + "uint8", + "float32", + "float64" + ] + } + }, + "required": [ + "audio" + ] + }, + { + "properties": { + "audio": { + "type": "string", + "description": "base64 encoded audio data" + }, + "dtype": { + "type": "string", + "description": "type of data PCM data that's sent to the inference server as raw array", + "enum": [ + "uint8", + "float32", + "float64" + ] + } + }, + "required": [ + "audio" + ] + } + ] + }, + "output": { + "type": "object", + "contentType": "application/json", + "properties": { + "is_complete": { + "type": "boolean", + "description": "if true, end-of-turn was detected" + }, + "probability": { + "type": "number", + "description": "probability of the end-of-turn detection" + } + } + } + } +} \ No newline at end of file From 467cf80e1e3869f1f520d89eeef789bda6e2fb6c Mon Sep 17 00:00:00 2001 From: mchen Date: Wed, 27 Aug 2025 08:20:51 -0400 Subject: [PATCH 8/8] prettier --- src/components/models/data.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/components/models/data.ts b/src/components/models/data.ts index 8290245bdb916c..6ea493e7532f15 100644 --- 
a/src/components/models/data.ts +++ b/src/components/models/data.ts @@ -56,11 +56,11 @@ export const authorData: Record = { name: "Black Forest Labs", logo: blackforestlabs.src, }, - "deepgram": { + deepgram: { name: "Deepgram", logo: deepgram.src, }, - "leonardo": { + leonardo: { name: "Leonardo", logo: leonardo.src, },