From 6049c5bfb8ea0410f6267496e41e3c61e696044e Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Fri, 20 Mar 2026 17:56:51 +0900 Subject: [PATCH] docs: multispeaker is only for s2-pro --- api-reference/openapi.json | 4 ++-- developer-guide/models-pricing/models-overview.mdx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/api-reference/openapi.json b/api-reference/openapi.json index d9dae74..0324dc4 100644 --- a/api-reference/openapi.json +++ b/api-reference/openapi.json @@ -2185,7 +2185,7 @@ "type": "null" } ], - "description": "Inline voice references for zero-shot cloning. Requires MessagePack (not JSON). For single speaker, provide an array of ReferenceAudio objects. For multiple speakers, provide an array of arrays where each inner array contains references for one speaker. The speaker index corresponds to the index in reference_id array. Example for multi-speaker: [[{audio, text}], [{audio, text}, {audio, text}]] for 2 speakers where speaker 1 has 2 reference samples.", + "description": "Inline voice references for zero-shot cloning. Requires MessagePack (not JSON). For single speaker, provide an array of ReferenceAudio objects. For multiple speakers, provide an array of arrays where each inner array contains references for one speaker. **Multi-speaker is only available with the S2-Pro model.** The speaker index corresponds to the index in reference_id array. Example for multi-speaker: [[{audio, text}], [{audio, text}, {audio, text}]] for 2 speakers where speaker 1 has 2 reference samples.", "title": "References" }, "reference_id": { @@ -2206,7 +2206,7 @@ } ], "default": null, - "description": "Voice model ID(s) from Fish Audio library or your custom models. For single speaker synthesis, provide a string. For multi-speaker synthesis (e.g., dialogue), provide an array of model IDs. When using multiple speakers, use speaker tags in your text like [0] and [1] to indicate which speaker should speak each part. Example: '[0]Hello![1]Hi there![0]How are you?' with reference_id: ['speaker-a-id', 'speaker-b-id']", + "description": "Voice model ID(s) from Fish Audio library or your custom models. For single speaker synthesis, provide a string. For multi-speaker synthesis (e.g., dialogue), provide an array of model IDs. **Multi-speaker is only available with the S2-Pro model.** When using multiple speakers, use speaker tags in your text like [0] and [1] to indicate which speaker should speak each part. Example: '[0]Hello![1]Hi there![0]How are you?' with reference_id: ['speaker-a-id', 'speaker-b-id']", "title": "Reference Id" }, "prosody": { diff --git a/developer-guide/models-pricing/models-overview.mdx b/developer-guide/models-pricing/models-overview.mdx index 0bdf3d7..6825825 100644 --- a/developer-guide/models-pricing/models-overview.mdx +++ b/developer-guide/models-pricing/models-overview.mdx @@ -52,7 +52,7 @@ Fish Audio offers state-of-the-art text-to-speech models optimized for different **Fish Audio S2-Pro** - Our next-generation TTS model with best-in-class performance - Natural language control with `[bracket]` syntax — not limited to a fixed set (e.g., `[whispers sweetly]`, `[laughing nervously]`) - - Multi-speaker dialogue support + - Multi-speaker dialogue support **(S2-Pro exclusive)** - 80+ languages - 100ms time-to-first-audio - Full SGLang-based serving stack