diff --git a/.changeset/dull-ligers-bow.md b/.changeset/dull-ligers-bow.md new file mode 100644 index 00000000000..13606d73f9f --- /dev/null +++ b/.changeset/dull-ligers-bow.md @@ -0,0 +1,7 @@ +--- +'firebase': minor +'@firebase/ai': minor +--- + +Deprecate `sendMediaChunks()` and `sendMediaStream()`. Instead, use the new methods added to the `LiveSession` class. +Add `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` to the `LiveSession` class. diff --git a/.changeset/fast-rocks-sing.md b/.changeset/fast-rocks-sing.md new file mode 100644 index 00000000000..58c9a01c410 --- /dev/null +++ b/.changeset/fast-rocks-sing.md @@ -0,0 +1,5 @@ +--- +'@firebase/ai': minor +--- + +Add support for audio transcriptions in the Live API. diff --git a/common/api-review/ai.api.md b/common/api-review/ai.api.md index f3113e0ac22..a13fcb9cdf9 100644 --- a/common/api-review/ai.api.md +++ b/common/api-review/ai.api.md @@ -92,6 +92,10 @@ export interface AudioConversationController { stop: () => Promise; } +// @public +export interface AudioTranscriptionConfig { +} + // @public export abstract class Backend { protected constructor(type: BackendType); @@ -922,7 +926,9 @@ export interface LanguageModelPromptOptions { // @beta export interface LiveGenerationConfig { frequencyPenalty?: number; + inputAudioTranscription?: AudioTranscriptionConfig; maxOutputTokens?: number; + outputAudioTranscription?: AudioTranscriptionConfig; presencePenalty?: number; responseModalities?: ResponseModality[]; speechConfig?: SpeechConfig; @@ -975,8 +981,10 @@ export type LiveResponseType = (typeof LiveResponseType)[keyof typeof LiveRespon // @beta export interface LiveServerContent { + inputTranscription?: Transcription; interrupted?: boolean; modelTurn?: Content; + outputTranscription?: Transcription; turnComplete?: boolean; // (undocumented) type: 'serverContent'; @@ -1005,9 +1013,14 @@ export class LiveSession { isClosed: boolean; receive(): AsyncGenerator; send(request: string | Array, turnComplete?: boolean): Promise; + sendAudioRealtime(blob: GenerativeContentBlob): Promise; sendFunctionResponses(functionResponses: FunctionResponse[]): Promise; + // @deprecated sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise; + // @deprecated (undocumented) sendMediaStream(mediaChunkStream: ReadableStream): Promise; + sendTextRealtime(text: string): Promise; + sendVideoRealtime(blob: GenerativeContentBlob): Promise; } // @public @@ -1337,6 +1350,11 @@ export interface ToolConfig { functionCallingConfig?: FunctionCallingConfig; } +// @beta +export interface Transcription { + text?: string; +} + // @public export type TypedSchema = IntegerSchema | NumberSchema | StringSchema | BooleanSchema | ObjectSchema | ArraySchema | AnyOfSchema; diff --git a/docs-devsite/_toc.yaml b/docs-devsite/_toc.yaml index 04d65f6c333..4f3bb1f3ca4 100644 --- a/docs-devsite/_toc.yaml +++ b/docs-devsite/_toc.yaml @@ -18,6 +18,8 @@ toc: path: /docs/reference/js/ai.arrayschema.md - title: AudioConversationController path: /docs/reference/js/ai.audioconversationcontroller.md + - title: AudioTranscriptionConfig + path: /docs/reference/js/ai.audiotranscriptionconfig.md - title: Backend path: /docs/reference/js/ai.backend.md - title: BaseParams @@ -202,6 +204,8 @@ toc: path: /docs/reference/js/ai.thinkingconfig.md - title: ToolConfig path: /docs/reference/js/ai.toolconfig.md + - title: Transcription + path: /docs/reference/js/ai.transcription.md - title: URLContext path: /docs/reference/js/ai.urlcontext.md - title: URLContextMetadata path: /docs/reference/js/ai.urlcontextmetadata.md diff --git
a/docs-devsite/ai.audiotranscriptionconfig.md b/docs-devsite/ai.audiotranscriptionconfig.md new file mode 100644 index 00000000000..ff53c9061ea --- /dev/null +++ b/docs-devsite/ai.audiotranscriptionconfig.md @@ -0,0 +1,19 @@ +Project: /docs/reference/js/_project.yaml +Book: /docs/reference/_book.yaml +page_type: reference + +{% comment %} +DO NOT EDIT THIS FILE! +This is generated by the JS SDK team, and any local changes will be +overwritten. Changes should be made in the source code at +https://github.com/firebase/firebase-js-sdk +{% endcomment %} + +# AudioTranscriptionConfig interface +The audio transcription configuration. + +Signature: + +```typescript +export interface AudioTranscriptionConfig +``` diff --git a/docs-devsite/ai.livegenerationconfig.md b/docs-devsite/ai.livegenerationconfig.md index 1a920afa1e7..2e842a34313 100644 --- a/docs-devsite/ai.livegenerationconfig.md +++ b/docs-devsite/ai.livegenerationconfig.md @@ -26,7 +26,9 @@ export interface LiveGenerationConfig | Property | Type | Description | | --- | --- | --- | | [frequencyPenalty](./ai.livegenerationconfig.md#livegenerationconfigfrequencypenalty) | number | (Public Preview) Frequency penalties. | +| [inputAudioTranscription](./ai.livegenerationconfig.md#livegenerationconfiginputaudiotranscription) | [AudioTranscriptionConfig](./ai.audiotranscriptionconfig.md#audiotranscriptionconfig_interface) | (Public Preview) Enables transcription of audio input.When enabled, the model will respond with transcriptions of your audio input in the inputTranscription property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if you ask the model "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?". | | [maxOutputTokens](./ai.livegenerationconfig.md#livegenerationconfigmaxoutputtokens) | number | (Public Preview) Specifies the maximum number of tokens that can be generated in the response. The number of tokens per word varies depending on the language outputted. Is unbounded by default. | +| [outputAudioTranscription](./ai.livegenerationconfig.md#livegenerationconfigoutputaudiotranscription) | [AudioTranscriptionConfig](./ai.audiotranscriptionconfig.md#audiotranscriptionconfig_interface) | (Public Preview) Enables transcription of audio output.When enabled, the model will respond with transcriptions of its audio output in the outputTranscription property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if the model says "How are you today?", the model may transcribe that output across three messages, broken up as "How a", "re yo", "u today?". | | [presencePenalty](./ai.livegenerationconfig.md#livegenerationconfigpresencepenalty) | number | (Public Preview) Positive penalties. | | [responseModalities](./ai.livegenerationconfig.md#livegenerationconfigresponsemodalities) | [ResponseModality](./ai.md#responsemodality)\[\] | (Public Preview) The modalities of the response. | | [speechConfig](./ai.livegenerationconfig.md#livegenerationconfigspeechconfig) | [SpeechConfig](./ai.speechconfig.md#speechconfig_interface) | (Public Preview) Configuration for speech synthesis. | @@ -47,6 +49,21 @@ Frequency penalties.
frequencyPenalty?: number; ``` +## LiveGenerationConfig.inputAudioTranscription + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Enables transcription of audio input. + +When enabled, the model will respond with transcriptions of your audio input in the `inputTranscription` property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if you ask the model "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?". + +Signature: + +```typescript +inputAudioTranscription?: AudioTranscriptionConfig; +``` + ## LiveGenerationConfig.maxOutputTokens > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. > @@ -60,6 +77,21 @@ Specifies the maximum number of tokens that can be generated in the response. Th maxOutputTokens?: number; ``` +## LiveGenerationConfig.outputAudioTranscription + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Enables transcription of audio output. + +When enabled, the model will respond with transcriptions of its audio output in the `outputTranscription` property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if the model says "How are you today?", the model may transcribe that output across three messages, broken up as "How a", "re yo", "u today?". + +Signature: + +```typescript +outputAudioTranscription?: AudioTranscriptionConfig; +``` + ## LiveGenerationConfig.presencePenalty > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. diff --git a/docs-devsite/ai.liveservercontent.md b/docs-devsite/ai.liveservercontent.md index f9c3ca1de79..6162601b8c2 100644 --- a/docs-devsite/ai.liveservercontent.md +++ b/docs-devsite/ai.liveservercontent.md @@ -25,11 +25,26 @@ export interface LiveServerContent | Property | Type | Description | | --- | --- | --- | +| [inputTranscription](./ai.liveservercontent.md#liveservercontentinputtranscription) | [Transcription](./ai.transcription.md#transcription_interface) | (Public Preview) Transcription of the audio that was input to the model. | | [interrupted](./ai.liveservercontent.md#liveservercontentinterrupted) | boolean | (Public Preview) Indicates whether the model was interrupted by the client. An interruption occurs when the client sends a message before the model finishes it's turn. This is undefined if the model was not interrupted. | | [modelTurn](./ai.liveservercontent.md#liveservercontentmodelturn) | [Content](./ai.content.md#content_interface) | (Public Preview) The content that the model has generated as part of the current conversation with the user. | +| [outputTranscription](./ai.liveservercontent.md#liveservercontentoutputtranscription) | [Transcription](./ai.transcription.md#transcription_interface) | (Public Preview) Transcription of the audio output from the model.
| | [turnComplete](./ai.liveservercontent.md#liveservercontentturncomplete) | boolean | (Public Preview) Indicates whether the turn is complete. This is undefined if the turn is not complete. | | [type](./ai.liveservercontent.md#liveservercontenttype) | 'serverContent' | (Public Preview) | +## LiveServerContent.inputTranscription + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Transcription of the audio that was input to the model. + +Signature: + +```typescript +inputTranscription?: Transcription; +``` + ## LiveServerContent.interrupted > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. @@ -56,6 +71,19 @@ The content that the model has generated as part of the current conversation wit modelTurn?: Content; ``` +## LiveServerContent.outputTranscription + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Transcription of the audio output from the model. + +Signature: + +```typescript +outputTranscription?: Transcription; +``` + ## LiveServerContent.turnComplete > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. diff --git a/docs-devsite/ai.livesession.md b/docs-devsite/ai.livesession.md index 558c5eb3bd6..2f05fbc924b 100644 --- a/docs-devsite/ai.livesession.md +++ b/docs-devsite/ai.livesession.md @@ -39,9 +39,12 @@ export declare class LiveSession | [close()](./ai.livesession.md#livesessionclose) | | (Public Preview) Closes this session. All methods on this session will throw an error once this resolves. | | [receive()](./ai.livesession.md#livesessionreceive) | | (Public Preview) Yields messages received from the server. This can only be used by one consumer at a time. | | [send(request, turnComplete)](./ai.livesession.md#livesessionsend) | | (Public Preview) Sends content to the server. | +| [sendAudioRealtime(blob)](./ai.livesession.md#livesessionsendaudiorealtime) | | (Public Preview) Sends audio data to the server in realtime. | | [sendFunctionResponses(functionResponses)](./ai.livesession.md#livesessionsendfunctionresponses) | | (Public Preview) Sends function responses to the server. | | [sendMediaChunks(mediaChunks)](./ai.livesession.md#livesessionsendmediachunks) | | (Public Preview) Sends realtime input to the server. | -| [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | (Public Preview) Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). | +| [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | (Public Preview) | +| [sendTextRealtime(text)](./ai.livesession.md#livesessionsendtextrealtime) | | (Public Preview) Sends text to the server in realtime. | +| [sendVideoRealtime(blob)](./ai.livesession.md#livesessionsendvideorealtime) | | (Public Preview) Sends video data to the server in realtime. | ## LiveSession.inConversation @@ -135,6 +138,45 @@ Promise<void> If this session has been closed. +## LiveSession.sendAudioRealtime() + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Sends audio data to the server in realtime. 
+ +The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz little-endian. + +Signature: + +```typescript +sendAudioRealtime(blob: GenerativeContentBlob): Promise; +``` + +#### Parameters + +| Parameter | Type | Description | +| --- | --- | --- | +| blob | [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface) | The base64-encoded PCM data to send to the server in realtime. | + +Returns: + +Promise<void> + +#### Exceptions + +If this session has been closed. + +### Example + + +```javascript +// const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian. +const blob = { mimeType: "audio/pcm", data: pcmData }; +liveSession.sendAudioRealtime(blob); + +``` + ## LiveSession.sendFunctionResponses() > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. @@ -167,6 +209,11 @@ If this session has been closed. > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. > +> Warning: This API is now obsolete. +> +> Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. +> + Sends realtime input to the server. Signature: @@ -194,7 +241,12 @@ If this session has been closed. > This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. > -Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). +> Warning: This API is now obsolete. +> +> Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. +> +> Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface). +> Signature: @@ -216,3 +268,77 @@ Promise<void> If this session has been closed. +## LiveSession.sendTextRealtime() + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Sends text to the server in realtime. + +Signature: + +```typescript +sendTextRealtime(text: string): Promise; +``` + +#### Parameters + +| Parameter | Type | Description | +| --- | --- | --- | +| text | string | The text data to send. | + +Returns: + +Promise<void> + +#### Exceptions + +If this session has been closed. + +### Example + + +```javascript +liveSession.sendTextRealtime("Hello, how are you?"); + +``` + +## LiveSession.sendVideoRealtime() + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Sends video data to the server in realtime. + +The server requires that the video is sent as individual video frames at 1 FPS. It is recommended to set `mimeType` to `image/jpeg`. + +Signature: + +```typescript +sendVideoRealtime(blob: GenerativeContentBlob): Promise; +``` + +#### Parameters + +| Parameter | Type | Description | +| --- | --- | --- | +| blob | [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface) | The base64-encoded video data to send to the server in realtime. | + +Returns: + +Promise<void> + +#### Exceptions + +If this session has been closed. + +### Example + + +```javascript +// const videoFrame = ... 
base64-encoded JPEG data +const blob = { mimeType: "image/jpeg", data: videoFrame }; +liveSession.sendVideoRealtime(blob); + +``` + diff --git a/docs-devsite/ai.md b/docs-devsite/ai.md index fabdbc5cc55..79902cab4e7 100644 --- a/docs-devsite/ai.md +++ b/docs-devsite/ai.md @@ -56,6 +56,7 @@ The Firebase AI Web SDK. | [AI](./ai.ai.md#ai_interface) | An instance of the Firebase AI SDK.Do not create this instance directly. Instead, use [getAI()](./ai.md#getai_a94a413). | | [AIOptions](./ai.aioptions.md#aioptions_interface) | Options for initializing the AI service using [getAI()](./ai.md#getai_a94a413). This allows specifying which backend to use (Vertex AI Gemini API or Gemini Developer API) and configuring its specific options (like location for Vertex AI). | | [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface) | (Public Preview) A controller for managing an active audio conversation. | +| [AudioTranscriptionConfig](./ai.audiotranscriptionconfig.md#audiotranscriptionconfig_interface) | The audio transcription configuration. | | [BaseParams](./ai.baseparams.md#baseparams_interface) | Base parameters for a number of methods. | | [ChromeAdapter](./ai.chromeadapter.md#chromeadapter_interface) | (Public Preview) Defines an inference "backend" that uses Chrome's on-device model, and encapsulates logic for detecting when on-device inference is possible.These methods should not be called directly by the user. | | [Citation](./ai.citation.md#citation_interface) | A single citation. | @@ -134,6 +135,7 @@ The Firebase AI Web SDK. | [TextPart](./ai.textpart.md#textpart_interface) | Content part interface if the part represents a text string. | | [ThinkingConfig](./ai.thinkingconfig.md#thinkingconfig_interface) | Configuration for "thinking" behavior of compatible Gemini models.Certain models utilize a thinking process before generating a response. This allows them to reason through complex problems and plan a more coherent and accurate answer. | | [ToolConfig](./ai.toolconfig.md#toolconfig_interface) | Tool config. This config is shared for all tools provided in the request. | +| [Transcription](./ai.transcription.md#transcription_interface) | (Public Preview) Transcription of audio. This can be returned from a [LiveGenerativeModel](./ai.livegenerativemodel.md#livegenerativemodel_class) if transcription is enabled with the inputAudioTranscription or outputAudioTranscription properties on the [LiveGenerationConfig](./ai.livegenerationconfig.md#livegenerationconfig_interface). | | [URLContext](./ai.urlcontext.md#urlcontext_interface) | (Public Preview) Specifies the URL Context configuration. | | [URLContextMetadata](./ai.urlcontextmetadata.md#urlcontextmetadata_interface) | (Public Preview) Metadata related to [URLContextTool](./ai.urlcontexttool.md#urlcontexttool_interface). | | [URLContextTool](./ai.urlcontexttool.md#urlcontexttool_interface) | (Public Preview) A tool that allows you to provide additional context to the models in the form of public web URLs. By including URLs in your request, the Gemini model will access the content from those pages to inform and enhance its response. | diff --git a/docs-devsite/ai.transcription.md b/docs-devsite/ai.transcription.md new file mode 100644 index 00000000000..7ab6a360d5e --- /dev/null +++ b/docs-devsite/ai.transcription.md @@ -0,0 +1,41 @@ +Project: /docs/reference/js/_project.yaml +Book: /docs/reference/_book.yaml +page_type: reference + +{% comment %} +DO NOT EDIT THIS FILE! 
+This is generated by the JS SDK team, and any local changes will be +overwritten. Changes should be made in the source code at +https://github.com/firebase/firebase-js-sdk +{% endcomment %} + +# Transcription interface +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +Transcription of audio. This can be returned from a [LiveGenerativeModel](./ai.livegenerativemodel.md#livegenerativemodel_class) if transcription is enabled with the `inputAudioTranscription` or `outputAudioTranscription` properties on the [LiveGenerationConfig](./ai.livegenerationconfig.md#livegenerationconfig_interface). + +Signature: + +```typescript +export interface Transcription +``` + +## Properties + +| Property | Type | Description | +| --- | --- | --- | +| [text](./ai.transcription.md#transcriptiontext) | string | (Public Preview) The text transcription of the audio. | + +## Transcription.text + +> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment. +> + +The text transcription of the audio. + +Signature: + +```typescript +text?: string; +``` diff --git a/packages/ai/integration/live.test.ts b/packages/ai/integration/live.test.ts index caa18970ab7..6b50fe65222 100644 --- a/packages/ai/integration/live.test.ts +++ b/packages/ai/integration/live.test.ts @@ -154,6 +154,45 @@ describe('Live', function () { }); }); + describe('sendTextRealtime()', () => { + it('should send a single text chunk and receive a response', async () => { + const model = getLiveGenerativeModel(testConfig.ai, { + model: testConfig.model, + generationConfig: textLiveGenerationConfig + }); + const session = await model.connect(); + const responsePromise = nextTurnText(session.receive()); + + await session.sendTextRealtime('Are you an AI? Yes or No.'); + + const responseText = await responsePromise; + expect(responseText).to.include('Yes'); + + await session.close(); + }); + }); + + describe('sendAudioRealtime()', () => { + it('should send a single audio chunk and receive a response', async () => { + const model = getLiveGenerativeModel(testConfig.ai, { + model: testConfig.model, + generationConfig: textLiveGenerationConfig + }); + const session = await model.connect(); + const responsePromise = nextTurnText(session.receive()); + + await session.sendAudioRealtime({ + data: HELLO_AUDIO_PCM_BASE64, // "Hey, can you hear me?" 
+ mimeType: 'audio/pcm' + }); + + const responseText = await responsePromise; + expect(responseText).to.include('Yes'); + + await session.close(); + }); + }); + describe('sendMediaChunks()', () => { it('should send a single audio chunk and receive a response', async () => { const model = getLiveGenerativeModel(testConfig.ai, { @@ -231,6 +270,56 @@ describe('Live', function () { }); }); + describe('Transcripts', async () => { + it('should receive transcript of audio input', async () => { + const model = getLiveGenerativeModel(testConfig.ai, { + model: testConfig.model, + generationConfig: { + responseModalities: [ResponseModality.AUDIO], + inputAudioTranscription: {}, + outputAudioTranscription: {} + } + }); + const session = await model.connect(); + const stream = session.receive(); + + await session.sendAudioRealtime({ + data: HELLO_AUDIO_PCM_BASE64, + mimeType: 'audio/pcm' + }); + + let aggregatedInputTranscription = ''; + let aggregatedOutputTranscription = ''; + let result = await stream.next(); + while (!result.done) { + const chunk = result.value as + | LiveServerContent + | LiveServerToolCall + | LiveServerToolCallCancellation; + if (chunk.type === 'serverContent') { + if (chunk.turnComplete) { + break; + } + + if (chunk.inputTranscription) { + aggregatedInputTranscription += chunk.inputTranscription?.text; + } + if (chunk.outputTranscription) { + aggregatedOutputTranscription += + chunk.outputTranscription?.text; + } + } + + result = await stream.next(); + } + + expect(aggregatedInputTranscription).to.not.be.empty; + expect(aggregatedOutputTranscription).to.not.be.empty; + + await session.close(); + }); + }); + /** * These tests are currently very unreliable. Their behavior seems to change frequently. * Skipping them for now. diff --git a/packages/ai/src/methods/live-session-helpers.test.ts b/packages/ai/src/methods/live-session-helpers.test.ts index cad0475b358..a62315c701d 100644 --- a/packages/ai/src/methods/live-session-helpers.test.ts +++ b/packages/ai/src/methods/live-session-helpers.test.ts @@ -65,7 +65,7 @@ class MockLiveSession { isClosed = false; inConversation = false; send = sinon.stub(); - sendMediaChunks = sinon.stub(); + sendAudioRealtime = sinon.stub(); sendFunctionResponses = sinon.stub(); messageGenerator = new MockMessageGenerator(); receive = (): MockMessageGenerator => this.messageGenerator; @@ -226,8 +226,8 @@ describe('Audio Conversation Helpers', () => { await clock.tickAsync(1); - expect(liveSession.sendMediaChunks).to.have.been.calledOnce; - const [sentChunk] = liveSession.sendMediaChunks.getCall(0).args[0]; + expect(liveSession.sendAudioRealtime).to.have.been.calledOnce; + const sentChunk = liveSession.sendAudioRealtime.getCall(0).args[0]; expect(sentChunk.mimeType).to.equal('audio/pcm'); expect(sentChunk.data).to.be.a('string'); await controller.stop(); diff --git a/packages/ai/src/methods/live-session-helpers.ts b/packages/ai/src/methods/live-session-helpers.ts index b3907d6219b..cb3be493f5d 100644 --- a/packages/ai/src/methods/live-session-helpers.ts +++ b/packages/ai/src/methods/live-session-helpers.ts @@ -184,7 +184,7 @@ export class AudioConversationRunner { mimeType: 'audio/pcm', data: base64 }; - void this.liveSession.sendMediaChunks([chunk]); + void this.liveSession.sendAudioRealtime(chunk); }; } diff --git a/packages/ai/src/methods/live-session.test.ts b/packages/ai/src/methods/live-session.test.ts index 7454b1208c9..428e92ec770 100644 --- a/packages/ai/src/methods/live-session.test.ts +++ b/packages/ai/src/methods/live-session.test.ts @@ -110,6 
+110,42 @@ describe('LiveSession', () => { }); }); + describe('sendTextRealtime()', () => { + it('should send a correctly formatted realtimeInput message', async () => { + const text = 'foo'; + await session.sendTextRealtime(text); + expect(mockHandler.send).to.have.been.calledOnce; + const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]); + expect(sentData).to.deep.equal({ + realtimeInput: { text } + }); + }); + }); + + describe('sendAudioRealtime()', () => { + it('should send a correctly formatted realtimeInput message', async () => { + const blob = { data: 'abcdef', mimeType: 'audio/pcm' }; + await session.sendAudioRealtime(blob); + expect(mockHandler.send).to.have.been.calledOnce; + const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]); + expect(sentData).to.deep.equal({ + realtimeInput: { audio: blob } + }); + }); + }); + + describe('sendVideoRealtime()', () => { + it('should send a correctly formatted realtimeInput message', async () => { + const blob = { data: 'abcdef', mimeType: 'image/jpeg' }; + await session.sendVideoRealtime(blob); + expect(mockHandler.send).to.have.been.calledOnce; + const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]); + expect(sentData).to.deep.equal({ + realtimeInput: { video: blob } + }); + }); + }); + describe('sendMediaChunks()', () => { it('should send a correctly formatted realtimeInput message', async () => { const chunks = [{ data: 'base64', mimeType: 'audio/webm' }]; diff --git a/packages/ai/src/methods/live-session.ts b/packages/ai/src/methods/live-session.ts index 92d325e2f0d..1db5e3d4dd4 100644 --- a/packages/ai/src/methods/live-session.ts +++ b/packages/ai/src/methods/live-session.ts @@ -96,14 +96,19 @@ export class LiveSession { } /** - * Sends realtime input to the server. + * Sends text to the server in realtime. * - * @param mediaChunks - The media chunks to send. + * @example + * ```javascript + * liveSession.sendTextRealtime("Hello, how are you?"); + * ``` + * + * @param text - The text data to send. * @throws If this session has been closed. * * @beta */ - async sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise { + async sendTextRealtime(text: string): Promise { if (this.isClosed) { throw new AIError( AIErrorCode.REQUEST_ERROR, @@ -111,27 +116,33 @@ export class LiveSession { ); } - // The backend does not support sending more than one mediaChunk in one message. - // Work around this limitation by sending mediaChunks in separate messages. - mediaChunks.forEach(mediaChunk => { - const message: _LiveClientRealtimeInput = { - realtimeInput: { mediaChunks: [mediaChunk] } - }; - this.webSocketHandler.send(JSON.stringify(message)); - }); + const message: _LiveClientRealtimeInput = { + realtimeInput: { + text + } + }; + this.webSocketHandler.send(JSON.stringify(message)); } /** - * Sends function responses to the server. + * Sends audio data to the server in realtime. * - * @param functionResponses - The function responses to send. + * @remarks The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz + * little-endian. + * + * @example + * ```javascript + * // const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian. + * const blob = { mimeType: "audio/pcm", data: pcmData }; + * liveSession.sendAudioRealtime(blob); + * ``` + * + * @param blob - The base64-encoded PCM data to send to the server in realtime. * @throws If this session has been closed. 
* * @beta */ - async sendFunctionResponses( - functionResponses: FunctionResponse[] - ): Promise { + async sendAudioRealtime(blob: GenerativeContentBlob): Promise { if (this.isClosed) { throw new AIError( AIErrorCode.REQUEST_ERROR, @@ -139,25 +150,32 @@ export class LiveSession { ); } - const message: _LiveClientToolResponse = { - toolResponse: { - functionResponses + const message: _LiveClientRealtimeInput = { + realtimeInput: { + audio: blob } }; this.webSocketHandler.send(JSON.stringify(message)); } /** - * Sends a stream of {@link GenerativeContentBlob}. + * Sends video data to the server in realtime. * - * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send. + * @remarks The server requires that the video is sent as individual video frames at 1 FPS. It + * is recommended to set `mimeType` to `image/jpeg`. + * + * @example + * ```javascript + * // const videoFrame = ... base64-encoded JPEG data + * const blob = { mimeType: "image/jpeg", data: videoFrame }; + * liveSession.sendVideoRealtime(blob); + * ``` + * @param blob - The base64-encoded video data to send to the server in realtime. * @throws If this session has been closed. * * @beta */ - async sendMediaStream( - mediaChunkStream: ReadableStream - ): Promise { + async sendVideoRealtime(blob: GenerativeContentBlob): Promise { if (this.isClosed) { throw new AIError( AIErrorCode.REQUEST_ERROR, @@ -165,25 +183,38 @@ export class LiveSession { ); } - const reader = mediaChunkStream.getReader(); - while (true) { - try { - const { done, value } = await reader.read(); + const message: _LiveClientRealtimeInput = { + realtimeInput: { + video: blob + } + }; + this.webSocketHandler.send(JSON.stringify(message)); + } - if (done) { - break; - } else if (!value) { - throw new Error('Missing chunk in reader, but reader is not done.'); - } + /** + * Sends function responses to the server. + * + * @param functionResponses - The function responses to send. + * @throws If this session has been closed. + * + * @beta + */ + async sendFunctionResponses( + functionResponses: FunctionResponse[] + ): Promise { + if (this.isClosed) { + throw new AIError( + AIErrorCode.REQUEST_ERROR, + 'This LiveSession has been closed and cannot be used.' + ); + } - await this.sendMediaChunks([value]); - } catch (e) { - // Re-throw any errors that occur during stream consumption or sending. - const message = - e instanceof Error ? e.message : 'Error processing media stream.'; - throw new AIError(AIErrorCode.REQUEST_ERROR, message); + const message: _LiveClientToolResponse = { + toolResponse: { + functionResponses } - } + }; + this.webSocketHandler.send(JSON.stringify(message)); } /** @@ -259,4 +290,73 @@ export class LiveSession { await this.webSocketHandler.close(1000, 'Client closed session.'); } } + + /** + * Sends realtime input to the server. + * + * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. + * + * @param mediaChunks - The media chunks to send. + * @throws If this session has been closed. + * + * @beta + */ + async sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise { + if (this.isClosed) { + throw new AIError( + AIErrorCode.REQUEST_ERROR, + 'This LiveSession has been closed and cannot be used.' + ); + } + + // The backend does not support sending more than one mediaChunk in one message. + // Work around this limitation by sending mediaChunks in separate messages. 
+ mediaChunks.forEach(mediaChunk => { + const message: _LiveClientRealtimeInput = { + realtimeInput: { mediaChunks: [mediaChunk] } + }; + this.webSocketHandler.send(JSON.stringify(message)); + }); + } + + /** + * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead. + * + * Sends a stream of {@link GenerativeContentBlob}. + * + * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send. + * @throws If this session has been closed. + * + * @beta + */ + async sendMediaStream( + mediaChunkStream: ReadableStream + ): Promise { + if (this.isClosed) { + throw new AIError( + AIErrorCode.REQUEST_ERROR, + 'This LiveSession has been closed and cannot be used.' + ); + } + + const reader = mediaChunkStream.getReader(); + while (true) { + try { + const { done, value } = await reader.read(); + + if (done) { + break; + } else if (!value) { + throw new Error('Missing chunk in reader, but reader is not done.'); + } + + await this.sendMediaChunks([value]); + } catch (e) { + // Re-throw any errors that occur during stream consumption or sending. + const message = + e instanceof Error ? e.message : 'Error processing media stream.'; + throw new AIError(AIErrorCode.REQUEST_ERROR, message); + } + } + } } diff --git a/packages/ai/src/models/live-generative-model.test.ts b/packages/ai/src/models/live-generative-model.test.ts index 495f340b846..a899e0b39fa 100644 --- a/packages/ai/src/models/live-generative-model.test.ts +++ b/packages/ai/src/models/live-generative-model.test.ts @@ -168,4 +168,35 @@ describe('LiveGenerativeModel', () => { mockHandler.simulateServerMessage({ setupComplete: true }); await connectPromise; }); + it('connect() should deconstruct generationConfig to send transcription configs in top level setup', async () => { + const model = new LiveGenerativeModel( + fakeAI, + { + model: 'gemini-pro', + generationConfig: { + temperature: 0.8, + inputAudioTranscription: {}, + outputAudioTranscription: {} + }, + systemInstruction: { role: 'system', parts: [{ text: 'Be a pirate' }] } + }, + mockHandler + ); + const connectPromise = model.connect(); + + // Wait for setup message + await clock.runAllAsync(); + + const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]); + // inputAudioTranscription and outputAudioTranscription should be at the top-level setup message, + // rather than in the generationConfig. + expect(sentData.setup.generationConfig).to.deep.equal({ temperature: 0.8 }); + expect(sentData.setup.inputAudioTranscription).to.deep.equal({}); + expect(sentData.setup.outputAudioTranscription).to.deep.equal({}); + expect(sentData.setup.systemInstruction.parts[0].text).to.equal( + 'Be a pirate' + ); + mockHandler.simulateServerMessage({ setupComplete: true }); + await connectPromise; + }); }); diff --git a/packages/ai/src/models/live-generative-model.ts b/packages/ai/src/models/live-generative-model.ts index 251df095202..a89921070e3 100644 --- a/packages/ai/src/models/live-generative-model.ts +++ b/packages/ai/src/models/live-generative-model.ts @@ -86,13 +86,23 @@ export class LiveGenerativeModel extends AIModel { fullModelPath = `projects/${this._apiSettings.project}/locations/${this._apiSettings.location}/${this.model}`; } + // inputAudioTranscription and outputAudioTranscription are on the generation config in the public API, + // but the backend expects them to be in the `setup` message. 
+ const { + inputAudioTranscription, + outputAudioTranscription, + ...generationConfig + } = this.generationConfig; + const setupMessage: _LiveClientSetup = { setup: { model: fullModelPath, - generationConfig: this.generationConfig, + generationConfig, tools: this.tools, toolConfig: this.toolConfig, - systemInstruction: this.systemInstruction + systemInstruction: this.systemInstruction, + inputAudioTranscription, + outputAudioTranscription } }; diff --git a/packages/ai/src/types/live-responses.ts b/packages/ai/src/types/live-responses.ts index d1870fa109f..3bdb32c1269 100644 --- a/packages/ai/src/types/live-responses.ts +++ b/packages/ai/src/types/live-responses.ts @@ -21,7 +21,13 @@ import { GenerativeContentBlob, Part } from './content'; -import { LiveGenerationConfig, Tool, ToolConfig } from './requests'; +import { + AudioTranscriptionConfig, + LiveGenerationConfig, + Tool, + ToolConfig +} from './requests'; +import { Transcription } from './responses'; /** * User input that is sent to the model. @@ -33,6 +39,8 @@ export interface _LiveClientContent { clientContent: { turns: [Content]; turnComplete: boolean; + inputTranscription?: Transcription; + outputTranscription?: Transcription; }; } @@ -44,7 +52,14 @@ export interface _LiveClientContent { // eslint-disable-next-line @typescript-eslint/naming-convention export interface _LiveClientRealtimeInput { realtimeInput: { - mediaChunks: GenerativeContentBlob[]; + text?: string; + audio?: GenerativeContentBlob; + video?: GenerativeContentBlob; + + /** + * @deprecated Use `text`, `audio`, and `video` instead. + */ + mediaChunks?: GenerativeContentBlob[]; }; } @@ -67,9 +82,22 @@ export interface _LiveClientToolResponse { export interface _LiveClientSetup { setup: { model: string; - generationConfig?: LiveGenerationConfig; + generationConfig?: _LiveGenerationConfig; tools?: Tool[]; toolConfig?: ToolConfig; systemInstruction?: string | Part | Content; + inputAudioTranscription?: AudioTranscriptionConfig; + outputAudioTranscription?: AudioTranscriptionConfig; }; } + +/** + * The Live Generation Config. + * + * The public API ({@link LiveGenerationConfig}) has `inputAudioTranscription` and `outputAudioTranscription`, + * but the server expects these fields to be in the top-level `setup` message. This was a conscious API decision. + */ +export type _LiveGenerationConfig = Omit< + LiveGenerationConfig, + 'inputAudioTranscription' | 'outputAudioTranscription' +>; diff --git a/packages/ai/src/types/requests.ts b/packages/ai/src/types/requests.ts index 1e5fa367420..6e5d2147686 100644 --- a/packages/ai/src/types/requests.ts +++ b/packages/ai/src/types/requests.ts @@ -184,6 +184,24 @@ export interface LiveGenerationConfig { * The modalities of the response. */ responseModalities?: ResponseModality[]; + /** + * Enables transcription of audio input. + * + * When enabled, the model will respond with transcriptions of your audio input in the `inputTranscription` property + * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across + * messages, so you may only receive small amounts of text per message. For example, if you ask the model + * "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?". + */ + inputAudioTranscription?: AudioTranscriptionConfig; + /** + * Enables transcription of audio output.
+ * + * When enabled, the model will respond with transcriptions of its audio output in the `outputTranscription` property + * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across + * messages, so you may only receive small amounts of text per message. For example, if the model says + * "How are you today?", the model may transcribe that output across three messages, broken up as "How a", "re yo", "u today?". + */ + outputAudioTranscription?: AudioTranscriptionConfig; } /** @@ -478,3 +496,8 @@ export interface SpeechConfig { */ voiceConfig?: VoiceConfig; } + +/** + * The audio transcription configuration. + */ +export interface AudioTranscriptionConfig {} diff --git a/packages/ai/src/types/responses.ts b/packages/ai/src/types/responses.ts index ec06592f90a..be56d0d2baa 100644 --- a/packages/ai/src/types/responses.ts +++ b/packages/ai/src/types/responses.ts @@ -553,6 +553,29 @@ export interface LiveServerContent { * model was not interrupted. */ interrupted?: boolean; + /** + * Transcription of the audio that was input to the model. + */ + inputTranscription?: Transcription; + /** + * Transcription of the audio output from the model. + */ + outputTranscription?: Transcription; +} + +/** + * Transcription of audio. This can be returned from a {@link LiveGenerativeModel} if transcription + * is enabled with the `inputAudioTranscription` or `outputAudioTranscription` properties on + * the {@link LiveGenerationConfig}. + * + * @beta + */ + +export interface Transcription { + /** + * The text transcription of the audio. + */ + text?: string; } /**
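For reference, a minimal end-to-end sketch of the realtime input and transcription APIs added in this diff. The Firebase app config and the model name are assumptions and error handling is omitted; the calls themselves (`getLiveGenerativeModel`, `connect()`, `sendAudioRealtime()`, `receive()`) follow the signatures shown above.

```typescript
import { initializeApp } from 'firebase/app';
import { getAI, getLiveGenerativeModel, ResponseModality } from 'firebase/ai';

const app = initializeApp({ /* your Firebase config */ });
const ai = getAI(app);

async function askWithTranscripts(pcmBase64: string): Promise<void> {
  const model = getLiveGenerativeModel(ai, {
    model: 'gemini-live-model-name', // assumed placeholder; use a Live API-capable model
    generationConfig: {
      responseModalities: [ResponseModality.AUDIO],
      inputAudioTranscription: {},  // transcribe what the user says
      outputAudioTranscription: {}  // transcribe what the model says
    }
  });
  const session = await model.connect();

  // One chunk of base64-encoded 16-bit PCM at 16kHz, little-endian.
  await session.sendAudioRealtime({ mimeType: 'audio/pcm', data: pcmBase64 });

  let inputTranscript = '';
  let outputTranscript = '';
  for await (const message of session.receive()) {
    if (message.type === 'serverContent') {
      // Transcription fragments arrive incrementally; aggregate them.
      inputTranscript += message.inputTranscription?.text ?? '';
      outputTranscript += message.outputTranscription?.text ?? '';
      if (message.turnComplete) {
        break;
      }
    }
  }

  console.log('You said:', inputTranscript);
  console.log('Model said:', outputTranscript);
  await session.close();
}
```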