In [None]:
%pip install --quiet websockets


In [None]:
import sys

# Colab-only step: when the google.colab package is already loaded in this
# kernel, run Colab's user-authentication flow. In any other environment the
# branch is skipped and nothing is imported.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [None]:
# Project and region; both are interpolated into the model resource name
# (MODEL) that is sent in the session setup message.
# The trailing `# @param` markers are Colab form annotations - keep them on
# the same line as the assignment.
PROJECT_ID = "visionai-testing-stable"  # @param {type: "string"}

LOCATION = "us-central1"  # @param {type: "string"}

# API host to connect to. Known endpoints, one per line:
# us-central1-autopush-aiplatform.sandbox.googleapis.com
# us-central1-staging-aiplatform.sandbox.googleapis.com
# us-central1-aiplatform.googleapis.com
# NOTE(review): every host listed above is pinned to us-central1, so changing
# LOCATION alone does not change the host - confirm the two stay in sync.
HOST = "us-central1-autopush-aiplatform.sandbox.googleapis.com" # @param {type: "string"}

# Secure-WebSocket URL of the bidirectional BidiGenerateContent method on HOST.
SERVICE_URL = f"wss://{HOST}/ws/google.cloud.aiplatform.v1beta1.LlmBidiService/BidiGenerateContent"

In [None]:
# Run gcloud in a shell and capture its output lines in `bearer_token`.
# The result is a list; element 0 is later used as the Bearer token in the
# Authorization header. Treat the value as a secret.
bearer_token = !gcloud auth application-default print-access-token

In [None]:
# Confirm a token was fetched WITHOUT echoing it. Cell outputs are saved with
# the notebook, so displaying the raw value would leak a live credential to
# anyone who can read the file.
if not bearer_token or not bearer_token[0].strip():
    raise RuntimeError(
        "gcloud returned no access token; authenticate "
        "(`gcloud auth application-default login`) and re-run the previous cell."
    )
print(f"Access token acquired ({len(bearer_token[0])} characters, value hidden).")

['<redacted access token>']

In [None]:
# Short model identifier (Colab form field via `# @param`).
MODEL_ID = "gemini-live-2.5-flash-native-audio-exp"  # @param {type: "string"}

# Full resource name of the publisher model, assembled from PROJECT_ID,
# LOCATION and MODEL_ID; this is the value sent as "model" in the setup message.
MODEL = (
    f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{MODEL_ID}"
)

In [None]:
import base64
import io
import json
import wave

from IPython.display import Audio, Markdown, display
import numpy as np
from websockets.asyncio.client import connect
import websockets

In [None]:
# Mount Google Drive so files under /content/drive can be read.
# Guarded the same way as the authentication cell: the google.colab package
# only exists inside Colab, and an unconditional import raises ImportError
# everywhere else. Outside Colab, point the sample path in the next cell at a
# local file instead.
if "google.colab" in sys.modules:
    from google.colab import drive

    drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read wav file
with open("/content/drive/MyDrive/test_audio/output_24khz.wav", "rb") as f:
#with open("/content/drive/MyDrive/Projects/LiveAPI/Trump_WEF_2018.mp3", "rb") as f:
    wav_data = f.read()

# PCM S16LE

print(f"Successfully read {len(wav_data)} bytes from from the input wav file.")

# Set model generation_config
CONFIG = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        # Encode wav_data to Base64
        "voice_config": { "replicated_voice_config": {"voice_sample_audio": base64.b64encode(wav_data).decode('utf-8'), "mime_type": "audio/pcm;rate=24000"}},
        # "voice_config": { "prebuilt_voice_config": {"voice_name": 'charon'}},
        "language_code": "en-US",
    },
}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {bearer_token[0]}",
}

# Connect to the server
async with connect(SERVICE_URL, additional_headers=headers) as ws:
    # Setup the session
    await ws.send(
        json.dumps(
            {
                "setup": {
                    # "system_instruction": {"parts": [{"text": "??"}]},
                    "model": MODEL,
                    "generation_config": CONFIG,
                    "output_audio_transcription": {},
                    "realtime_input_config": {
                      "automatic_activity_detection": {
                          # "silence_duration_ms": 1,
                          # "start_of_speech_sensitivity_threshold": 0.9,
                      }
                  }
                }
            }
        )
    )

    # Receive setup response
    raw_response = await ws.recv(decode=False)
    setup_response = json.loads(raw_response.decode("ascii"))
    print(setup_response)

    # Send text message
    text_input = "Say the following sentence: Numerous references to the phrase have occurred in movies, television, books, video games, advertising, websites, and graphic arts."
    display(Markdown(f"**Input:** {text_input}"))

    msg = {
        "client_content": {
            "turns": [{"role": "user", "parts": [{"text": text_input}]}],
            "turn_complete": True,
        }
    }

    await ws.send(json.dumps(msg))

    responses = []

    # Receive chucks of server response
    async for raw_response in ws:
        response = json.loads(raw_response.decode())
        server_content = response.pop("serverContent", None)
        if server_content is None:
            break
        output_transcription = server_content.pop("outputTranscription", None)
        if output_transcription is not None:
          raw_text = output_transcription.pop("text", None)
          if raw_text is not None:
            display(Markdown(f"**Response >** {raw_text}"))

        model_turn = server_content.pop("modelTurn", None)
        if model_turn is not None:
            parts = model_turn.pop("parts", None)
            if parts is not None:
                for part in parts:
                    pcm_data = base64.b64decode(part["inlineData"]["data"])
                    responses.append(np.frombuffer(pcm_data, dtype=np.int16))


        # End of turn
        turn_complete = server_content.pop("turnComplete", None)
        if turn_complete:
            break
    try:
      await ws.close(code=1000, reason="Normal closure") #example close
      print(f"Close Code: {ws.close_code}")
      print(f"Close Reason: {ws.close_reason}")
      print(f"Close Reason: {ws.state}")
    except websockets.exceptions.ConnectionClosed as e:
      print(f"Connection closed by exception, code: {e.code}, reason: {e.reason}")
    except Exception as e:
      print(f"An unexpected error occured: {e}")

    # Play the returned audio message
    if (len(responses) > 0):
      display(Audio(np.concatenate(responses), rate=24000, autoplay=True))
    else:
      display("00")

Successfully read 320078 bytes from from the input wav file.
{'setupComplete': {'sessionId': '8808fac2-ffd3-4e37-888b-325975a65a90'}}


**Input:** Say the following sentence: Numerous references to the phrase have occurred in movies, television, books, video games, advertising, websites, and graphic arts.

**Response >** Numerous

**Response >**  references

**Response >**  to

**Response >**  the

**Response >**  phrase

**Response >**  have

**Response >**  occurred

**Response >**  in

**Response >**  movies,

**Response >**  television,

**Response >**  books,

**Response >**  video

**Response >**  games,

**Response >**  advertising,

**Response >**  websites,

**Response >**  and

**Response >**  graphic

**Response >**  arts.

Close Code: 1000
Close Reason: 
Close Reason: 3
