*Copyright 2024 Google LLC.*

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Live API - Websockets Quickstart

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/google-gemini/cookbook/blob/main/gemini-2/websockets/live_api_starter.ipynb"><img src="../../images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

This notebook demonstrates simple usage of the Gemini Live API.

This notebook connects directly to the API websockets to demonstrate the low-level details for anyone building without using an SDK.

- If you are not interested in the low-level websocket details you should read the [SDK version of this notebook](../../quickstarts/Get_started_LiveAPI.ipynb).

This notebook implements a simple turn-based chat where you send messages as text, and the model replies with audio. The API is capable of much more than that. The goal here is to **demonstrate with simple code**.

- The [Live API - Text to Text](../../quickstarts/Get_started_LiveAPI.ipynb) notebook is even simpler than this, as it doesn't deal with audio.
- The [Live API - Audio Streaming in Colab](./live_api_streaming_in_colab.ipynb) demonstrates streaming audio **in Colab**.<br> It's more _fun_ than this notebook but **not optimized for readability**.
- The [Live API Audio Video to Audio python script](./live_api_starter.py) doesn't work in colab, but provides a relatively readable implementation of audio and video streaming.

## Setup

### Install and import

In [None]:
!pip install -q websockets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/168.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m163.8/168.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.2/168.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import asyncio
import base64
import contextlib
import datetime
import os
import json
import wave
import itertools

from websockets.asyncio.client import connect
from IPython.display import display, Audio

### Constants

To run the following cell, your API key must be stored in a Colab Secret named `GOOGLE_API_KEY`. If you don't already have an API key, or you're not sure how to create a Colab Secret, see [Authentication](../../quickstarts/Authentication.ipynb) for an example.

In [None]:
# Read the key from the Colab Secret named `GOOGLE_API_KEY` and export it as an
# environment variable, which is where the `URI` constant below looks it up.
from google.colab import userdata
os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')

The Multimodal Live API is a new capability introduced with the [Gemini 2.0](https://ai.google.dev/gemini-api/docs/models/gemini-v2) model, so it only works with that model. You need to use the `v1alpha` API version, which is part of the websocket URI below.


In [None]:
# Model that serves the session; it is sent in the `setup` message.
MODEL = 'models/gemini-2.0-flash-exp'

# Host name of the API endpoint.
HOST = 'generativelanguage.googleapis.com'

# Websocket endpoint of the `v1alpha` BidiGenerateContent service. The API key
# is read from the environment and passed as the `key` query parameter.
URI = (
    f'wss://{HOST}/ws/'
    'google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent'
    f'?key={os.environ["GOOGLE_API_KEY"]}'
)

### Logging

Uncomment the `logger.setLevel` call to show the log messages

In [None]:
import logging

# Named logger used for the debug and error messages emitted by the code below.
logger = logging.getLogger('Bidi')
# Uncomment the next line to show the debug messages.
#logger.setLevel('DEBUG')

### Wave file writer

The code in this section is not essential for understanding the API, feel free to skip to the next section.

The simplest way to play back the audio in Colab is to write it out to a `.wav` file.

In [None]:
@contextlib.contextmanager
def wave_file(filename, channels=1, rate=24000, sample_width=2):
    """Open a `.wav` file for writing and yield the configured writer.

    The writer is closed when the `with` block exits, whether or not the
    block raised.

    Args:
        filename: Target passed to `wave.open` (opened in "wb" mode).
        channels: Number of audio channels.
        rate: Frame rate in frames per second.
        sample_width: Sample width in bytes.

    Yields:
        The `wave` writer object, ready for `writeframes` calls.
    """
    # NOTE(review): the defaults presumably match the audio format the Live API
    # returns (mono, 24 kHz, 16-bit) - confirm against the API documentation.
    writer = wave.open(filename, "wb")
    try:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(rate)
        yield writer
    finally:
        writer.close()

## Main audio loop

The class below implements the interaction with the Live API.

In [None]:
class AudioLoop:
  """Minimal turn-based chat client for the Live API over a raw websocket.

  Each turn reads one line of text from the user, sends it to the model, and
  collects the model's audio reply into a numbered `.wav` file that is then
  played back in the notebook.

  Attributes:
    tools: List of tool declarations sent in the `setup` message.
    ws: The open websocket connection; `None` until `run` connects.
    index: Number of reply `.wav` files started so far; used to name the
      next file.
  """

  def __init__(self, tools=None):
    # Use a fresh list per instance instead of a shared mutable default.
    self.tools = [] if tools is None else tools
    self.ws = None
    self.index = 0

  async def run(self):
    """Connects, performs the setup handshake, then alternates send/recv.

    Returns once `send` reports that the user asked to quit.
    """
    print("Type 'q' to quit")

    logger.debug('connect')
    async with connect(URI, additional_headers={'Content-Type': 'application/json'}) as ws:
      self.ws = ws
      await self.setup()

      while True:
        # Ideally these would be separate tasks.
        if not await self.send():
          break
        await self.recv()

  async def setup(self):
    """Sends the `setup` message (model and tools) and awaits the reply."""
    logger.debug("set_up")
    await self.ws.send(json.dumps({
        'setup': {
            'model': MODEL,
            'tools': self.tools,
        }
    }))
    raw_response = await self.ws.recv(decode=False)
    # `json.loads` accepts UTF-8 bytes directly; decoding as ASCII first would
    # fail on any non-ASCII character in the reply.
    setup_response = json.loads(raw_response)
    logger.debug(f'Connected: {setup_response}')

  async def send(self):
    """Reads one line of user input and sends it as a complete user turn.

    Returns:
      False if the user typed 'q' (case-insensitive) to quit, otherwise True
      after the message has been sent.
    """
    logger.debug('send')
    # `asyncio.to_thread` is important here, without it all other tasks are blocked.
    text = await asyncio.to_thread(input, "message > ")

    # If the input returns 'q' quit.
    if text.lower() == 'q':
      return False

    # Wrap the text into a "client_content" message.
    msg = {
        "client_content": {
            "turns": [{
                "role": "user",
                "parts": [{"text": text}]
            }],
            'turn_complete': True
        }
    }

    # Send the message to the model.
    await self.ws.send(json.dumps(msg))
    logger.debug('sent')
    return True

  @staticmethod
  def _audio_chunks(server_content):
    """Decodes the payload of every inline-data part in one server message.

    Args:
      server_content: The `serverContent` dict of a server message.

    Returns:
      A list of `bytes`, one entry per part that carries `inlineData`, in
      order. Empty when there is no `modelTurn` or no such part.
    """
    model_turn = server_content.get('modelTurn') or {}
    chunks = []
    for part in model_turn.get('parts') or []:
      inline_data = part.get('inlineData')
      if inline_data is None:
        # Parts without inline data (for example text) carry no audio.
        continue
      # Assumes every inline-data part is PCM audio, as the reply is written
      # straight into the `.wav` file.
      chunks.append(base64.b64decode(inline_data['data']))
    return chunks

  async def recv(self):
    """Collects one model turn of audio into a `.wav` file and plays it.

    Reading stops when the server reports `turnComplete`, or when a message
    without `serverContent` arrives. Such messages (for example `toolCall`)
    are not handled by this simple loop; they are logged as errors.
    """
    # Start a new `.wav` file.
    file_name = f"audio_{self.index}.wav"
    with wave_file(file_name) as wav:
      self.index += 1

      logger.debug('receive')

      # Read chunks from the socket.
      async for raw_response in self.ws:
        # Frames can arrive as `bytes` or `str`; `json.loads` accepts both.
        response = json.loads(raw_response)
        logger.debug(f'got chunk: {str(response)[:200]}')

        server_content = response.pop('serverContent', None)
        if server_content is None:
          logger.error(f'Unhandled server message! - {response}')
          break

        # Write every audio chunk of this message to the `.wav` file.
        for pcm_data in self._audio_chunks(server_content):
          print('.', end='')
          logger.debug('Got pcm_data')
          wav.writeframes(pcm_data)

        # Break out of the loop if the model's turn is complete.
        if server_content.pop('turnComplete', None):
          logger.debug('turn_complete')
          break

    display(Audio(file_name, autoplay=True))
    await asyncio.sleep(2)


There are 4 methods worth describing here:

### `run` - The main loop

This method:

- Opens a `websocket` connecting to the Live API
- Calls the initial `setup` method
- Then enters the main loop where it alternates between `send` and `recv` until send returns `False`.

### `setup` - Initial setup

The `setup` method sends the `setup` message, and awaits the response. You shouldn't try to `send` or `recv` anything else from the model until you've gotten the model's `setup_complete` response.

The `setup` message (a `BidiGenerateContentSetup` object) is where you can set the `model`, `generation_config`, `system_instructions`, `tools` and `safety_settings`.

### `send` - Sends input text to the api

The `send` method collects input text from the user, wraps it in a `client_content` message (an instance of `BidiGenerateContentClientContent`), and sends it to the model.

### `recv` - Collects audio from the API and plays it

The `recv` method collects audio chunks in a loop and writes them to a `.wav` file. It breaks out of the loop once the model sends a `turn_complete` message, and then plays the audio.

To keep things simple in Colab it collects **all** the audio before playing it. The [Live API starter script](./live_api_starter.py) demonstrates how to play audio as soon as you start to receive it (using `PyAudio`), and how to interrupt the model (implement input and audio playback on separate tasks).

## Run

### Example 1: simple usage with Google Search

In [None]:
tools = [
  {'google_search': {}},
]

await AudioLoop(tools=tools).run()

Type 'q' to quit
.........................................................................

.................................................

...................

............................................

.........................................

### Example 2: function calling

In [None]:
tools = [
  {'function_declarations': [{'name': 'turn_on_the_lights', 'description': None}, {'name': 'turn_off_the_lights', 'description': None}]}
]

await AudioLoop(tools=tools).run()

Type 'q' to quit
message > make it dark 
.....

ERROR:Bidi:Unhandled server message! - {'toolCall': {'functionCalls': [{'name': 'turn_off_the_lights', 'args': {}, 'id': 'function-call-1061054218196002212'}]}}


In [None]:
tools = [
  {'google_search': {}},
  {'function_declarations': [{'name': 'turn_on_the_lights', 'description': None}, {'name': 'turn_off_the_lights', 'description': None}]}
]

await AudioLoop(tools=tools).run()

In [None]:
tools = [
  {'code_execution': {}}
]

await AudioLoop(tools=tools).run()

## Next steps

<a name="next_steps"></a>

This tutorial just shows basic usage of the Live API, connecting to the websockets directly without an SDK.

- If you aren't looking for code, and just want to try multimedia streaming use [Live API in Google AI Studio](https://aistudio.google.com/app/live).
- If you want to see how to set up streaming interruptible audio and video using the Live API and the SDK see the [Audio and Video input Tutorial](../../quickstarts/Get_started_LiveAPI.py).
- Try the [Tool use in the live API tutorial](../../quickstarts/Get_started_LiveAPI_tools.ipynb) for a walkthrough of Gemini 2.0's new tool use capabilities.
- There is a [Streaming audio in Colab example](../../gemini-2/websockets/live_api_streaming_in_colab.ipynb), but this is more of a **demo**, it's **not optimized for readability**.
- Other nice Gemini 2.0 examples can also be found in the [Cookbook](https://github.com/google-gemini/cookbook/tree/main/gemini-2/).
