##### Copyright 2025 Google LLC.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Audio understanding with Hugging Face Transformers

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/google-gemini/gemma-cookbook/blob/main/Gemma/[Gemma_3n]Audio_understanding_with_HF.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

Gemma 3n is a generative AI model optimized for use in everyday devices, such as phones, laptops, and tablets. This tutorial shows you how to get started running Gemma 3n with Hugging Face Transformers, using audio input to generate text content. The Transformers Python library provides an API for accessing pre-trained generative AI models, including Gemma. For more information, see the [Transformers](https://huggingface.co/docs/transformers/en/index) documentation.

## Setup

Before starting this tutorial, complete the following steps:

* Get access to Gemma by logging into [Hugging Face](https://huggingface.co/google/gemma-3n-E4b-it) and selecting **Acknowledge license** for a Gemma model.
* Select a Colab runtime with sufficient resources to run
  the Gemma model size you want to run. [Learn more](https://ai.google.dev/gemma/docs/core#sizes).
* Generate a Hugging Face [Access Token](https://huggingface.co/docs/hub/en/security-tokens#how-to-manage-user-access-token) and use it to log in from Colab.

This notebook runs on an NVIDIA T4 GPU when using Gemma 3n E2B.\
To use Gemma 3n E4B instead, select an L4 or A100 runtime.

In [None]:
# Log in to the Hugging Face Hub. `notebook_login()` is called with no
# arguments here, so no token is hardcoded in the notebook.
from huggingface_hub import notebook_login
notebook_login()

### Install Python packages

Install the Hugging Face libraries required for running the Gemma model and making requests.

In [None]:
# Install a transformers version that supports Gemma 3n (>= 4.53)
!pip install "transformers>=4.53.0"

## Define formatting helper functions

Create a chat helper to manage and display the conversations.

In [23]:
import torch

# Hugging Face model ID to load. The trailing `#@param` comment is a Colab form
# annotation listing the selectable values.
GEMMA_PATH = "google/gemma-3n-E2B-it" #@param ["google/gemma-3n-E2B-it", "google/gemma-3n-E4B-it"]
# Base URL that the sample audio file names used later in this notebook are
# appended to.
RESOURCE_URL_PREFIX = "https://raw.githubusercontent.com/google-gemini/gemma-cookbook/refs/heads/main/Demos/sample-data/"

from IPython.display import Audio, Image, Markdown, display

class ChatState:
  """Holds a multi-turn conversation with a model and renders each turn.

  Every call to `send_message` sends the whole stored history plus the new
  user message to the model, stores the reply, and displays both in the
  notebook.
  """

  def __init__(self, model, processor):
    """Creates an empty conversation bound to `model` and `processor`.

    Args:
      model: Object providing `generate(...)`, `device` and `dtype`.
      processor: Object providing `apply_chat_template(...)` and
        `batch_decode(...)`.
    """
    self.model = model
    self.processor = processor
    # Chat-template message dicts (user and assistant turns) in order.
    self.history = []

  def send_message(self, message, max_tokens=256):
    """Sends one user message, records the reply, and displays the exchange.

    Args:
      message: Chat-template message dict. Its 'content' list holds items
        whose 'type' is 'text', 'audio' or 'image'.
      max_tokens: Maximum number of new tokens to generate.

    The history is only updated once generation has succeeded, so a failed or
    interrupted call can simply be re-run without duplicating the user turn.
    """
    # Build the prompt from a new list; self.history stays untouched until
    # the reply is available.
    conversation = self.history + [message]

    inputs = self.processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Length of the prompt, used below to strip it from the generated ids.
    prompt_len = inputs["input_ids"].shape[-1]

    # Use this instance's model (not a notebook-level global named `model`)
    # to decide the target device and dtype.
    inputs = inputs.to(self.model.device, dtype=self.model.dtype)
    outputs = self.model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        disable_compile=True
    )
    decoded = self.processor.batch_decode(
        outputs[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    reply = decoded[0]

    # Commit both turns together now that generation has succeeded.
    self.history.append(message)
    self.history.append({
        "role": "assistant",
        "content": [
            {"type": "text", "text": reply},
        ]
    })

    self._display_turn(message, reply)

  def _display_turn(self, message, reply):
    """Renders the parts of the user message, then the model reply."""
    for item in message['content']:
      if item['type'] == 'text':
        formatted_prompt = "<font size='+1' color='brown'>🙋‍♂️<blockquote>\n" + item['text'] + "\n</blockquote></font>"
        display(Markdown(formatted_prompt))
      elif item['type'] == 'audio':
        display(Audio(item['audio']))
      elif item['type'] == 'image':
        display(Image(item['image']))

    formatted_text = "<font size='+1' color='teal'>🤖<blockquote>\n" + reply + "\n</blockquote></font>"
    display(Markdown(formatted_text))


## Load Model

In [4]:
from transformers import AutoModelForImageTextToText, AutoProcessor

# Load the processor and the model weights for the checkpoint selected in
# GEMMA_PATH; dtype and device placement are both left on "auto".
processor = AutoProcessor.from_pretrained(GEMMA_PATH)
model = AutoModelForImageTextToText.from_pretrained(
    GEMMA_PATH,
    torch_dtype="auto",
    device_map="auto",
)

# Report where the model ended up and in which precision.
for label, value in (("Device", model.device), ("DType", model.dtype)):
    print(f"{label}: {value}")

processor_config.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/769 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/159k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Device: cuda:0
DType: torch.bfloat16


## Audio-in: Shopping Buddy

This section demonstrates how to leverage Gemma 3n's audio-in capability with your shopping experience. Speak your shopping needs, and Gemma 3n will act as your intelligent assistant.

```
shopping1.wav
"Add one milk, three apples, and one tofu to the shopping cart."

shopping2.wav
"No apples, change it to four bananas."

shopping3.wav
"Add one dozen eggs."
```

In [25]:
# One user turn per recorded clip; each turn carries a single audio part.
shopping_clips = ["shopping1.wav", "shopping2.wav", "shopping3.wav"]
messages = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": f"{RESOURCE_URL_PREFIX}{clip}"},
        ],
    }
    for clip in shopping_clips
]
# Only the first turn carries the text instruction, placed before its audio.
messages[0]["content"].insert(
    0, {"type": "text", "text": "itemize it into a shopping list."}
)

# Start a fresh conversation and send the first turn.
chat = ChatState(model, processor)
chat.send_message(messages[0])

<font size='+1' color='brown'>🙋‍♂️<blockquote>
itemize it into a shopping list.
</blockquote></font>

<font size='+1' color='teal'>🤖<blockquote>
Here's your shopping list:

* 1 milk
* 3 apples
* 1 tofu
</blockquote></font>

In [26]:
# Second user turn: contains only the shopping2.wav clip. It goes through the
# same `chat` object, which still holds the earlier turns in its history.
chat.send_message(messages[1])

<font size='+1' color='teal'>🤖<blockquote>
Here's the updated shopping list:

* 1 milk
* 4 bananas
* 1 tofu
</blockquote></font>

In [27]:
# Third user turn: contains only the shopping3.wav clip, again sent through
# the same `chat` object so the previous turns are part of the prompt.
chat.send_message(messages[2])

<font size='+1' color='teal'>🤖<blockquote>
Here's the updated shopping list:

* 1 milk
* 4 bananas
* 1 tofu
* 1 dozen eggs
</blockquote></font>

In [28]:
# Dump the conversation as plain text: the role, then each content part
# (text verbatim, audio as its file name), followed by a separator line.
separator = "-" * 80
for turn in chat.history:
  print(f"{turn['role']}:")
  for part in turn['content']:
    kind = part['type']
    if kind == 'text':
      print(part['text'])
    elif kind == 'audio':
      # keep only what follows the last '/' of the audio location
      file_name = part['audio'].rsplit('/', 1)[-1]
      print(file_name)
  print(separator)


user:
itemize it into a shopping list.
shopping1.wav
--------------------------------------------------------------------------------
assistant:
Here's your shopping list:

* 1 milk
* 3 apples
* 1 tofu
--------------------------------------------------------------------------------
user:
shopping2.wav
--------------------------------------------------------------------------------
assistant:
Here's the updated shopping list:

* 1 milk
* 4 bananas
* 1 tofu
--------------------------------------------------------------------------------
user:
shopping3.wav
--------------------------------------------------------------------------------
assistant:
Here's the updated shopping list:

* 1 milk
* 4 bananas
* 1 tofu
* 1 dozen eggs
--------------------------------------------------------------------------------


In [14]:
# Clear the stored conversation so the shopping turns are no longer part of
# this `chat` object's history.
chat.history = []

## Audio-in: Journal Enhancer

This section demonstrates how to leverage multiple audio files for personal journaling. Just speak your thoughts and reflections throughout the day, and Gemma 3n will then intelligently summarize your daily entries, providing a concise overview.

```
journal1.wav
"Woke up early today, feeling refreshed. The morning light was beautiful, and I enjoyed a nice cup of coffee."

journal2.wav
"Spent the afternoon at the park. It was a perfect day for a walk, and I even spotted a few cherry blossoms."

journal3.wav
"Finished the day with a good book. Feeling grateful for the simple moments. Ready for tomorrow."

journal4.wav
"Just got back from work. The city lights looked amazing tonight, a really clear view from the train."

journal5.wav
"Had a great lunch with my old friend. It was good to catch up and laugh so much. Made my day."
```

In [18]:
# Five journal clips followed by the text instruction, all in one user turn.
journal_parts = [
    {"type": "audio", "audio": f"{RESOURCE_URL_PREFIX}journal{index}.wav"}
    for index in range(1, 6)
]
journal_parts.append(
    {"type": "text", "text": "Give me a concise overview of these audio."}
)
prompt = {"role": "user", "content": journal_parts}

# Use a new conversation so no earlier turns are included.
chat = ChatState(model, processor)
chat.send_message(prompt)

Output hidden; open in https://colab.research.google.com to view.

## Automatic Speech Translation / Automatic Speech Recognition

Try it yourself: record a short clip of your own voice, then have Gemma 3n transcribe or translate it.

In [None]:
!pip install ipywebrtc

Collecting ipywebrtc
  Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl.metadata (825 bytes)
Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/260.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/260.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.7/260.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ipywebrtc
Successfully installed ipywebrtc-0.6.0


Press the circle button and start speaking. Click the circle button again when you are finished. The widget will immediately begin to play back what it captured.

In [None]:
# Colab-specific step: turn on the custom widget manager before showing the
# recorder (presumably required for third-party widgets such as ipywebrtc to
# render in Colab; confirm against the Colab documentation).
from google.colab import output
output.enable_custom_widget_manager()

from ipywebrtc import AudioRecorder, CameraStream

# Request an audio-only stream (video disabled) and attach a recorder to it.
camera = CameraStream(constraints={'audio': True,'video':False})
recorder = AudioRecorder(stream=camera)
# Last expression of the cell, so the recorder widget is displayed.
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

Convert the recorded WebM file to WAV, a format that PyTorch can understand.

In [None]:
with open('/content/recording.webm', 'wb') as f:
    f.write(recorder.audio.value)
!ffmpeg -i /content/recording.webm /content/recording.wav -y

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

### ASR

In [None]:
# Single user turn: the recorded clip followed by the transcription request.
prompt = dict(
  role="user",
  content=[
    dict(type="audio", audio="/content/recording.wav"),
    dict(type="text", text="Transcribe this audio into English."),
  ],
)
# New conversation, so no earlier turns are included.
chat = ChatState(model, processor)
chat.send_message(prompt)

<font size='+1' color='brown'>🙋‍♂️<blockquote>
Transcribe this audio into English.
</blockquote></font>

<font size='+1' color='teal'>🤖<blockquote>
How do I get to the train station?
</blockquote></font>

### AST

In [None]:
# Single user turn: the recorded clip followed by a request to transcribe it
# and then translate the transcription.
recording_part = dict(type="audio", audio="/content/recording.wav")
instruction_part = dict(
  type="text",
  text="Transcribe this audio into English, and then translate it into French.",
)
prompt = dict(role="user", content=[recording_part, instruction_part])
# New conversation, so no earlier turns are included.
chat = ChatState(model, processor)
chat.send_message(prompt)

<font size='+1' color='brown'>🙋‍♂️<blockquote>
Transcribe this audio into English, and then translate it into French.
</blockquote></font>

<font size='+1' color='teal'>🤖<blockquote>
"How do I get to the train station?"

**French:**
"Comment puis-je aller à la gare?"
</blockquote></font>

## Next steps

Build and explore more with Gemma models:

* [Fine-tune Gemma for text tasks using Hugging Face Transformers](https://ai.google.dev/gemma/docs/core/huggingface_text_finetune_qlora)
* [Fine-tune Gemma for vision tasks using Hugging Face Transformers](https://ai.google.dev/gemma/docs/core/huggingface_vision_finetune_qlora)
* [Perform distributed fine-tuning and inference on Gemma models](https://ai.google.dev/gemma/docs/core/distributed_tuning)
* [Use Gemma open models with Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/open-models/use-gemma)
* [Fine-tune Gemma using Keras and deploy to Vertex AI](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma_kerasnlp_to_vertexai.ipynb)