##### Copyright 2025 Google LLC.

In [29]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a target="_blank" href="https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/prompting/Zero_shot_prompting.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" height=30/></a>

# Podcast and Audio Transcription with Gemini AI

## Advanced Audio-to-Text Conversion  
Gemini 2.0 transforms audio files (podcasts, interviews, call recordings) into **structured transcripts** with:  
- **Precision timestamps** ([00:00] format)  
- **Speaker identification** (labeled or auto-assigned as Speaker A/B)  
- **Audio event detection**:  
  - Background music (with song recognition, e.g., `[02:15] [Firework by Katy Perry]`)  
  - Sound effects (e.g., `[01:30] [Bell ringing]`)  
  - Named jingles (e.g., `[00:45] [The Sofa Shop jingle]`)  

## Implementation Notes  
1. **File Preparation**:  
   - Supports MP3/WAV formats  
   - Update `file_path` to your audio file location  
2. **Output Format**:  
   - Clean text without markdown  
   - Terminates with `[END]` marker  
3. **Accuracy**:  
   - Context-aware spelling (corrects names/titles)  
   - English alphabet by default (non-English characters only when the model is confident they are correct)  

```text
# Example output snippet:
[00:00] Tom: Welcome to the podcast.
[00:03] Speaker A: Thanks for having me!
[00:06] [Coffee shop ambiance]
[01:30] [END]
```


In [30]:
%pip install -U -q "google-genai>=1.0.0"

In [31]:
from google import genai

from IPython.display import Markdown

## Configure your API key

To run the following cell, your API key must be stored in a Colab Secret named `GOOGLE_API_KEY`. If you don't already have an API key, or you're not sure how to create a Colab Secret, see [Authentication](https://github.com/google-gemini/cookbook/blob/main/quickstarts/Authentication.ipynb) for an example.

In [32]:
from google.colab import userdata

# Read the key from the Colab Secret named GOOGLE_API_KEY instead of
# hard-coding it in the notebook.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")

# Client object used for every Gemini API call in the cells below.
client = genai.Client(api_key=GOOGLE_API_KEY)

In [33]:
# Name of the Gemini model; exposed as a Colab form field with preset choices.
MODEL_ID="gemini-2.0-flash" # @param ["gemini-2.0-flash-lite","gemini-2.0-flash","gemini-2.0-pro-exp-02-05"] {"allow-input":true, isTemplate: true}

In [34]:
%pip install google-genai jinja2



Replace `/content/assets_porsche.mp3` with your audio file path before running.


In [35]:
from jinja2 import Template


# Path to the audio file to upload
file_path = "/content/assets_porsche.mp3"  # Replace with your own file path

# Upload the file to the File API; the returned handle is passed to the model below.
file = client.files.upload(file=file_path)

# Jinja2 template for the transcription prompt. The `speakers` list is rendered
# as one "- name" bullet per speaker under "Speakers are:".
prompt_template = Template("""Generate a transcript of the episode. Include timestamps and identify speakers.

Speakers are:
{% for speaker in speakers %}- {{ speaker }}{% if not loop.last %}\n{% endif %}{% endfor %}

eg:
[00:00] Brady: Hello there.
[00:02] Tim: Hi Brady.

It is important to include the correct speaker names. Use the names you identified earlier. If you really don't know the speaker's name, identify them with a letter of the alphabet, eg there may be an unknown speaker 'A' and another unknown speaker 'B'.

If there is music or a short jingle playing, signify like so:
[01:02] [MUSIC] or [01:02] [JINGLE]

If you can identify the name of the music or jingle playing then use that instead, eg:
[01:02] [Firework by Katy Perry] or [01:02] [The Sofa Shop jingle]

If there is some other sound playing try to identify the sound, eg:
[01:02] [Bell ringing]

Each individual caption should be quite short, a few short sentences at most.

Signify the end of the episode with [END].

Don't use any markdown formatting, like bolding or italics.

Only use characters from the English alphabet, unless you genuinely believe foreign characters are correct.

It is important that you use the correct words and spell everything correctly. Use the context of the podcast to help.
If the hosts discuss something like a movie, book or celebrity, make sure the movie, book, or celebrity name is spelled correctly.""")

# Define the known speakers and render the final prompt text
speakers = ["Tom"]
prompt = prompt_template.render(speakers=speakers)

# Send the prompt together with the uploaded audio. Use the model chosen in the
# MODEL_ID cell rather than a hard-coded name, so the form selection takes effect.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[prompt, file],
)

print(response.text)

[00:00] Tom: If the Porsche Macan has proven anything,
[00:04] Tom: it's that the days of sacrificing performance for practicality are gone.
[00:08] Tom: Long gone.
[00:10] Tom: Engineered to deliver a driving experience like no other, the Macan has demonstrated excellence in style and performance to become the leading sports car in its class.
[00:18] Tom: So don't let those five doors fool you.
[00:21] Tom: Once you're in the driver's seat, one thing will become immediately clear.
[00:28] [Engine Revving]
[00:30] Tom: This is a Porsche.
[00:33] Tom: The Macan, now leasing from 3.99%.
[00:36] Tom: Conditions apply.
[END]
