##### Copyright 2025 Google LLC.

In [None]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini API: Getting started with Gemini models

<a target="_blank" href="https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" height=30/></a>

## Setup

### Install SDK

Install the SDK from [PyPI](https://pypi.org/project/google-genai/).

In [None]:
!pip3 install -q google_genai>=1.55.0

### Setup your API key

To run the following cell, your API key must be stored in a Colab Secret named `GEMINI_API_KEY`. If you don't already have an API key or you aren't sure how to create a Colab Secret, see [Authentication](../quickstarts/Authentication.ipynb) for an example.

In [None]:
import os
from google.colab import userdata

os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')

### Utility functions

In [None]:
from IPython.display import Audio,Markdown, Image, display
import textwrap
import base64
import wave


def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
   """Write raw PCM frames to ``filename`` as a WAV file.

   Args:
     filename: Destination path; converted with ``str()`` before opening.
     pcm: Bytes-like object holding the audio frames to write.
     channels: Number of audio channels stored in the header.
     rate: Frame rate (samples per second) stored in the header.
     sample_width: Sample width in bytes stored in the header.
   """
   target = str(filename)
   writer = wave.open(target, "wb")
   try:
      # Header fields must be set before the first frames are written.
      writer.setnchannels(channels)
      writer.setsampwidth(sample_width)
      writer.setframerate(rate)
      writer.writeframes(pcm)
   finally:
      # Closing finalizes the header and releases the file handle.
      writer.close()


import collections


def show_outputs(outputs):
  """Render every part of an interaction's ``outputs`` list in the notebook.

  Each part is dispatched on its ``type`` attribute:

  * ``text`` is rendered as Markdown.
  * ``thought`` is not displayed.
  * ``image`` has its base64 ``data`` decoded and shown as an image.
  * ``function_call`` is printed with ``repr``.
  * ``audio`` has its base64 ``data`` decoded, written to ``speech.wav`` with
    ``wave_file`` and shown as an audio player (each audio part overwrites the file).
  * ``code_execution_call`` / ``code_execution_result`` are shown as fenced code blocks.
  * ``google_search_call`` / ``google_search_result`` show the queries and a
    bulleted list of result links.
  * ``url_context_call`` / ``url_context_result`` show a label and a bulleted
    list of URLs prefixed with a status emoji.

  Parts of any other type are skipped without output.

  Args:
    outputs: Iterable of output parts, e.g. ``response.outputs``.
  """
  # Import under a private alias: a later cell runs `from PIL import Image`, which
  # rebinds the notebook-global name `Image` and would otherwise break image display.
  from IPython.display import Image as _DisplayImage

  for part in outputs:
    if part.type == 'text':
      display(Markdown(part.text))
    elif part.type == 'thought':
      pass
    elif part.type == 'image':
      display(_DisplayImage(data=base64.b64decode(part.data)))
    elif part.type == 'function_call':
      print(repr(part))
    elif part.type == 'audio':
      wave_file("speech.wav", base64.b64decode(part.data))
      display(Audio("speech.wav"))
    elif part.type == 'code_execution_call':
      display(Markdown(f"\n```python\n{part.arguments.code}\n```\n"))
    elif part.type == 'code_execution_result':
      display(Markdown(f"\n```\n{part.result}\n```\n"))
    elif part.type == 'google_search_call':
      # The call may arrive without arguments; show a placeholder in that case.
      queries = "???" if part.arguments is None else part.arguments.queries
      display(Markdown(f"\n```\ngoogle_search({queries})\n```\n"))
    elif part.type == 'google_search_result':
      links = [f"* [{result.title}]({result.url})" for result in part.result]
      display(Markdown("\n".join(links)))
    elif part.type == 'url_context_call':
      display(Markdown("Url Context"))
    elif part.type == 'url_context_result':
      status_emoji = {"SUCCESS": "✅", "FAIL": "❌"}
      markdown_lines = [
          # Unknown statuses fall back to a question mark.
          f"* {status_emoji.get(item.status, '❓')} [{item.url}]({item.url})"
          for item in part.result
      ]
      display(Markdown("\n".join(markdown_lines)))


### Initialize SDK client


In [None]:
from google import genai

In [None]:
client = genai.Client()

In [None]:
for m in client.models.list():
  if 'gemini-3' in m.name:
    print(m.name)

### Choose a model

Select the model you want to use in this guide. You can either select one from the list or enter a model name manually. Keep in mind that some models, such as the 2.5 ones, are thinking models and thus take slightly more time to respond. For more details, see the [thinking notebook](./Get_started_thinking.ipynb) to learn how to switch the thinking off.

For a full overview of all Gemini models, check the [documentation](https://ai.google.dev/gemini-api/docs/models/gemini).

In [None]:
MODEL_ID = "gemini-2.5-flash"

## Standard interactions

Use the `client.interactions.create` method to generate responses to your prompts. You can pass text directly as `input`. The response's `outputs` attribute is a list of typed parts, which the `show_outputs` helper defined above renders one by one.

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input="What's the largest planet in our solar system?",
)

In [None]:
print(response.model_dump_json(indent=2, exclude_none=True))

In [None]:
response.outputs

In [None]:
show_outputs(response.outputs)

## Deep research

In [None]:
for m in client.models.list():
  if 'research' in m.name:
    print(m.name)

In [None]:
stream = client.interactions.create(
    agent="deep-research-preview",
    input='I want to learn more about the history of hadrians wall',
    agent_config={'thinking_summaries':'auto', 'type':'deep-research'},
    background = True,
    stream = True,
)

In [None]:
start = next(stream)
print(start)

In [None]:
for chunk in stream:
  print(chunk)

If you get disconnected, you can reconnect with `get`, replaying the whole stream:

In [None]:
stream = client.interactions.get(
    id=start.interaction.id,
    stream=True)

In [None]:
for chunk in stream:
  print(chunk)

Get without streaming returns the whole combined result:

In [None]:
interaction = client.interactions.get(id=start.interaction.id)
print(interaction)

If you don't need the thought summaries or realtime updates, you can leave them out:

In [None]:
result = client.interactions.create(
    agent="deep-research-preview",
    input='I want to learn more about the history of hadrians wall',
    background = True,
)

In [None]:
import time

while result.status == 'in_progress':
  time.sleep(10)
  print('.', end='')
  result = client.interactions.get(id=result.id)

In [None]:
print(result)

## Send image prompts

In this first example, you'll download an image from a specified URL, save it as a byte stream and then write those bytes to a local file named `jetpack.png`.

Gemini models are multimodal and support multimodal prompts. You can include text, [PDF documents](../quickstarts/PDF_Files.ipynb), images, [audio](../quickstarts/Audio.ipynb) and [video](../quickstarts/Video.ipynb) in your prompt requests and get text or code responses.

See the "Multimedia input" section below for other media types.



In [None]:
import requests
import pathlib
from PIL import Image

IMG = "https://storage.googleapis.com/generativeai-downloads/data/jetpack.png" # @param {type: "string"}

img_bytes = requests.get(IMG).content

img_path = pathlib.Path('jetpack.png')
img_path.write_bytes(img_bytes)

In this second example, you'll open a previously saved image, create a thumbnail of it and then generate a short blog post based on the thumbnail, displaying both the thumbnail and the generated blog post.

In [None]:
image = Image.open(img_path)
image.thumbnail([512,512])
image

In [None]:
import base64

response = client.interactions.create(
    model=MODEL_ID,
    input=[
        {
            "type": 'image',
            "mime_type": "image/png",
            "data": base64.b64encode(img_path.read_bytes()).decode("utf-8"),
        },
        {
            "type": 'text',
            "text": "Write a short and engaging blog post based on this picture.",
        }
    ]
)

In [None]:
show_outputs(response.outputs)

If you've uploaded a file to the files-api, pass the file-uri instead of the bytes.

In [None]:
uploaded_file = client.files.upload(
    file=img_path,
)

In [None]:
import base64

response = client.interactions.create(
    model=MODEL_ID,
    input=[
        {
            "type": 'image',
            "mime_type": "image/png",
            "uri": uploaded_file.uri
        },
        {
            "type": 'text',
            "text": "Write a short and engaging blog post based on this picture.",
        }
    ]
)

In [None]:
show_outputs(response.outputs)

## System instructions

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input="Tell me how the internet works",
    system_instruction="Pretend the user is a puppy who's only interested in squeaky toys."
)

In [None]:
show_outputs(response.outputs)

## Generation config

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input="Tell me how the internet works",
    system_instruction="Pretend the user is a puppy who's only interested in squeaky toys.",

    generation_config = {
        "max_output_tokens": 255
    }
)

## Multi-turn chat

The Gemini API enables you to have freeform conversations across multiple turns. This is on by default: the id from one turn can be used as the starting point for the next.

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    system_instruction="""
          You are an expert software developer and a helpful coding assistant.
          You are able to generate high-quality code in any programming language.
        """,
    input="Write a function that checks if a year is a leap year in C#.",
)
msg1 = response

In [None]:
print(response.id)

You can look up an interaction by its id:

In [None]:
# Fetch the stored interaction again by its id (keyword form, as in the other `get` calls).
msg1_copy = client.interactions.get(id=msg1.id)

Use `response.id` to continue a conversation with another message:

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    previous_interaction_id=msg1.id,
    input="Wait, we got disconnected, who are you? what were we talking about?",
)
msg2 = response

In [None]:
show_outputs(response.outputs)

### Branching

Since the interactions are persistent resources, reusing an old id branches the conversation:

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    previous_interaction_id=msg1.id,
    input="Including this one, how many messages have I sent you so far? what were they?"
)

In [None]:
show_outputs(response.outputs)

### Disable multi-turn

If you call `interactions.create` with `store=False`, it won't store the interaction or return an ID.

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    store=False,
    system_instruction="""
          You are an expert software developer and a helpful coding assistant.
          You are able to generate high-quality code in any programming language.
        """,
    input="Write a function that checks if a year is a leap year in C#.",
)

In [None]:
response.id is None

### Manual

To manage a multiturn conversation yourself, pass a list of turns as the input.

Then append additional turns and resend it to generate later outputs:

In [None]:
# Client-side history: a list of turns, each with a role and a list of typed content parts.
conversation_history = [
  {
    "role": "user",
    "content": [{"type": "text", "text": "What are the three largest cities in Spain?"}]
  },
]

# Use the notebook-wide MODEL_ID so this section follows the model chosen above.
# store=False: the history is resent on every call instead of being kept server-side.
response = client.interactions.create(
    model=MODEL_ID,
    input=conversation_history,
    store=False,
)

show_outputs(response.outputs)

In [None]:
# Record the model's previous answer, then add the next user turn.
conversation_history.append({
    'role': 'model',
    'content': response.outputs
})

conversation_history.append({
    'role': 'user',
    'content': [{"type": "text", "text": "What is the population of each city?"}]
})

# Resend the whole history; use the notebook-wide MODEL_ID rather than a hard-coded name.
response = client.interactions.create(
    model=MODEL_ID,
    input=conversation_history,
    store=False,
)

# Keep the history complete so further turns can be appended the same way.
conversation_history.append({
    'role': 'model',
    'content': response.outputs
})

In [None]:
show_outputs(response.outputs)

In [None]:
len(conversation_history)

## Generate JSON

The [controlled generation](https://ai.google.dev/gemini-api/docs/structured-output?lang=python#generate-json) capability in Gemini API allows you to constrain the model output to a structured format. You can provide the schemas as Pydantic Models or a JSON string.

In [None]:
import pydantic
import json

class Recipe(pydantic.BaseModel):
    recipe_name: str
    recipe_description: str
    recipe_ingredients: list[str]

response = client.interactions.create(
    model=MODEL_ID,
    input="Write a popular cookie recipe and its ingredients.",
    response_format=Recipe.model_json_schema(),
)

In [None]:
# Only parts that carry non-empty text are parsed; other part types are skipped.
for part in response.outputs:
  if text := getattr(part, 'text', None):
    # Validate the JSON text against the Recipe schema that was requested above.
    parsed = Recipe.model_validate_json(text)
    print(repr(parsed))
    print()
    print(parsed.model_dump_json(indent=2))

### Pydantic TypeAdapter

Use `TypeAdapter` if you need it to generate something other than a pydantic class:

In [None]:


type_adaptor = pydantic.TypeAdapter(list[Recipe])

response = client.interactions.create(
    model=MODEL_ID,
    input="Provide 3 popular cookie recipes and their ingredients.",
    response_format=type_adaptor.json_schema(),
)

In [None]:
# Only parts that carry non-empty text are parsed; other part types are skipped.
for part in response.outputs:
  if text := getattr(part, 'text', None):
    # Validate the JSON text against the list[Recipe] schema requested above.
    parsed = type_adaptor.validate_json(text)
    print(parsed)

## Generate Images

Gemini can output images directly as part of a conversation:

In [None]:
from IPython.display import Image, Markdown

response = client.interactions.create(
    model="gemini-3-pro-image-preview",
    input='Hi, can create photorealistic image of a pig with wings, a top hat, and monocle flying over a happy futuristic "solar punk" scifi city with lots of greenery?',
    response_modalities=['TEXT', 'IMAGE'] # this is the default for image models
)

In [None]:
print(response.model_dump_json(indent=2, exclude_none=True))

In [None]:
show_outputs(response.outputs)

In [None]:
from IPython.display import Image, Markdown

response = client.interactions.create(
    model="gemini-2.5-flash-image",
    input='Hi, can create photorealistic image of a pig with wings, a top hat, and monocle flying over a happy futuristic "solar punk" scifi city with lots of greenery?',
    #response_modalities=['TEXT', 'IMAGE'] # this is the default for image models
)

[Imagen](./Get_started_imagen.ipynb) is another way to generate images. See the [documentation](https://ai.google.dev/gemini-api/docs/image-generation#choose-a-model) for recommendations on where to use each one.

## Generate audio

In [None]:
response = client.interactions.create(
    model="gemini-2.5-flash-preview-tts",
    input="Say cheerfully: Have a wonderful day!",
    generation_config={'speech_config': {"voice": "algenib", "language": "en-US"}},
    #response_modalities=["audio"]
)



In [None]:
show_outputs(response.outputs)

## Streaming

By default, the model returns a response after completing the entire generation process. You can set `stream=True` to receive the response as a stream of chunks while it is being generated.

In [None]:
stream = client.interactions.create(
    model=MODEL_ID,
    input="Tell me a story about a lonely robot who finds friendship in a most unexpected place.",
    stream=True
)


In [None]:
for chunk in stream:
  print(chunk)

## Send asynchronous requests

Use the `client.aio` namespace to make async requests, and `await` the call:

In [None]:
response = await client.aio.interactions.create(
    model="gemini-2.5-flash",
    input="Tell me a story about a lonely robot who finds friendship in a most unexpected place.",
)

In [None]:
print(response)

## Function calling

[Function calling](https://ai.google.dev/gemini-api/docs/function-calling) lets you provide a set of tools that it can use to respond to the user's prompt. You create a description of a function in your code, then pass that description to a language model in a request. The response from the model includes:
- The name of a function that matches the description.
- The arguments to call it with.

In [None]:
get_destination = {
    'type':'function',
    "name": "get_destination",
    "description": "Get directions to the destination.",
    "parameters": {
        "type": "OBJECT",
        "properties": {
            "destination": {
                "type": "STRING",
                "description": "Get directions to the destination.",
            },
        },
    },
}

response = client.interactions.create(
    model=MODEL_ID,
    input="I'd like to travel to Paris.",
    tools=[get_destination],
)

In [None]:
show_outputs(response.outputs)

In [None]:
import inspect
from typing import Any, Callable


def to_function_schema(f: Callable[..., Any], include_required: bool = False) -> dict[str, Any]:
    """Convert a Python function into a function-calling tool declaration.

    The parameter schemas are derived from the function's type annotations via
    ``pydantic.TypeAdapter``; the tool name and description come from the
    function's ``__name__`` and docstring.

    Args:
      f: The function to describe.
      include_required: When True, add a ``"required"`` list naming every
        parameter that has no default value. Defaults to False, which leaves
        the key out of the returned schema.

    Returns:
      A dict with ``type``, ``name``, ``description`` and ``parameters`` keys.
    """
    properties = {}
    required_params = []

    for name, param in inspect.signature(f).parameters.items():
      if param.annotation is inspect.Parameter.empty:
        # No annotation: emit an empty schema instead of handing the sentinel to pydantic.
        properties[name] = {}
      else:
        properties[name] = pydantic.TypeAdapter(param.annotation).json_schema()

      # Identity check: `==` would call the default value's own __eq__,
      # which may raise or return a non-boolean.
      if param.default is inspect.Parameter.empty:
        required_params.append(name)

    parameters = {
        "type": "object",
        "properties": properties,
    }
    if include_required:
      parameters["required"] = required_params

    return {
      "type": "function",
      "name": f.__name__,
      # getdoc() strips the indentation of multi-line docstrings; None if there is no docstring.
      "description": inspect.getdoc(f),
      "parameters": parameters,
    }

In [None]:
def price_cookies(recipe: Recipe) -> float:
  """Get the price of a cookie, given the recipe"""
  # The docstring above is used verbatim as the tool description by to_function_schema.
  # The recipe is not inspected; a flat placeholder price is returned.
  flat_price = 2.00
  return flat_price

to_function_schema(price_cookies)

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input="Invent a chocolate cookie recipe, write it out as expected by the pricing function, then call the function on the recipe to compute the the price per cookie",
    tools=[to_function_schema(price_cookies)],
)

In [None]:
show_outputs(response.outputs)

## Code execution

[Code execution](https://ai.google.dev/gemini-api/docs/code-execution?lang=python) lets the model generate and execute Python code to answer complex questions. You can find more examples in the [Code execution quickstart guide](./Code_execution.ipynb).

In [None]:
from IPython.display import Image, Markdown, Code, HTML

response = client.interactions.create(
    model=MODEL_ID,
    input="Generate and run a script to count how many letter r there are in the word strawberry",
    tools=[{'type': 'code_execution'}]
)

In [None]:
print(response.model_dump_json(indent=2, exclude_none=True))

In [None]:
show_outputs(response.outputs)

## GoogleSearch

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input="lookup a recipe for cream of brocoli soup",
    tools=[{'type': 'google_search'}]
)

In [None]:
print(response.model_dump_json(indent=2, exclude_none=True))

In [None]:
show_outputs(response.outputs)

## Use urlContext

The URL Context tool empowers Gemini models to directly access, process, and understand content from user-provided web page URLs. This is key for enabling dynamic agentic workflows, allowing models to independently research, analyze articles, and synthesize information from the web as part of their reasoning process.

In this example you will use two links as reference and ask Gemini to find differences between the cooking recipes present in each of the links:

In [None]:
prompt = """
Compare recipes from https://www.food.com/recipe/homemade-cream-of-broccoli-soup-271210
and from https://www.allrecipes.com/recipe/13313/best-cream-of-broccoli-soup/,
list the key differences between them.
"""


response = client.interactions.create(
      model=MODEL_ID,
      input=prompt,
      tools=[{'type': "url_context"}]

)


In [None]:
print(response.model_dump_json(indent=2, exclude_none=True))

In [None]:
show_outputs(response.outputs)

## Multimedia input

Data can always be included inline (as below) or uploaded to the Files API, then referred to by its URI (see the example in the "Send image prompts" section above).

### Upload a PDF file

This PDF page is an article titled [Smoothly editing material properties of objects](https://research.google/blog/smoothly-editing-material-properties-of-objects-with-text-to-image-models-and-synthetic-data/) with text-to-image models and synthetic data available on the Google Research Blog.


In [None]:
# Prepare the file to be uploaded
PDF = "https://storage.googleapis.com/generativeai-downloads/data/Smoothly%20editing%20material%20properties%20of%20objects%20with%20text-to-image%20models%20and%20synthetic%20data.pdf"  # @param {type: "string"}
pdf_bytes = requests.get(PDF).content

pdf_path = pathlib.Path('article.pdf')
pdf_path.write_bytes(pdf_bytes)

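Next, you'll send the saved PDF inline and generate a bulleted list summary of its contents. After that, the same request is repeated with the file uploaded through the Files API and referenced by its URI.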

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input=[
        {
            "type": 'text',
            "text": "Summarize this file as a bulleted list.",
        },
        {
          "type": 'document',
          "mime_type": "application/pdf",
          "data": base64.b64encode(pdf_bytes).decode("utf-8"),
        },
    ]
)

In [None]:
show_outputs(response.outputs)

In [None]:
pdf_upload = client.files.upload(file=pdf_path)

In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input=[
        {
            "type": 'text',
            "text": "Summarize this file as a bulleted list.",
        },
        {
          "type": 'document',
          "mime_type": "application/pdf",
          "uri": pdf_upload.uri,
        },
    ]
)

In [None]:
show_outputs(response.outputs)

### Upload an audio file

In this case, you'll use a [sound recording](https://www.jfklibrary.org/asset-viewer/archives/jfkwha-006) of President John F. Kennedy’s 1961 State of the Union address.

In [None]:
# Prepare the file to be uploaded
AUDIO = "https://storage.googleapis.com/generativeai-downloads/data/Walking_thoughts_3.m4a"  # @param {type: "string"}
audio_bytes = requests.get(AUDIO).content


In [None]:
response = client.interactions.create(
    model=MODEL_ID,
    input=[
        {
            "type": 'text',
            "text": "Summarize this audio file.",
        },
        {
          "type": 'audio',
          "mime_type": "audio/x-m4a",
          "data": base64.b64encode(audio_bytes).decode("utf-8"),
        },
    ]
)

### Upload a video file

In this case, you'll use a short clip of [Big Buck Bunny](https://peach.blender.org/about/).

In [None]:
# Download the video file
VIDEO_URL = "https://storage.googleapis.com/generativeai-downloads/videos/Big_Buck_Bunny.mp4"  # @param {type: "string"}
video_bytes = requests.get(VIDEO_URL).content


In [None]:
# Ask Gemini about the video
response = client.interactions.create(
    model=MODEL_ID,
    input=[
        {
            "type": 'text',
            "text": "Summarize this video.",
        },
        {
          "type": 'video',
          "mime_type": "video/mp4",
          "data": base64.b64encode(video_bytes).decode("utf-8"),
        },
    ]
)

### YouTube link

For YouTube links, you don't need to explicitly upload the video file content, but you do need to explicitly declare the video URL you want the model to process as part of the `input` of the request. For more information see the [vision](https://ai.google.dev/gemini-api/docs/vision?lang=python#youtube) documentation including the features and limits.

> **Note:** You're only able to submit up to one YouTube link per request.

> **Note:** If your text input includes YouTube links, the system won't process them, which may result in incorrect responses. To ensure proper handling, explicitly provide the URL using the `uri` field of a `video` input part.

The following example shows how you can use the model to summarize the video. In this case use a summary video of [Google I/O 2024](https://www.youtube.com/watch?v=WsEQjeZoEng).

In [None]:
import base64

response = client.interactions.create(
    model=MODEL_ID,
    input=[
        {
            "type": 'video',
            "uri": "https://www.youtube.com/watch?v=WsEQjeZoEng",
        },
        {
            "type": 'text',
            "text": "Summarize this video",
        }
    ]
)
