In [13]:
import os
import getpass

# Prompt for the key without echoing it, then export it to this process's
# environment so it never has to be hardcoded in the notebook.
os.environ.update({"GEMINI_API_KEY": getpass.getpass("Gemini API Key:")})

In [14]:
from google import genai
from google.genai import types
from PIL import Image
from io import BytesIO
import base64

# No API key is passed explicitly; presumably the SDK reads GEMINI_API_KEY,
# which the first cell places in the environment.
client = genai.Client()

# Text-only prompt describing the scene to generate.
contents = 'A chef in a sunlit rustic kitchen, flour dust dancing in golden morning light streaming through vintage windows, hands kneading artisanal bread dough on a worn wooden table scattered with fresh herbs and heirloom tomatoes, copper pots gleaming in background, shot with 85mm lens, shallow depth of field, warm color grading, professional food photography style, hyper-realistic detail --aspect 16:9'

# Request both a text part and an image part in the reply.
generation_config = types.GenerateContentConfig(
    response_modalities=['TEXT', 'IMAGE']
)

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents=contents,
    config=generation_config,
)

# Walk the parts of the first candidate: print text parts, and decode, save
# and show parts that carry inline binary data.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
        continue
    if part.inline_data is None:
        continue
    image = Image.open(BytesIO(part.inline_data.data))
    image.save('images/chef.png')
    image.show()

I will generate an image depicting a chef in a rustic kitchen bathed in warm morning sunlight. The air will be filled with dancing flour dust illuminated by the golden light streaming through old-fashioned windows. The chef's hands will be actively kneading artisanal bread dough on a well-used wooden table, which will also feature scattered fresh herbs and colorful heirloom tomatoes. In the soft-focused background, polished copper pots will subtly gleam. The image will have a shallow depth of field, characteristic of an 85mm lens, with a warm color grading and hyper-realistic details, reflecting a professional food photography style.



<div style="background-color: #204B8E; color: white; padding: 10px; border-radius: 5px;">

### Task #1: Adjust the prompt to include the face of the chef.

### Insights: Share any insights you have learned by playing around with the prompt.
</div>

### Text and Image -> Image

In [32]:
from google import genai
from google.genai import types
from PIL import Image
from io import BytesIO

import PIL.Image

# No API key is passed explicitly; presumably the SDK reads GEMINI_API_KEY,
# which the first cell places in the environment.
client = genai.Client()

# Load the source photo and re-encode it as JPEG into an in-memory buffer.
image = PIL.Image.open('images/brad.jpg')
buffered = BytesIO()
image.save(buffered, format="JPEG")

# Part.from_bytes takes raw bytes, so hand over the buffer contents directly.
# Encoding to base64 and decoding again is a no-op and would make this cell
# depend on `base64` being imported by some other cell.
image_bytes = buffered.getvalue()

text_input = 'This is a picture of me. Make me wear a crown like a king.'

# A single user turn carrying the instruction and the photo.
content = types.Content(
  role="user",
  parts=[
    types.Part(text=text_input),
    types.Part.from_bytes(
        mime_type="image/jpeg",
        data=image_bytes
    )
  ]
)

response = client.models.generate_content(
    model="models/gemini-2.0-flash-preview-image-generation",
    contents=content,
    config=types.GenerateContentConfig(
      # Request both a text part and an image part in the reply.
      response_modalities=['TEXT', 'IMAGE']
    )
)

# Print text parts; decode, save and show parts that carry inline binary data.
for part in response.candidates[0].content.parts:
  if part.text is not None:
    print(part.text)
  elif part.inline_data is not None:
    image = Image.open(BytesIO((part.inline_data.data)))
    # Pillow picks the output format from the file extension, so this file
    # is written as JPEG.
    image.save('images/brad_crown.jpg')
    image.show()

I will add a regal gold crown adorned with shimmering jewels to your head, transforming your portrait into that of a distinguished king.




In [35]:
from pprint import pprint

# Dump the most recently assigned `response` object to inspect its structure
# (candidates, content parts, usage metadata).
pprint(response)

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            text="""I will add a regal gold crown adorned with shimmering jewels to your head, transforming your portrait into that of a distinguished king.

"""
          ),
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.0-flash-preview-image-generation',
  response_id='Tz-paKivEuOIz7IPsenCyAU',
  sdk_http_response=HttpResponse(
    headers=<dict len=11>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=1315,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.IMAGE: 'IMAGE'>,
        token_count=1290
      ),
    ]

<div style="background-color: #204B8E; color: white; padding: 10px; border-radius: 5px;">

### Task #1: Analyze the structure of the response. Why is it structured this way?

### Answer:
</div>

In [25]:
from google import genai
from google.genai import types

# No API key is passed explicitly; presumably the SDK reads GEMINI_API_KEY,
# which the first cell places in the environment.
client = genai.Client()

# Upload the first image
image1_path = "images/brad.jpg"
uploaded_file = client.files.upload(file=image1_path)

# Prepare the second image as inline data
image2_path = "images/brad_crown.jpg"
with open(image2_path, 'rb') as f:
    img2_bytes = f.read()

# Create the prompt with text and multiple images
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        "What is different between these two images?",
        uploaded_file,  # Use the uploaded file reference
        types.Part.from_bytes(
            data=img2_bytes,
            # The file was saved with a .jpg extension, so its bytes are
            # JPEG; declare the MIME type that matches the actual data.
            mime_type='image/jpeg'
        )
    ]
)

print(response.text)

The only difference between the two images is that in the second image, **Brad Pitt is wearing a golden crown on his head**, while in the first image, he is not.


### Video processing

In [36]:
# Reference the video by URL rather than sending its bytes.
video_part = types.Part(
    file_data=types.FileData(file_uri='https://www.youtube.com/watch?v=G32jFMsS7ow&t=1s')
)
# The instruction to apply to the video.
prompt_part = types.Part(text='Please summarize the video in 3 sentences.')

# `client` and `types` are expected to be defined by earlier cells.
response = client.models.generate_content(
    model='models/gemini-2.5-flash',
    contents=types.Content(parts=[video_part, prompt_part]),
)

In [37]:
# Pretty-print the text of the most recently assigned `response`; `pprint`
# is expected to have been imported by an earlier cell.
pprint(response.text)

('Multimodal AI is introduced as the next generation of generative AI, '
 'designed to interact using natural human interfaces like voice, vision, and '
 'sound, unlike current AI that relies on keyboards. The video contrasts the '
 'traditional "sequential approach," which converts audio to text for '
 'processing and then back to speech, often leading to delays. Instead, the '
 '"multimodal approach" directly processes various inputs like audio and '
 'images into "embeddings," allowing for end-to-end, real-time responses. This '
 'direct processing, where AI "thinks in sound, not words," makes multimodal '
 'AI blazingly fast but also presents challenges in control, monitoring, and '
 'fine-tuning compared to its sequential counterpart.')


### Audio processing

In [49]:
from google.genai import types

# Read the whole WAV clip into memory so it can be sent inline.
with open('audio/a_projectile_is.wav', 'rb') as f:
    audio_bytes = f.read()

# `client` is expected to be defined by an earlier cell.
response = client.models.generate_content(
  model='gemini-2.5-flash',
  contents=[
    'transcribe',
    types.Part.from_bytes(
      data=audio_bytes,
      # The source file is a .wav, so declare WAV rather than MP3.
      mime_type='audio/wav',
    )
  ]
)

print(response.text)

A projectile is any object that is thrown or projected into the air and is only subject to the force of gravity. Projectile motion is the specific type of motion that a projectile undergoes, following a curved path.


In [50]:
# Reuses `audio_bytes`, read from audio/a_projectile_is.wav in the previous
# cell, and `client` from an earlier cell.
response = client.models.generate_content(
  model='gemini-2.5-flash',
  contents=[
    'describe this audio clip',
    types.Part.from_bytes(
      data=audio_bytes,
      # The bytes come from a .wav file, so declare WAV rather than MP3.
      mime_type='audio/wav',
    )
  ]
)

print(response.text)

The audio clip features a **clear and articulate male voice** speaking in a calm, informative tone. He is narrating or explaining a concept, specifically defining "projectile" and "projectile motion" in a manner typical of an educational or documentary setting.

Beneath his speech, there is a **subtle, continuous low-frequency hum or drone**. This ambient sound provides a gentle, atmospheric backdrop that is unobtrusive and does not compete with the speaking voice. It sounds somewhat electronic or synthesized, contributing to a thoughtful or slightly serious mood.

The speaking voice remains prominent throughout, with the background sound providing a consistent, underlying layer.
