<a href="https://colab.research.google.com/github/donbcolab/google_genai_colab/blob/main/gemini_multi_modal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Python Docs Samples](https://github.com/GoogleCloudPlatform/python-docs-samples) Repository
- The superhero of the story is Laurent Picard.  His [A Better Way to Use Google Cloud from Colab](https://medium.com/google-colab/a-better-way-to-use-google-cloud-from-colab-bb93f88b5021) Medium article provides a very simple approach for authenticating from Colab to Google Cloud resources.
- The Python Docs Samples repository is amazing.  It provides some very concise and to the point examples to test and validate core Google Generative AI models.
  - below are some simple ones that I wrapped in a simple gradio application

## Prerequisites

- a Google Cloud Project setup with required APIs enabled in **us-central1**
- other locations may be available after April 2024
- For the current Notebook the key API is
  - Vertex AI API (aiplatform.googleapis.com)

## Google Project Setup and Authentication

In [1]:
from google.colab import userdata

# Project and region are read from Colab's `userdata` store rather than
# hard-coded, so the notebook can be shared without exposing project details.
# Both keys must exist in that store before this cell is run.
PROJECT_ID = userdata.get('GOOGLE_CLOUD_PROJECT_ID')
LOCATION = userdata.get('GOOGLE_CLOUD_LOCATION')

In [2]:
from google.colab import auth
# Authenticate the current Colab user, passing the project that the rest of
# the notebook works against.
auth.authenticate_user(project_id=PROJECT_ID)

## Install Python dependencies

In [3]:
! pip3 install -U -q google-cloud-aiplatform
! pip3 install -U -q 'anthropic[vertex]'
! pip install -q gradio

## Vertex and Gemini App Logic

In [4]:
import vertexai
import time
from vertexai.generative_models import GenerativeModel, ChatSession, Part
from vertexai.preview.vision_models import Image, ImageGenerationModel, ImageTextModel
from anthropic import AnthropicVertex
from vertexai.preview.generative_models import (
    grounding,
    Tool,
)

# Initialise the Vertex AI SDK once for the whole notebook with the project
# and location defined in the setup cell; the helper functions below rely on it.
vertexai.init(project=PROJECT_ID, location=LOCATION)

def summarize_audio(audio_file_path):
    """Summarize an audio file with Gemini and return the summary text.

    Args:
        audio_file_path: Path to a local audio file. In this notebook it is
            supplied by the Gradio ``Audio`` component (``type="filepath"``),
            which passes ``None`` when nothing was recorded or uploaded.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        ValueError: If no file path was supplied.
    """
    # Fail fast with a readable message instead of the TypeError that
    # ``open(None)`` would otherwise raise further down.
    if not audio_file_path:
        raise ValueError("No audio file provided: record or upload audio first.")

    model = GenerativeModel("gemini-1.5-pro-preview-0409")
    prompt = """
    Please provide a summary for the audio.
    Provide chapter titles with timestamps, be concise and short, no need to provide chapter summaries.
    Do not make up any information that is not part of the audio and do not be verbose.
    """

    # Label the payload according to the file extension rather than always
    # claiming MP3: per the Gradio docs, microphone recordings are saved as
    # WAV by default, and uploads may be in other formats. Unknown extensions
    # keep the previous "audio/mpeg" label.
    mime_types_by_extension = {
        "mp3": "audio/mpeg",
        "wav": "audio/wav",
        "flac": "audio/flac",
    }
    extension = str(audio_file_path).rpartition(".")[2].lower()
    mime_type = mime_types_by_extension.get(extension, "audio/mpeg")

    # NOTE(review): the purpose of this fixed delay is not evident from the
    # code; presumably it gives the uploaded file time to settle — confirm.
    time.sleep(3)
    print(audio_file_path)
    with open(audio_file_path, "rb") as audio_file:
        audio_bytes = audio_file.read()
    audio = Part.from_data(audio_bytes, mime_type=mime_type)
    response = model.generate_content([audio, prompt])
    return response.text

def transcript_audio(audio_file_path):
    """Transcribe an audio file with Gemini and return the transcript text.

    Args:
        audio_file_path: Path to a local audio file. In this notebook it is
            supplied by the Gradio ``Audio`` component (``type="filepath"``),
            which passes ``None`` when nothing was recorded or uploaded.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        ValueError: If no file path was supplied.
    """
    # Fail fast with a readable message instead of the TypeError that
    # ``open(None)`` would otherwise raise further down.
    if not audio_file_path:
        raise ValueError("No audio file provided: record or upload audio first.")

    model = GenerativeModel("gemini-1.5-pro-preview-0409")
    prompt = """
    Transcribe this recording.
    If there are multiple speakers capture it in the format of timecode, speaker, caption.
    Use speaker A, speaker B, etc. to identify speakers.
    """

    # Label the payload according to the file extension rather than always
    # claiming MP3: per the Gradio docs, microphone recordings are saved as
    # WAV by default, and uploads may be in other formats. Unknown extensions
    # keep the previous "audio/mpeg" label.
    mime_types_by_extension = {
        "mp3": "audio/mpeg",
        "wav": "audio/wav",
        "flac": "audio/flac",
    }
    extension = str(audio_file_path).rpartition(".")[2].lower()
    mime_type = mime_types_by_extension.get(extension, "audio/mpeg")

    # NOTE(review): the purpose of this fixed delay is not evident from the
    # code; presumably it gives the uploaded file time to settle — confirm.
    time.sleep(3)
    print(audio_file_path)
    with open(audio_file_path, "rb") as audio_file:
        audio_bytes = audio_file.read()
    audio = Part.from_data(audio_bytes, mime_type=mime_type)
    response = model.generate_content([audio, prompt])
    return response.text

def analyze_video_with_audio(video_file_path):
    """Describe a video (including what is said in it) using Gemini.

    Args:
        video_file_path: Path to a local video file. In this notebook it is
            supplied by the Gradio ``Video`` component, which is configured
            with ``format="mp4"``; the bytes are therefore sent as
            ``video/mp4``.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        ValueError: If no file path was supplied.
    """
    # Fail fast with a readable message instead of the TypeError that
    # ``open(None)`` would otherwise raise further down.
    if not video_file_path:
        raise ValueError("No video file provided: upload a video first.")

    model = GenerativeModel("gemini-1.5-pro-preview-0409")
    prompt = """
    Provide a description of the video.
    The description should also highlight anything important which people say in the video.
    """
    # NOTE(review): the purpose of this fixed delay is not evident from the
    # code; presumably it gives the uploaded file time to settle — confirm.
    time.sleep(5)
    print(video_file_path)
    with open(video_file_path, "rb") as video_file:
        video_bytes = video_file.read()
    video = Part.from_data(video_bytes, mime_type="video/mp4")
    response = model.generate_content([video, prompt])
    return response.text

def generate_image(prompt):
    """Generate one image for ``prompt`` with the Imagen model.

    Args:
        prompt: Text description of the image to generate.

    Returns:
        The first generated image, as exposed by its ``_pil_image`` attribute.

    Raises:
        RuntimeError: If the model response contains no images.
    """
    model = ImageGenerationModel.from_pretrained("imagegeneration@006")

    generate_response = model.generate_images(
        prompt=prompt,
        number_of_images=1,
        language="en",
        aspect_ratio="1:1",
        safety_filter_level="block_few",
        person_generation="allow_adult",
    )

    # The response is iterable; materialise it once so an empty result can be
    # reported clearly instead of surfacing as a bare IndexError.
    generated = list(generate_response)
    if not generated:
        raise RuntimeError(
            "The image model returned no images for this prompt."
        )

    # NOTE: `_pil_image` is a private attribute of the SDK's image object; it
    # is kept because the Gradio Image output is fed from it.
    return generated[0]._pil_image

def caption_image(input_file):
    """Generate an English caption for an image file.

    Args:
        input_file: Path to a local image file. In this notebook it is
            supplied by a Gradio ``Image`` component with ``type="filepath"``,
            which passes ``None`` when no image was provided.

    Returns:
        Whatever ``ImageTextModel.get_captions`` returns for a single
        requested result (a list of caption strings according to the
        Vertex AI SDK documentation).

    Raises:
        ValueError: If no file path was supplied.
    """
    # Fail fast with a readable message rather than passing a missing path
    # on to the image loader.
    if not input_file:
        raise ValueError("No image provided: upload an image first.")

    model = ImageTextModel.from_pretrained("imagetext@001")
    # NOTE(review): the purpose of this fixed delay is not evident from the
    # code; presumably it gives the uploaded file time to settle — confirm.
    time.sleep(5)
    print(input_file)
    source_img = Image.load_from_file(location=input_file)

    return model.get_captions(
        image=source_img,
        language="en",
        number_of_results=1,
    )

def generate_claude_text(prompt):
    """Send ``prompt`` to Claude 3 Haiku on Vertex AI and return the reply.

    The reply is requested as a stream; the streamed text fragments are
    concatenated and returned as one string.

    Args:
        prompt: The user message to send.

    Returns:
        The full generated text.
    """
    anthropic_client = AnthropicVertex(project_id=PROJECT_ID, region=LOCATION)
    conversation = [{"role": "user", "content": prompt}]

    with anthropic_client.messages.stream(
        model="claude-3-haiku@20240307",
        max_tokens=1024,
        messages=conversation,
    ) as stream:
        # Consume the stream while it is still open.
        fragments = list(stream.text_stream)

    return "".join(fragments)

def generate_text_with_grounding(chat_prompt):
    """Answer ``chat_prompt`` with Gemini, using Google Search as a tool.

    A fresh chat session is started for every call, and the prompt is sent
    with a Google Search retrieval tool attached.

    Args:
        chat_prompt: The user prompt to send to the model.

    Returns:
        The ``text`` attribute of the chat response.
    """
    session = GenerativeModel("gemini-1.5-pro-preview-0409").start_chat()
    search_tool = Tool.from_google_search_retrieval(
        grounding.GoogleSearchRetrieval()
    )
    reply = session.send_message(chat_prompt, tools=[search_tool])
    return reply.text

## Gradio Web App

In [None]:
import gradio as gr

# Build the UI: one tab per helper function, each with an input component,
# a trigger button and an output component wired together via `click`.
with gr.Blocks() as demo:
    gr.Markdown("# Gemini Multi Modal Generation and Analysis")

    with gr.Tab("Audio Summarization"):
        audio_file_summary = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload Audio")
        summarize_button = gr.Button("Summarize Audio")
        summary_output = gr.Textbox(label="Audio Summary")
        summarize_button.click(
            fn=summarize_audio,
            inputs=audio_file_summary,
            outputs=summary_output,
        )

    with gr.Tab("Audio Transcription"):
        audio_file_transcript = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload Audio")
        transcript_button = gr.Button("Transcribe Audio")
        transcript_output = gr.Textbox(label="Audio Transcription")
        transcript_button.click(
            fn=transcript_audio,
            inputs=audio_file_transcript,
            outputs=transcript_output,
        )

    with gr.Tab("Video Analysis"):
        video_file = gr.Video(label="Upload Video", format="mp4")
        analyze_button = gr.Button("Analyze Video")
        analysis_output = gr.Textbox(label="Video Analysis")
        analyze_button.click(
            fn=analyze_video_with_audio,
            inputs=video_file,
            outputs=analysis_output,
        )

    with gr.Tab("Image Generation"):
        image_prompt = gr.Textbox(label="Image Prompt")
        generate_button = gr.Button("Generate Image")
        image_output = gr.Image(label="Generated Images")
        generate_button.click(
            fn=generate_image,
            inputs=image_prompt,
            outputs=image_output,
        )

    with gr.Tab("Image Captioning"):
        caption_prompt = gr.Image(label="Image", type="filepath")
        caption_button = gr.Button("Generate Caption")
        caption_output = gr.Textbox(label="Image Caption")
        caption_button.click(
            fn=caption_image,
            inputs=caption_prompt,
            outputs=caption_output,
        )

    with gr.Tab("Anthropic Claude"):
        claude_prompt = gr.Textbox(label="Prompt")
        claude_button = gr.Button("Generate Text")
        claude_output = gr.Textbox(label="Generated Text")
        claude_button.click(
            fn=generate_claude_text,
            inputs=claude_prompt,
            outputs=claude_output,
        )

    with gr.Tab("Google Search as a Tool for Grounding"):
        grounding_prompt = gr.Textbox(label="Prompt")
        grounding_button = gr.Button("Generate Text")
        grounding_output = gr.Textbox(label="Generated Text")
        grounding_button.click(
            fn=generate_text_with_grounding,
            inputs=grounding_prompt,
            outputs=grounding_output,
        )

# Launch only after the Blocks context has been closed, i.e. once the layout
# is fully defined. `share=True` requests a public link and `debug=True`
# keeps the cell running so errors and logs stay visible.
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://f0e950f5e861a7127f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
