In [8]:
!pip install gsutil

Collecting gsutil
  Downloading gsutil-5.27.tar.gz (3.0 MB)
Collecting argcomplete>=1.9.4
  Downloading argcomplete-3.2.1-py3-none-any.whl (42 kB)
Collecting crcmod>=1.7
  Downloading crcmod-1.7.tar.gz (89 kB)
Collecting fasteners>=0.14.1
  Downloading fasteners-0.19-py3-none-any.whl (18 kB)
Collecting gcs-oauth2-boto-plugin>=3.0
  Downloading gcs-oauth2-boto-plugin-3.0.tar.gz (20 kB)
Collecting google-apitools>=0.5.32
  Downloading google_apitools-0.5.32-py3-none-any.whl (135 kB)
Collecting httplib2==0.20.4
  Downloading httplib2-0.20.4-py3-none-any.whl (96 kB)
Collecting google-reauth>=0.1.0
  Downloading google_reauth-0.1.1-py2.py3-none-any.whl (17 kB)
Collecting monotonic>=1.4
  Using cached monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Collecting retry_decorator>=1.0.0
  Downloading retry_decorator-1.1.1.tar.gz (3.9 kB)
Collecting rsa==4.7.2
  Downloading rsa-4.7.2-py3-none-any.whl (34 kB)
Collecting boto>=2.29.1
  Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
Collecting oau

In [2]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before
# anything below reads it.
load_dotenv()

# Imported after load_dotenv() to keep the original statement order.
import google.generativeai as genai

# Configure the client with the key stored under GCP_API_KEY
# (the lookup yields None when the variable is not set).
genai.configure(api_key=os.environ.get("GCP_API_KEY"))

In [4]:
# Handle to the "gemini-pro-vision" model; later cells pass it to the helper
# functions (get_document_metadata, get_gemini_response).
model = genai.GenerativeModel(model_name="gemini-pro-vision")

In [5]:
from IPython.display import Markdown, display
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part,
)

### Download custom Python modules and utilities

In [6]:
import os
import urllib.request
import sys

# Create the local folder for the helper scripts; exist_ok=True keeps the cell
# safe to re-run when the folder is already there.
os.makedirs("utils", exist_ok=True)

# Download the helper scripts from the repository's utils folder.
url_prefix = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/retrieval-augmented-generation/utils/"
files = ["intro_multimodal_rag_utils.py"]

for fname in files:
    # url_prefix ends with "/", so strip it before joining; otherwise the
    # request URL contains a double slash ("utils//<file>").
    source_url = f"{url_prefix.rstrip('/')}/{fname}"
    urllib.request.urlretrieve(source_url, filename=f"utils/{fname}")

### Get documents and images from GCS

In [9]:
# Download the documents and images used in this notebook by syncing the
# gs://github-repo/rag/intro_multimodal_rag path into the current directory (".").
!gsutil -m rsync -r gs://github-repo/rag/intro_multimodal_rag .
# NOTE(review): this message is printed unconditionally; it does not check that
# gsutil succeeded (in the recorded output it appears before the gsutil log).
print("Synchronization completed")

Synchronization completed


Building synchronization state...
Starting synchronization...
Copying gs://github-repo/rag/intro_multimodal_rag/text_query_answer_02.png...
/ [0/5 files][    0.0 B/823.1 KiB]   0% Done                                    
Copying gs://github-repo/rag/intro_multimodal_rag/google-10k-sample-14pages.pdf...
/ [0/5 files][    0.0 B/823.1 KiB]   0% Done                                    
Copying gs://github-repo/rag/intro_multimodal_rag/class_a_share.png...
/ [0/5 files][    0.0 B/823.1 KiB]   0% Done                                    
Copying gs://github-repo/rag/intro_multimodal_rag/tac_table_revenue.png...
/ [0/5 files][    0.0 B/823.1 KiB]   0% Done                                    
-
- [1/5 files][ 26.4 KiB/823.1 KiB]   3% Done                                    
- [2/5 files][ 95.0 KiB/823.1 KiB]  11% Done                                    
Copying gs://github-repo/rag/intro_multimodal_rag/text_query_answer_01.png...
- [2/5 files][ 95.0 KiB/823.1 KiB]  11% Done                     

### Extract and store metadata of text and images from a document

In [14]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.23.7-cp39-none-win_amd64.whl (3.5 MB)
Collecting PyMuPDFb==1.23.7
  Downloading PyMuPDFb-1.23.7-py3-none-win_amd64.whl (24.5 MB)
Installing collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.23.7 pymupdf-1.23.7


In [15]:
from utils.intro_multimodal_rag_utils import get_document_metadata

In [20]:
# Google Cloud project passed to the helper functions. The search cells further
# down reference PROJECT_ID, so it is defined once here instead of being
# repeated as an inline literal.
PROJECT_ID = "gemini-test-project-408107"

# Path of the PDF to process (downloaded by the gsutil sync cell).
pdf_path = "google-10k-sample-14pages.pdf"

# Prompt handed to get_document_metadata as image_description_prompt
# (presumably used to describe each extracted image; confirm against the helper).
# Edit it to change what the descriptions focus on.
image_description_prompt = """Explain what is going on in the image.
If it's a table, extract all elements of the table. 
If it's a graph, explain the findings in the graph.
Do not include any numbers that are not mentioned in the image:"""

# Extract text and image metadata from the PDF document.
# NOTE: the recorded run of this cell stopped on page 1 with
# DefaultCredentialsError, so Application Default Credentials must be set up
# before this cell can complete.
text_metadata_df, image_metadata_df = get_document_metadata(
    PROJECT_ID,
    model,
    pdf_path=pdf_path,
    image_save_dir="images",
    image_description_prompt=image_description_prompt,
    embedding_size=1408,
    text_emb_text_limit=1000,
)

print("--- Completed processing. ---")

Processing page: 1


DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.

In [21]:
# Preview the first rows of the text metadata returned by get_document_metadata.
# NOTE: in the recorded run this raised NameError because the extraction cell
# failed before text_metadata_df was assigned.
text_metadata_df.head()

NameError: name 'text_metadata_df' is not defined

In [None]:
# Preview the first rows of the image metadata returned by get_document_metadata.
image_metadata_df.head()

### Import the helper functions to implement RAG

In [None]:
from utils.intro_multimodal_rag_utils import (
    get_similar_text_from_query,
    print_text_to_text_citation,
    get_similar_image_from_query,
    print_text_to_image_citation,
    get_gemini_response,
    display_images,
)


### Search similar text with text query

In [None]:
# Text query used by the similarity-search cells that follow.
query = (
    "I need details for basic and diluted net income per share of "
    "Class A, Class B, and Class C share for google?"
)

In [None]:
# Match the user's text query against the "text_embedding_chunk" column of the
# text metadata, asking for the top 3 results.
matching_results_text = get_similar_text_from_query(
    PROJECT_ID,
    query,
    text_metadata_df,
    top_n=3,
    chunk_text=True,
    embedding_size=1408,
    column_name="text_embedding_chunk",
)

# Print the citation for the matched text (print_top=True).
print_text_to_text_citation(
    matching_results_text,
    chunk_text=True,
    print_top=True,
)


### Search similar images with text query

In [None]:
# Search images with the text query: image_emb=False selects the text embedding
# of each image description rather than the image embedding.
matching_results_image = get_similar_image_from_query(
    PROJECT_ID,
    text_metadata_df,
    image_metadata_df,
    query=query,
    top_n=3,
    embedding_size=1408,
    image_emb=False,
    column_name="text_embedding_from_image_description",
)

print("\n **** Result: ***** \n")

# Render the image object of the first match (index 0).
display(
    matching_results_image[0]["image_object"]
)

In [None]:
# Inspect the citation of the top match to probe further. Its "image description"
# entry is the description extracted through Gemini that the query was matched on.
Markdown(
    print_text_to_image_citation(
        matching_results_image,
        print_top=True,
    )
)

## Image Search

### Search similar image with image query

In [1]:
# Query by image rather than by text: this table picture comes from the same
# source document, and the goal is to find similar tables among the images
# recorded in the metadata.
image_query_path = "tac_table_revenue.png"

# Announce the input, then leave the loaded image as the cell's last expression
# so the notebook renders it.
print("***Input image from user:***")
Image.load_from_file(image_query_path)


***Input image from user:***


NameError: name 'Image' is not defined

In [None]:
# Image-to-image search: image_emb=True together with image_query_path makes the
# input image the basis of the similarity calculation, compared against the
# "mm_embedding_from_img_only" column.
matching_results_image = get_similar_image_from_query(
    PROJECT_ID,
    text_metadata_df,
    image_metadata_df,
    query=query,  # text query is still passed (optional additional filtering)
    image_query_path=image_query_path,
    image_emb=True,
    column_name="mm_embedding_from_img_only",
    top_n=3,
    embedding_size=1408,
)

print("\n **** Result: ***** \n")

# Render the image object of the top match (index 0).
display(matching_results_image[0]["image_object"])


In [None]:
# Show the citation details for the top match of the image search above.
print_text_to_image_citation(
    matching_results_image,
    print_top=True,
)

In [None]:
# Optional: look at more than the single top hit.
# The paths of the first two matches (indices 0 and 1) are handed to display_images.
print("---------------Matched Images------------------\n")
display_images(
    [matching_results_image[rank]["img_path"] for rank in (0, 1)],
    resize_ratio=0.8,
)

In [None]:
# Switch the query image to the Class A share picture for the comparison below.
image_query_path = "class_a_share.png"

# Announce the input, then leave the loaded image as the cell's last expression
# so the notebook renders it.
print("***Input image from user:***")
Image.load_from_file(image_query_path)

In [None]:
# Load the user's query image with Image.load_from_file
# (Image is the class imported from vertexai.preview.generative_models).
user_image_object = Image.load_from_file(image_query_path)

# Text parts of the prompt: the question to answer and the ground rules.
compare_query = """Question: How has nasdaq performed with respect to Class A and Class B shares of Google?
Answer: """
instructions = """instructions: Compare two images and base your reasoning only on the images provided.
Provide detail reasoning of your conclusions.
Images: """

# Image-to-image search seeded with the query image; no text query is passed.
image_selected_based_on_source_image = get_similar_image_from_query(
    PROJECT_ID,
    text_metadata_df,
    image_metadata_df,
    image_query_path=image_query_path,
    image_emb=True,
    column_name="mm_embedding_from_img_only",
    embedding_size=1408,
    top_n=3,
)

# Take the image object of the first result (index 0) as the comparison partner.
selected_image_object = image_selected_based_on_source_image[0]["image_object"]

# Prompt order: instructions, the two images, then the question.
model_input = [
    instructions,
    user_image_object,
    selected_image_object,
    compare_query,
]

# Ask Gemini with stream=True and render the reply as Markdown.
Markdown(
    get_gemini_response(
        model,
        model_input=model_input,
        stream=True,
    )
)


In [None]:
# Show the first image returned by the similarity search, i.e. the one that was
# used in the comparison for the user query.
Image.load_from_file(
    image_selected_based_on_source_image[0]["img_path"]
)


In [None]:
# Citation for the top image chosen by the image-to-image search.
print_text_to_image_citation(
    image_selected_based_on_source_image,
    print_top=True,
)

## Multimodal retrieval augmented generation (RAG)

In [None]:
# Text-only question for the RAG flow; no image is passed this time.
query = (
    "Question: How has nasdaq and s&p performed with respect to class A shares and class C shares?\n"
    "Which one would be better to buy and why?\n"
    "Answer: "
)

# Alternative question to try:
# query = """Question: Find the total revenues and other related financial numbers for Alphabet
# Answer: """

In [None]:
# Retrieve the text chunks relevant to the query (top_n=5), matched on the
# "text_embedding_chunk" column.
matching_results_chunks_data = get_similar_text_from_query(
    PROJECT_ID,
    query,
    text_metadata_df,
    top_n=5,
    chunk_text=True,
    embedding_size=1408,
    column_name="text_embedding_chunk",
)


In [None]:
# Retrieve the images relevant to the query (top_n=3), matched on the text
# embeddings of their descriptions (image_emb=False).
matching_results_image_fromdescription_data = get_similar_image_from_query(
    PROJECT_ID,
    text_metadata_df,
    image_metadata_df,
    query=query,
    top_n=3,
    embedding_size=1408,
    image_emb=False,
    column_name="text_embedding_from_image_description",
)


In [None]:
# Collect the text of every retrieved chunk. Only the match records are needed,
# so iterate over the values instead of unpacking unused keys; the
# comprehension also avoids leaving loop variables in the notebook namespace.
context_text = [
    match["chunk_text"] for match in matching_results_chunks_data.values()
]
final_context_text = "\n".join(context_text)

# For every retrieved image, emit a labelled run of four parts:
# "Image: ", the image object, "Caption: ", and the stored image description.
context_images = [
    part
    for match in matching_results_image_fromdescription_data.values()
    for part in (
        "Image: ",
        match["image_object"],
        "Caption: ",
        match["image_description"],
    )
]


In [None]:
# Ground rules for the answer. The wording (including the "destails" spelling) is
# runtime prompt text and is kept exactly as it was.
instructions = """The context of extraction of destails should be based on the text context given in "text_context" and Image context given in "image_context" along with its Caption: \n
Base your response on "text_context" and "image_context". Do not use any numbers or percentages that are not present in the "image_context".
Do not include any cumulative total return in the answer. Context: 
"""

# Prompt layout: question, instructions, the labelled text context, then the
# labelled image context followed by every image/caption part.
final_prompt = [
    query,
    instructions,
    "text_context:",
    "\n".join(context_text),
    "image_context:",
    *context_images,
]


In [None]:
# Send the assembled prompt to Gemini (stream=True) and render the reply as Markdown.
Markdown(
    get_gemini_response(
        model,
        model_input=final_prompt,
        stream=True,
    )
)

In [None]:
# Show the first two images (indices 0 and 1) retrieved for the RAG query.
print("---------------Matched Images------------------\n")
display_images(
    [
        matching_results_image_fromdescription_data[rank]["img_path"]
        for rank in (0, 1)
    ],
    resize_ratio=0.8,
)


In [None]:
# Image citations: use these to check how the Gemini-generated metadata helped
# ground the answer. Called with print_top=False.
print_text_to_image_citation(
    matching_results_image_fromdescription_data,
    print_top=False,
)

In [None]:
# Text citations for the chunks retrieved for the RAG query (print_top=False).
print_text_to_text_citation(
    matching_results_chunks_data,
    chunk_text=True,
    print_top=False,
)
