## Build a Generative AI solution using a RAG Framework: Challenge Lab L400

In [1]:
!pip install --upgrade --user google-cloud-aiplatform google-cloud-storage firebase-admin
!pip install langchain_community
!pip install google-cloud-aiplatform
!pip install google-cloud-storage
!pip install langchain
!pip install pymuPDF

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.63.0-py2.py3-none-any.whl.metadata (31 kB)
Collecting google-cloud-storage
  Downloading google_cloud_storage-2.18.2-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting firebase-admin
  Downloading firebase_admin-6.5.0-py3-none-any.whl.metadata (1.5 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.34.1->google-cloud-aiplatform)
  Downloading google_api_core-2.19.1-py3-none-any.whl.metadata (2.7 kB)
Collecting google-resumable-media>=2.7.2 (from google-cloud-storage)
  Downloading google_resumable_media-2.7.2-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting cachecontrol>=0.12.6 (from firebase-admin)
  Downloading cachecontrol-0.14.0-py3-none-any.whl.metadata (3.1 kB)
Collecting google-cloud-firestore>=2.9.1 (from firebase-admin)
  Downloading googl

In [2]:
import IPython
from IPython.display import Markdown, display
import time

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
# get project ID
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1"

# generate an unique id for this session
from datetime import datetime

UID = datetime.now().strftime("%m%d%H%M")

In [3]:
! gcloud services enable run.googleapis.com compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com --project "{PROJECT_ID}"

Operation "operations/acf.p2-885281464405-89a00720-0de8-405a-9534-955e7cb24322" finished successfully.


In [4]:
import requests

# Define the URL of the PDF file to be downloaded
url = "https://www.nyc.gov/assets/doh/downloads/pdf/rii/fpc-manual.pdf"

# Send a GET request to the URL to download the PDF file
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Define the filename for the downloaded PDF
    pdf_filename = "fpc-manual.pdf"

    # Open the file in write-binary mode and save the content of the response
    with open(pdf_filename, 'wb') as file:
        file.write(response.content)

    # Print a success message
    print(f"PDF downloaded successfully as {pdf_filename}")
else:
    # Print an error message if the request was not successful
    print(f"Failed to download PDF. Status code: {response.status_code}")


PDF downloaded successfully as fpc-manual.pdf


In [25]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pdf_document = "fpc-manual.pdf"

loader = PyMuPDFLoader(pdf_document)
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents([loader])
print(texts[0])
print(texts[1])

AttributeError: 'Document' object has no attribute 'get_text'

In [13]:
import fitz  # PyMuPDF

# Define the path to the PDF document
pdf_document = "fpc-manual.pdf"

# Open the PDF document using PyMuPDF
pdf = fitz.open(pdf_document)

# Initialize a list to store the content of each page
pages_content = []

# Loop through each page in the PDF document
for page_num in range(len(pdf)):
    # Get the current page
    page = pdf[page_num]
    # Extract the text from the current page and store it in the list
    pages_content.append(page.get_text())
    
# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=1000):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

for index, content in enumerate(pages_content):
    doc_id = str(index + 1)

    # Split the content into chunks
    chunks = split_text_into_chunks(content)

    # Create a document reference
    doc_ref = db.collection(collection_name).document(doc_id)

    try:
        # Store each chunk in Firestore
        for chunk_index, chunk in enumerate(chunks):
            chunk_doc_id = f"{doc_id}_chunk_{chunk_index + 1}"
            chunk_doc_ref = db.collection(collection_name).document(chunk_doc_id)
            chunk_doc_ref.set({"content": chunk})
            print(f"Stored chunk {chunk_index + 1} of page {index + 1} in Firestore with document ID {chunk_doc_id}")
    except Exception as e:
        # Print an error message if the operation fails
        print(f"Failed to store chunks of page {index + 1} in Firestore. Error: {e}")

# Close the PDF document
pdf.close()
print(len(pages_content))

Stored chunk 1 of page 1 in Firestore with document ID 1_chunk_1
Stored chunk 2 of page 1 in Firestore with document ID 1_chunk_2
Stored chunk 3 of page 1 in Firestore with document ID 1_chunk_3
Stored chunk 4 of page 1 in Firestore with document ID 1_chunk_4
Stored chunk 5 of page 1 in Firestore with document ID 1_chunk_5
Stored chunk 1 of page 2 in Firestore with document ID 2_chunk_1
Stored chunk 2 of page 2 in Firestore with document ID 2_chunk_2
Stored chunk 3 of page 2 in Firestore with document ID 2_chunk_3
Stored chunk 4 of page 2 in Firestore with document ID 2_chunk_4
Stored chunk 1 of page 3 in Firestore with document ID 3_chunk_1
Stored chunk 2 of page 3 in Firestore with document ID 3_chunk_2
Stored chunk 3 of page 3 in Firestore with document ID 3_chunk_3
Stored chunk 4 of page 3 in Firestore with document ID 3_chunk_4
Stored chunk 1 of page 4 in Firestore with document ID 4_chunk_1
Stored chunk 2 of page 4 in Firestore with document ID 4_chunk_2
Stored chunk 3 of page 4 

In [9]:
from google.cloud import firestore

# Initialize Firestore
db = firestore.Client(project=PROJECT_ID)

# Firestore collection name
collection_name = "food_safety"


for index, content in enumerate(pages_content):
    id = str(index + 1)

    # Create a document reference
    ref = db.collection(collection_name).document(id)

    try:
        # Set the document with the content
        ref.set({"content": content})
        print(f"Stored page {index + 1} in Firestore with ID {id}")
    except Exception as e:
        # Print an error message if the operation fails
        print(f"Failed to store page {index + 1} in Firestore. Error: {e}")

I0000 00:00:1724290357.526511     760 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


Stored page 1 in Firestore with ID 1
Stored page 2 in Firestore with ID 2
Stored page 3 in Firestore with ID 3
Stored page 4 in Firestore with ID 4
Stored page 5 in Firestore with ID 5
Stored page 6 in Firestore with ID 6
Stored page 7 in Firestore with ID 7
Stored page 8 in Firestore with ID 8
Stored page 9 in Firestore with ID 9
Stored page 10 in Firestore with ID 10
Stored page 11 in Firestore with ID 11
Stored page 12 in Firestore with ID 12
Stored page 13 in Firestore with ID 13
Stored page 14 in Firestore with ID 14
Stored page 15 in Firestore with ID 15
Stored page 16 in Firestore with ID 16
Stored page 17 in Firestore with ID 17
Stored page 18 in Firestore with ID 18
Stored page 19 in Firestore with ID 19
Stored page 20 in Firestore with ID 20
Stored page 21 in Firestore with ID 21
Stored page 22 in Firestore with ID 22
Stored page 23 in Firestore with ID 23
Stored page 24 in Firestore with ID 24
Stored page 25 in Firestore with ID 25
Stored page 26 in Firestore with ID 26
Stor

In [30]:
# Example text data (replace with your actual text data)
texts = [
    "This is a sample text for embedding.",
    "Another example text to generate embeddings."
]

# Assuming you have a model to generate embeddings (this is just a placeholder)
# You need to define your actual model here, e.g., using a PredictionServiceClient
embeddings = [
    [0.1, 0.2, 0.3],  # Example embedding for the first text
    [0.4, 0.5, 0.6]   # Example embedding for the second text
]

import json

# Create the JSON-L file
jsonl_filename = "embeddings.jsonl"
with open(jsonl_filename, 'w') as f:
    for i, embedding in enumerate(embeddings):
        doc_id = str(i + 1)
        entry = {"id": doc_id, "embedding": embedding}
        f.write(json.dumps(entry) + "\n")
    print(f"JSON-L file created: {jsonl_filename}")

JSON-L file created: embeddings.jsonl


In [31]:
from google.cloud import storage

# Initialize a storage client
client = storage.Client()

# Upload the JSON-L file
bucket_name = PROJECT_ID
bucket = client.bucket(bucket_name)
blob = bucket.blob(jsonl_filename)
blob.upload_from_filename(jsonl_filename)

print(f"{jsonl_filename} uploaded to {bucket_name}.")

embeddings.jsonl uploaded to qwiklabs-gcp-02-8ea1f4c7128f.


In [26]:
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel

# Initialize Vertex AI with your project and location
aiplatform.init(project=PROJECT_ID, location=LOCATION)

# Load the Text Embedding Model
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@002")

In [32]:
BUCKET_URI = f"gs://{PROJECT_ID}/embeddings.json"

In [33]:
! gsutil mb -l "$LOCATION" -p "$PROJECT_ID" "$BUCKET_URI"
! gsutil cp "gs://github-repo/data/vs-quickstart/product-embs.json" "$BUCKET_URI"

I0000 00:00:1724292323.484569     760 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork


CommandException: The mb command requires a URL that specifies a bucket.
"gs://qwiklabs-gcp-02-8ea1f4c7128f/embeddings.json" is not valid.


I0000 00:00:1724292326.466466     760 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork


Copying gs://github-repo/data/vs-quickstart/product-embs.json [Content-Type=application/json]...
- [1 files][ 79.3 MiB/ 79.3 MiB]                                                
Operation completed over 1 objects/79.3 MiB.                                     


In [34]:
# create Index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"assessment-index-endpoint",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=10,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/885281464405/locations/us-central1/indexes/1335541589887418368/operations/769669306358169600
MatchingEngineIndex created. Resource name: projects/885281464405/locations/us-central1/indexes/1335541589887418368
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/885281464405/locations/us-central1/indexes/1335541589887418368')


In [37]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"assessment-index-endpoint", public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/885281464405/locations/us-central1/indexEndpoints/4979938850848636928/operations/2039684401276649472
MatchingEngineIndexEndpoint created. Resource name: projects/885281464405/locations/us-central1/indexEndpoints/4979938850848636928
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/885281464405/locations/us-central1/indexEndpoints/4979938850848636928')


In [38]:
DEPLOYED_INDEX_ID = f"assessment_index_endpoint"

In [None]:
my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/885281464405/locations/us-central1/indexEndpoints/4979938850848636928
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/885281464405/locations/us-central1/indexEndpoints/4979938850848636928/operations/5016563754968547328
