Copyright 2024 Google, LLC. This software is provided as-is,
without warranty or representation for any use or purpose. Your
use of it is subject to your agreement with Google.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Example Agent Workflow using Google's ADK

This notebook provides an example of building an agentic workflow with Google's new ADK. For more information please visit https://google.github.io/adk-docs/

# Multi-Agent Nest Support System with Vertex AI RAG and ADK

This notebook sets up the Vertex AI RAG Engine corpus and the BigQuery dataset used in this ADK workshop.

## 0. Install Dependencies

Install the necessary libraries for Vertex AI, Google Cloud, ADK, and HTTP requests.

## 1. Import Libraries

In [None]:
# Vertex AI Modules
import vertexai
from vertexai.preview import rag

# Vertex AI Platform Modules
from google.cloud import aiplatform_v1beta1 as aiplatform # This module helps parse the info for delete_rag_corpa function

# Google Cloud Storage
from google.cloud import storage

#Google BigQuery
from google.cloud import bigquery


# Other Python Modules
import os
from typing import List, Dict, TypedDict, Any
import json
from urllib.parse import urlparse
import warnings
import logging

# Reaching this line means none of the imports above raised.
print("Libraries imported successfully.")

Ignore warning messages

In [None]:
# Suppress every Python warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Ask for ERROR-level root logging. Note: basicConfig() does nothing if the
# root logger already has handlers configured.
logging.basicConfig(level=logging.ERROR)

## 2. Configuration

**Important:** Update the `project_id`, `corpa_document_bucket`, `bq_data_bucket`, `support_documents`, `bq_data`, and `ticket_server_url` variables below with your specific values.

In [None]:
project_id = "YOUR_PROJECT_ID" # Your GCP Project ID
location = "global" # You can leave this setting as global
region = "us-central1" # Your region. This notebook has only been tested in us-central1

corpa_name = "nest-rag-corpus" # Display name of your RAG Engine corpus; also used to look up an existing corpus
# Fully qualified BigQuery identifiers derived from project_id:
# "<project>.<dataset>" and "<project>.<dataset>.<table>"
bq_dataset_id = f"{project_id}.Product_Inventory"
bq_table_id = f"{bq_dataset_id}.product_data"

# Both GCS paths are written with a trailing "/" so that file names can be appended to them.
corpa_document_bucket = "gs://YOUR_BUCKET_ID/nest/docs/" # The GCS path to the files you want to ingest into your RAG Engine corpus
bq_data_bucket = "gs://YOUR_BUCKET_ID/wip/bq_import/" # The GCS path to the files you want to ingest into your BQ datastore

support_documents = "./nest_docs/" # Local directory containing Nest support files to copy
bq_data = "./bq_data/" # Local directory containing BQ data files

ticket_server_url = "http://localhost:8001" # The url to the mock ticket system. This will be a GCE VM running the ticket_server.py web service.

## 3. Environment Setup and Vertex AI Initialization

Set the environment variables used by the Google libraries and initialize the Vertex AI SDK.

In [None]:
# Environment variables for the Google libraries.
# NOTE(review): GOOGLE_GENAI_USE_VERTEXAI presumably switches the GenAI/ADK
# libraries to the Vertex AI backend - confirm against their documentation.
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "1"
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
# The location is set from `region` (not the `location` variable).
os.environ["GOOGLE_CLOUD_LOCATION"] = region

In [None]:
# Initialize the Vertex AI SDK with the configured project and region.
vertexai.init(project=project_id, location=region)

## 4. Google Cloud Storage Setup

The two helpers below prepare Cloud Storage: `create_bucket` checks whether the specified GCS bucket and folder exist and creates them if necessary, and `upload_files` uploads the documents from a local directory.

In [None]:
def create_bucket(bucket_path: str):
    """Ensure the bucket and folder prefix named by a gs:// path exist.

    The bucket name is taken from the URI's network location and the folder
    prefix from its path. A missing bucket is created. When a prefix is
    present, an empty ".placeholder" object is written under it (unless one
    is already there) so the folder shows up in GCS.

    Args:
        bucket_path: GCS URI such as "gs://my-bucket/some/folder/".
    """
    uri_parts = urlparse(bucket_path)
    name = uri_parts.netloc
    folder = uri_parts.path.lstrip('/')

    gcs_bucket = storage.Client().bucket(name)

    # Bucket: report if present, otherwise create it.
    if gcs_bucket.exists():
        print(f"Bucket '{name}' already exists.")
    else:
        gcs_bucket.create()
        print(f"Bucket '{name}' created successfully.")

    # Nothing more to do when the URI names only a bucket.
    if not folder:
        return

    # GCS has no real folders; a marker object under the prefix simulates one.
    if not folder.endswith('/'):
        folder = f"{folder}/"
    marker = gcs_bucket.blob(folder + ".placeholder")
    if marker.exists():
        print(f"Simulated folder '{bucket_path}' already exists.")
    else:
        marker.upload_from_string('')
        print(f"Simulated folder '{bucket_path}' created.")

In [None]:
def upload_files(folder_path: str, bucket_path: str):
    """Upload every regular file in a local directory to a GCS folder.

    Only the files directly inside ``folder_path`` are uploaded;
    sub-directories are skipped. Each file is stored as
    ``<prefix>/<filename>`` in the bucket named by ``bucket_path``.

    Args:
        folder_path: Local directory whose files should be uploaded.
        bucket_path: Destination GCS URI, e.g. "gs://my-bucket/some/folder/".
            A trailing "/" on the folder is optional.

    Returns:
        None. Progress is reported with print().
    """
    # Validate the local side first so that no GCS client is created (and no
    # credentials are needed) when there is nothing to upload.
    if not os.path.isdir(folder_path):
        print(f"Local directory '{folder_path}' does not exist or is not a directory.")
        return

    parsed_uri = urlparse(bucket_path)
    bucket_name = parsed_uri.netloc
    prefix = parsed_uri.path.lstrip('/')
    # Normalize the prefix the same way create_bucket() does; without this,
    # "gs://b/docs" would produce objects named "docs<filename>".
    if prefix and not prefix.endswith('/'):
        prefix = f"{prefix}/"

    bucket = storage.Client().bucket(bucket_name)

    # Sorted for a stable, reproducible upload order and log output.
    for filename in sorted(os.listdir(folder_path)):
        local_file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(local_file_path):
            continue
        gcs_blob_name = f"{prefix}{filename}"
        bucket.blob(gcs_blob_name).upload_from_filename(local_file_path)
        print(f"Uploaded '{local_file_path}' to 'gs://{bucket_name}/{gcs_blob_name}'")

Create the GCS buckets for the support documents and BQ data files

In [None]:
# Bucket/folder that will hold the support documents for the RAG corpus.
create_bucket(corpa_document_bucket)

In [None]:
# Bucket/folder that will hold the data files loaded into BigQuery.
create_bucket(bq_data_bucket)

Upload the Nest support documents and BQ data files to GCS

In [None]:
# Copy the local Nest support files to the RAG source folder in GCS.
upload_files(support_documents, corpa_document_bucket)

In [None]:
# Copy the local BigQuery data files to the BQ import folder in GCS.
upload_files(bq_data, bq_data_bucket)

## 5. Helper Functions

Define functions for interacting with the agent, managing RAG corpora, and defining tools.

In [None]:
# @title Define RAG Corpus Management Functions

# NOTE: The delete function is defined but not used in the main flow.
# It can be useful for cleanup. It expects a pager object, which is
# typically obtained from `rag.list_corpora()`.
def delete_rag_corpora(rag_corpora_pager: aiplatform.services.vertex_rag_data_service.pagers.ListRagCorporaPager):
    """
    Deletes all RAG corpora listed in the provided pager object.
    USE WITH CAUTION! THIS WILL PERMANENTLY DELETE CORPORA.

    The work is done in two passes: first every corpus resource name is
    collected from the pager, then each one is deleted. A failure while
    listing aborts before anything is deleted; a failure while deleting one
    corpus is reported and the remaining corpora are still attempted.

    Args:
        rag_corpora_pager: The pager object from rag.list_corpora().

    Returns:
        None. Progress and a final deleted/failed summary are printed.
    """
    names_list = []
    print("Identifying corpora to delete...")
    try:
        for rag_corpus_obj in rag_corpora_pager: # Iterate through the actual RagCorpus objects
            if hasattr(rag_corpus_obj, 'name'):
                print(f" - Found corpus: {rag_corpus_obj.display_name} ({rag_corpus_obj.name})")
                names_list.append(rag_corpus_obj.name)
            else:
                print(f" - Skipping object without a 'name' attribute: {rag_corpus_obj}")
    except Exception as e:
        print(f"Error iterating through corpora pager: {e}")
        return # Stop if we can't list them properly

    if not names_list:
        print("No corpora found to delete.")
        return

    print("\nStarting deletion process...")
    deleted_count = 0
    failed_count = 0
    for corpus_name_to_delete in names_list:
        print(f"Attempting to delete corpus: {corpus_name_to_delete}...")
        try:
            # rag.delete_corpus() is called with the resource name only, the
            # same way the cleanup cell calls it. The previous extra
            # `force=True` keyword is not part of its signature, so every
            # call raised TypeError and was counted as a failed deletion.
            rag.delete_corpus(name=corpus_name_to_delete)
            print(f"  Successfully deleted {corpus_name_to_delete}")
            deleted_count += 1
        except Exception as e:
            # Best effort: report the failure and keep going with the rest.
            print(f"  Failed to delete {corpus_name_to_delete}: {e}")
            failed_count += 1
    print(f"\nDeletion complete. Deleted: {deleted_count}, Failed: {failed_count}")


def create_rag_corpora(display_name, source_bucket):
    """Create a RAG corpus and import documents into it from GCS.

    Args:
        display_name: Display name given to the new corpus.
        source_bucket: GCS path passed to rag.import_files() as the single
            import source.

    Returns:
        The corpus object returned by rag.create_corpus().
    """
    EMBEDDING_MODEL = "publishers/google/models/text-embedding-004"  # @param {type:"string", isTemplate: true}

    new_corpus = rag.create_corpus(
        display_name=display_name,
        embedding_model_config=rag.EmbeddingModelConfig(publisher_model=EMBEDDING_MODEL),
    )

    # Import from GCS. Local files could instead be added one at a time with
    # rag.upload_file(corpus_name=..., path=..., display_name=..., description=...).
    rag.import_files(
        corpus_name=new_corpus.name,
        paths=[source_bucket],
        chunk_size=1024,  # Optional
        chunk_overlap=100,  # Optional
        max_embedding_requests_per_min=900,  # Optional
    )

    return new_corpus

    

# Reaching this line means the function definitions above were accepted.
print("RAG corpus management functions defined.")

In [None]:
def get_gcs_uri(query: str) -> str:
    """
    Retrieves Google Cloud Storage (GCS) URIs for documents relevant to a given query.

    This function queries a pre-configured Retrieval-Augmented Generation (RAG)
    corpus to find documents related to the input query string. It extracts
    the source GCS URIs from the top relevant documents identified by the
    RAG system based on semantic similarity. Use this function when you need
    to find the source files in GCS that contain information related to a
    specific question or topic.

    The corpus queried is the module-level ``rag_corpus`` variable, which must
    have been set by the RAG corpus setup step before this function is called.

    Args:
        query: str - The natural language query or topic to search for within
                 the RAG corpus. For example: "What were the Q3 sales figures?"
                 or "Tell me about project Alpha's latest status".

    Returns:
         str - A JSON string representing a sorted list of unique GCS URIs. These
               URIs point to the source documents found to be relevant to the query.
               Returns a JSON string representing an empty list ('[]') if no
               relevant documents meet the similarity criteria.
               Example return value: '["gs://my-bucket/doc1.pdf", "gs://my-bucket/report_q3.txt"]'
    """
    query_response = rag.retrieval_query(
        rag_resources=[
            rag.RagResource(
                rag_corpus=rag_corpus.name,
                # Optional: supply IDs from `rag.list_files()`.
                # rag_file_ids=["rag-file-1", "rag-file-2", ...],
            )
        ],
        text=f'''
        {query}
        ''',
        similarity_top_k=10,  # Optional
        vector_distance_threshold=0.5,  # Optional
    )

    # Several retrieved chunks can come from the same file; a set removes the
    # duplicates.
    unique_uris = {context.source_uri for context in query_response.contexts.contexts}

    # Set iteration order is not stable between runs, so sort before
    # serializing to return the same JSON for the same retrieval result.
    return json.dumps(sorted(unique_uris))

## 6. RAG Corpus Setup

Check if the RAG Corpus configured in step 2 exists. If not, create it and initiate file import.

In [None]:
# List the RAG corpora that already exist.
existing_corpora = rag.list_corpora()

# Show the raw listing result for inspection.
print(existing_corpora)

# Variable to hold the corpus if found
found_corpus = None

In [None]:
# Find the corpus whose display name matches `corpa_name`, or create it.
#
# The listing is requested again here and the pager itself is iterated, so
# every page of results is examined. Iterating the pager's `.rag_corpora`
# attribute only exposes the page the pager currently holds, which can miss
# an existing corpus and lead to a duplicate being created.
# Resetting `found_corpus` makes the cell safe to re-run on its own.
found_corpus = None
for corpus in rag.list_corpora():
    # Check if display_name exists and matches
    if getattr(corpus, 'display_name', None) != corpa_name:
        continue

    print(f"Existing Corpa found. Using {corpus.name}")
    found_corpus = corpus # The listed object is used directly; no extra get_corpus() call

    print(f"This corpus contains the following files:")
    try:
        # List files associated with the found corpus
        for file in rag.list_files(corpus.name):
            print(getattr(file, 'display_name', 'N/A')) # Safer access
    except Exception as e:
        # Listing files is informational only; keep using the corpus.
        print(f"Warning: Could not list files for {corpus.name}. Error: {e}")

    break # Exit the loop as soon as we find the match

if found_corpus is None:
    # No corpus with the configured display name exists yet
    print(f"No existing {corpa_name} resource found. Creating one now.")
    try:
        rag_corpus = create_rag_corpora(corpa_name, corpa_document_bucket)
        print(f"New RAG corpus created at {rag_corpus.name}")
    except Exception as e:
        print(f"Error creating corpus {corpa_name}: {e}")
        rag_corpus = None # Indicate failure
else:
    rag_corpus = found_corpus # Assign the found corpus to the main variable

# 'rag_corpus' now holds either the found or newly created corpus
# (or None if creation failed)
if rag_corpus:
    print(f"\nProceeding with corpus: {rag_corpus.name}")
else:
    print(f"\nFailed to find or create corpus '{corpa_name}'. Cannot proceed.")

In [None]:
# Smoke test: run one sample query against the corpus and show the JSON list
# of source URIs that comes back. Requires `rag_corpus` to be set.
test = get_gcs_uri('How do I install a Nest E thermostat')
print(test)

## 7. Create BQ Dataset

Create the BQ Dataset and product_data table for the BQ Agent example

In [None]:
# Construct a BigQuery client object.
# No project is passed explicitly, so the client's default project is used.
# NOTE(review): presumably resolved from GOOGLE_CLOUD_PROJECT set earlier - confirm.
client = bigquery.Client()

Create the BQ Dataset

In [None]:
# Describe the dataset to create; the ID is the fully qualified
# "<project>.<dataset>" string from the configuration cell.
dataset = bigquery.Dataset(bq_dataset_id)

dataset.location = "US"
# exists_ok=True makes this cell safe to re-run: an already existing dataset
# is returned instead of raising a Conflict error. The message below is
# therefore also printed when the dataset was already there.
dataset = client.create_dataset(dataset, exists_ok=True, timeout=30)
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

Create the BQ table

In [None]:
# Load the Avro export from GCS into the product_data table.
# WRITE_TRUNCATE replaces the table contents on every run; with the default
# (append) behaviour, re-running this cell would duplicate every row.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.AVRO,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
# Join with exactly one "/" whether or not bq_data_bucket ends with a slash.
uri = f"{bq_data_bucket.rstrip('/')}/product_data.avro"

load_job = client.load_table_from_uri(
    uri, bq_table_id, job_config=job_config
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

# Re-read the table metadata to report how many rows it now holds.
destination_table = client.get_table(bq_table_id)
print("Loaded {} rows.".format(destination_table.num_rows))

## 8. Cleanup (Optional)

If you want to delete the RAG Corpus, BQ table, and BQ Dataset created during this session, set `confirm_delete = True` in the following cell and run it. **Warning:** This permanently deletes the corpus and its indexed data, as well as the BigQuery table and dataset.

In [None]:
# @title Delete the RAG Corpus (USE WITH CAUTION!)

# Set this flag to True only if you are sure you want to delete the corpus,
# the BigQuery table and the BigQuery dataset.
confirm_delete = False

if confirm_delete:
    # Each resource is deleted in its own try block so that one failure does
    # not prevent the remaining cleanup steps from running.

    # rag_corpus is None when the corpus could not be found or created; skip
    # that step instead of crashing so the BigQuery cleanup below still runs.
    if rag_corpus is None:
        print(f"No RAG Corpus available for '{corpa_name}'; skipping corpus deletion.")
    else:
        print(f"Attempting to delete RAG Corpus: {rag_corpus.name} ({corpa_name})")
        try:
            rag.delete_corpus(name=rag_corpus.name)
            print(f"Successfully deleted corpus: {rag_corpus.name}")
        except Exception as e:
            print(f"Failed to delete corpus {rag_corpus.name}: {e}")

    print(f"Attempting to delete BQ Table: {bq_table_id}")
    try:
        client.delete_table(bq_table_id, not_found_ok=True)
        print("Deleted table '{}'.".format(bq_table_id))
    except Exception as e:
        print(f"Failed to delete BQ table {bq_table_id}: {e}")

    print(f"Attempting to delete BQ Dataset: {bq_dataset_id}")
    try:
        client.delete_dataset(bq_dataset_id, delete_contents=True, not_found_ok=True)
        print("Deleted dataset '{}'.".format(bq_dataset_id))
    except Exception as e:
        print(f"Failed to delete BQ dataset {bq_dataset_id}: {e}")

else:
    print("Skipping deletion: confirm_delete is set to False.")

**WARNING**: Uncomment this next line to delete ALL existing RAG Engine corpora within this project.

ONLY USE THIS OPTION IN A NON PRODUCTION ENVIRONMENT!

In [None]:
#delete_rag_corpora(rag.list_corpora()) # This option will delete ALL RAG Engine instances.