In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# **DIY RAG APIs- Vertex AI  builder APIs**

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/language/examples/reference-architectures/question_answering_with_large_documents.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/examples/reference-architectures/question_answering_with_large_documents.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/blob/main/language/examples/reference-architectures/question_answering_with_large_documents.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

### DIY RAG

In this notebook, we show you how to use Vertex AI Builder APIs for RAG to build a custom search solution on your own documents.

Building a robust custom (DIY) Retrieval Augmented Generation (RAG) system for grounding can be challenging. Vertex AI simplifies the process with a suite of flexible standalone APIs to help your create your own search solutions.

**Document AI Layout Parser:** Transforms documents into structured representations, making content easily accessible. Creates context-aware chunks for improved information retrieval in generative AI and discovery applications.

**Ranking API:** Re-ranks search results based on relevance to the original query. Enhances RAG accuracy by optimizing retrieval beyond initial nearest neighbor search.

**Check Grounding API:** Acts as a "validator" to determine whether statements or claims are supported by provided facts (essentially how grounded a given piece of text is in a given set of reference text). Enables online flagging of ungrounded responses and offline evaluation of generative responses.

## Getting Started

### Install Vertex AI SDK & Other dependencies

In [None]:
! pip install google-cloud-aiplatform --upgrade --quiet
! pip install google-cloud-discoveryengine --upgrade --quiet
! pip install google-cloud-documentai google-cloud-documentai-toolbox --upgrade --quiet
! pip install google-cloud-storage --upgrade --quiet
!pip install backoff==1.11.1
%pip install --upgrade --quiet  langchain langchain-google-vertexai "langchain-google-community[featurestore]"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.1/319.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.1/43.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.5/467.5 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00

**Colab only**: Run the following cell to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top.

In [None]:
import sys
if "google.colab" in sys.modules:
    # Automatically restart kernel after installs so that your environment can access the new packages
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
else:
    # Otherwise, attempt to discover local credentials as described on https://cloud.google.com/docs/authentication/application-default-credentials
    pass

## Colab Only
You will need to run the following cell to authenticates your Colab environment with your Google Cloud account.

In [None]:
from google.colab import auth
auth.authenticate_user()

### Enable the APIs in your project

In [None]:
from __future__ import annotations
import backoff
from tenacity import retry, stop_after_attempt, wait_random_exponential
from google.api_core.exceptions import ResourceExhausted
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import AlreadyExists
from google.cloud import documentai
import numpy as np
import glob
import os
import textwrap
from typing import Dict, List
import pandas as pd
from logging import error
import re
import textwrap
from typing import Tuple, List
import vertexai
from vertexai.generative_models import GenerationConfig, GenerativeModel, Image, Part
from vertexai.language_models import TextEmbeddingModel
import vertexai
from google.cloud import documentai
from google.cloud import discoveryengine

In [None]:
!gcloud config set project ""

!gcloud services enable documentai.googleapis.com storage.googleapis.com aiplatform.googleapis.com

[1;31mERROR:[0m (gcloud.config.set) value for field [projectId] in collection [cloudresourcemanager.projects] is required but was not provided
[1;31mERROR:[0m (gcloud.services.enable) The required property [project] is not currently set.
It can be set on a per-command basis by re-running your command with the [--project] flag.

You may set it for your current workspace by running:

  $ gcloud config set project VALUE

or it can be set temporarily by the environment variable [CLOUDSDK_CORE_PROJECT]


### Preparing data files

To begin, you will need to download PDFs for the summarizing tasks below.
For this notebook, you will be using Alphabet earnings report PDFs hosted in a public Google Cloud Storage bucket.

In [None]:
# Copying the files from the GCS bucket to local storage
!gsutil -m cp -r gs://github-repo/documents/docai .

Copying gs://github-repo/documents/docai/20230203_alphabet_10K-pages-1-14-compressed.pdf...
Copying gs://github-repo/documents/docai/20230426_alphabet_10Q-pages-1-14.pdf...
Copying gs://github-repo/documents/docai/documents_20220202_alphabet_10K.pdf...


### Create Document AI OCR Processor

A [Document AI processor](https://cloud.google.com/document-ai/docs/overview#dai-processors) is an interface between a document file and a machine learning model that performs document processing actions. They can be used to classify, split, parse, or analyze a document. Each Google Cloud project needs to create its own processor instances.

There are two types of Document AI processors:

* Pre-trained processors: These processors are pre-trained on a large dataset of documents and can be used to perform common document processing tasks, such as Optical Character Recognition (OCR), form parsing, and entity extraction.
* Custom processors: These processors can be trained on your own dataset of documents to perform specific tasks that are not covered by the pre-trained processors.

Refer to [Full processor and detail list](https://cloud.google.com/document-ai/docs/processors-list) for all supported processors.

Processors take a PDF or image file as input and output the data in the [`Document`](https://cloud.google.com/document-ai/docs/reference/rest/v1/Document) format.

### Create a processor

Set these variables below:

1. Enter `YOUR_PROJECT_ID` in project_id
2. Enter `YOUR_PROCESSOR_DISPLAY_NAME` for `processor_display_name`. For example `processor_display_name = "my processor"`, before running the code below.

Note:  Run this code only once to create the processor.
You cannot create multiple processors with the same display name. If you receieve an error, change the name of the processor and rerun.

In [None]:
# TODO(develokaggper): Edit these variables before running the code.
project_id = ""

# See https://cloud.google.com/document-ai/docs/regions for all options.
location = "us"

# Must be unique per project, e.g.: "My Processor"
processor_display_name = "my-layoutprocessor"

# You must set the `api_endpoint` if you use a location other than "us".
client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")


def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the location
    # e.g.: projects/project_id/locations/location
    parent = client.common_location_path(project_id, location)

    # Create a processor
    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="LAYOUT_PARSER_PROCESSOR"
        ),
    )


try:
    processor = create_processor(project_id, location, processor_display_name)
    print(f"Created Processor {processor.name}")
except AlreadyExists as e:
    print(
        f"Processor already exits, change the processor name and rerun this code. {e.message}"
    )

Created Processor projects/474775107710/locations/us/processors/55eae21ba8befb69


### Process the documents

Process document takes the processor name and file path of the document and extracts the text from the document.

In [None]:
def process_document(
    processor_name: str,
    file_path: str,
    chunk_size: int = 1000,  # Add chunk_size parameter
    include_ancestor_headings: bool = True  # Add include_ancestor_headings paramet
) -> documentai.Document:
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)
    # Configure chunking options
    process_options = documentai.ProcessOptions( #Removed extra space here
        layout_config=documentai.ProcessOptions.LayoutConfig(
            chunking_config=documentai.ProcessOptions.LayoutConfig.ChunkingConfig(
                chunk_size=chunk_size,
                include_ancestor_headings=include_ancestor_headings,
            )
        )
    )
    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = documentai.RawDocument(
        content=image_content, mime_type="application/pdf"
    )

    # Configure the process request
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document,process_options=process_options)


    result = client.process_document(request=request) # Changed process_options to process_document

    return result.document

In [None]:


# If you already have a Document AI Processor in your project, assign the full processor resource name here.
processor_name = processor.name
chunk_size = 5000
extracted_data: List[Str] = []

# Loop through each PDF file in the "docai" directory.
for path in glob.glob("docai/*.pdf"):
    # Extract the file name and type from the path.
    file_name, file_type = os.path.splitext(path)

    print(f"Processing {file_name}")

    # Process the document.
    document = process_document(processor_name, file_path=path)

    for page in document.pages:
        print(page.page_content)
    for chunk in document.chunked_document.chunks:
        #print(chunk.content)
        extracted_data.append(chunk.content)



Processing docai/20230203_alphabet_10K-pages-1-14-compressed
Processing docai/documents_20220202_alphabet_10K
Processing docai/20230426_alphabet_10Q-pages-1-14


In [None]:
print(extracted_data)

['(Mark One)☑☐\n\n# UNITED STATES SECURITIES AND EXCHANGE COMMISSION\n\nWashington, D.C. 20549\n\n# FORM 10-K\n\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\n## For the fiscal year ended December 31, 2022\n\nORTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the transition period fromtoCommission file number: 001-37580', '# Alphabet Inc.\n\n(Exact name of registrant as specified in its charter)Delaware61-1767919(State or other jurisdiction of incorporation or organization)(I.R.S. Employer Identification No.)1600 Amphitheatre Parkway Mountain View, CA 94043(Address of principal executive offices, including zip code)(650) 253-0000(Registrant\'s telephone number, including area code) Securities registered pursuant to Section 12(b) of the Act:\n\n|-|-|-|\n| Title of each class | Trading Symbol(s) | Name of each exchange on which registered |\n| Class A Common Stock, $0.001 par value | GOOGL | Nasdaq Stock

## Question Answering and Summarization

You have just used Document AI to extract text from PDF files.

In the next section, you will perform question answering and summarization on the extracted text using the Gemini and Gecko Embedding model with Vertex AI.

1. Represent the documents that contain the answer as vectors using the Gecko embedding model.
2. Compare the question to the document vector store.
3. The document with the most similar vector to the question is the document that contains the answer.

### Authenticating your notebook environment
* If you are using **Colab** to run this notebook, run the cell below and continue.
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env).

In [None]:


PROJECT_ID = ""  # @param {type:"string"}
vertexai.init(project=PROJECT_ID, location="us-central1")

### Import  models

In [None]:
from langchain_google_vertexai import VertexAI
llm = VertexAI(model_name="gemini-pro")

In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

embedding = VertexAIEmbeddings(
    model_name="textembedding-gecko@latest", project=PROJECT_ID
)

In [None]:
DATASET = "my_langchain_dataset"  # @param {type: "string"}
TABLE = "doc_and_vectors"  # @param {type: "string"}
REGION = "us-central1"
from langchain_google_community import BigQueryVectorStore

store = BigQueryVectorStore(
    project_id=PROJECT_ID,
    dataset_name=DATASET,
    table_name=TABLE,
    location=REGION,
    embedding=embedding,
)

INFO:langchain_google_community.bq_storage_vectorstores._base:BigQuery table kaggle-on-gcp.my_langchain_dataset.doc_and_vectors initialized/validated as persistent storage. Access via BigQuery console:
 https://console.cloud.google.com/bigquery?project=kaggle-on-gcp&ws=!1m5!1m4!4m3!1skaggle-on-gcp!2smy_langchain_dataset!3sdoc_and_vectors


In [None]:

metadatas = [{"len": len(t)} for t in extracted_data]

store.add_texts(extracted_data, metadatas=metadatas)

['79d91aa1704d44f3a57a23e6c2dfe47a',
 'ded95596030646bfa1f70cacdff033bd',
 '34a702a0bb8e4683a180f30ad67e1529',
 '99aed0006e6a4aa09546d4866dd765bb',
 '76bcd01aa87f4b5789c7bfcda1d1fae1',
 '6a184c33d57f4c449d6106242007a28f',
 '13be2a03551b4e3b9cb3bd17f74bf753',
 '510781f27e604c81bc5da4549195fa90',
 'ccd0bd2960d6424480b7c61336f0827a',
 '2d00257a94b447b587d8ae0a7a83814a',
 '9f886d2da17140b6b4ea6b9ca815799d',
 'e432cfd3630740afb334a1bff5dc9791',
 '5d103889a232496c912678b5993119d0',
 'e3ef09516a14471aa6c19f12fc8b6348',
 '80b5c0e61e3640b68858c70da787b40d',
 'aa6938cf8cd34e5ab3ca231f09108cb6',
 '433a26839d8d4b43a3d88f45691b5655',
 'b61cb0b43949442f87e11bb70227f79f',
 '6381114027c04043bf75efe2b0ccfb15',
 'c65eab5376d84b5e8943ed8cb61a6e80',
 'b3d5d6d31c944e5086d138aa36e4a51b',
 '5df59b3b4256468099cfbd1601ca1de3',
 '7b15791a9fcf4c198dd5b24d077bf54a',
 '5d9a03100eab4d0cb94280568d8f589c',
 'a613aaea8cdf4c8b95607bda8ec73d54',
 '91282062f2534dd6a376df859c114381',
 'a0f1bcb024d04abc9afaf95cf37d404d',
 

In [None]:
# Expose index to the retriever
retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 2})

The Vertex AI Search Ranking API is one of the standalone APIs in Vertex AI Agent Builder. It takes a list of documents and reranks those documents based on how relevant the documents are to a query. Compared to embeddings, which look only at the semantic similarity of a document and a query, the ranking API can give you precise scores for how well a document answers a given query. The ranking API can be used to improve the quality of search results after retrieving an initial set of candidate documents.

The ranking API is stateless so there's no need to index documents before calling the API. All you need to do is pass in the query and documents. This makes the API well suited for reranking documents from any document retrievers.

For more information, see Rank and rerank documents.

3.1 Define and combine retriever using Vector Search and reranker using the Vertex AI Ranking API.

In [None]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_google_community import VertexAIRank

# Instantiate the VertexAIReranker with the SDK manager
reranker = VertexAIRank(
    project_id=PROJECT_ID,
    location_id="global",
    ranking_config="default_ranking_config",
    title_field="source",  # metadata field to preserve with reranked results
    top_n=5,
)

basic_retriever = store.as_retriever(
    search_kwargs={"k": 5}
)  # fetch top 5 documents

# Create the ContextualCompressionRetriever with the VertexAIRanker as a Reranker
retriever_with_reranker = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=basic_retriever
)

In [None]:
query = "what was google cloud revenue in 2023 ?"
search_kwargs = {"k": 5}
print(basic_retriever.invoke(query, search_kwargs=search_kwargs))
print(retriever_with_reranker.invoke(query, search_kwargs=search_kwargs))

[Document(metadata={'doc_id': '081ced04624b4b2799f2c1d12eceef62', 'len': 2579, 'score': 0.7116202634913744}, page_content='# Disaggregated Revenues\n\nThe following table presents revenues disaggregated by type (in millions):\n\n|-|-|\n|  | Three Months Ended March 31, |\n|  | 2022 | 2023 |\n| Google Search & other | 39,618 $ $ | 40,359 |\n| YouTube ads | 6,869 | 6,693 |\n| Google Network | 8,174 | 7,496 |\n| Google advertising | 54,661 | 54,548 |\n| Google other | 6,811 | 7,413 |\n| Google Services total | 61,472 | 61,961 |\n| Google Cloud | 5,821 | 7,454 |\n| Other Bets | 440 | 288 |\n| Hedging gains (losses) | 278 | 84 |\n| Total revenues | $ 68,011 | $ 69,787 |\n\nThe following table presents revenues disaggregated by geography, based on the addresses of our customers (in millions):(1)\n\n|-|-|-|-|-|\n|  |  | Three Months Ended |  |  |\n|  |  | March 31, |  |  |\n|  | 2022 |  | 2023 |  |\n| United States | $ 31,733 | 47% $ | 32,864 | 47% |\n| EMEA (1) | 20,317 | 30 | 21,078 | 30 |\

Query in Real Time and Check Grounding
Let's now configure a standard retrieval and answer generation chain that follows: query -> vector search -> retrieve documents -> LLM for answer generation with a couple of changes:

We will pass retrieved documents to the reranker API via the VertexAIRank and get the reranked documents to generate the answer.

After the answer is generated by the LLM, pass the answer and the retrieved documents from vector search as facts to the VertexAICheckGroundingWrapper to check how grounded the response from the LLM is.

In [None]:
from typing import List

from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

from langchain.docstore.document import Document
from langchain_core.runnables import chain

from langchain_google_vertexai import VertexAI
from langchain.prompts import PromptTemplate

from langchain_google_community import VertexAICheckGroundingWrapper

from rich import print

retriever = store.as_retriever(search_kwargs={"k": 5})

Configure LLM with prompt template to generate answer

In [None]:
llm = VertexAI(model_name="gemini-1.5-pro-001", max_output_tokens=1024)
template = """
Answer the question based only on the following context:
{context}

Question:
{query}
"""
prompt = PromptTemplate.from_template(template)

create_answer = prompt | llm

In [None]:
#Define wrapper to call Vertex AI Check Grounding API on the generated answer
output_parser = VertexAICheckGroundingWrapper(
    project_id=PROJECT_ID,
    location_id="global",
    grounding_config="default_grounding_config",
    top_n=3,
)

In [None]:
@chain
def check_grounding_output_parser(answer_candidate: str, documents: List[Document]):
    return output_parser.with_config(configurable={"documents": documents}).invoke(
        answer_candidate
    )


setup_and_retrieval = RunnableParallel(
    {"context": retriever, "query": RunnablePassthrough()}
)


@chain
def qa_with_check_grounding(query):
    docs = setup_and_retrieval.invoke(query)
    answer_candidate = create_answer.invoke(docs)
    check_grounding_output = check_grounding_output_parser.invoke(
        answer_candidate, documents=docs["context"]
    )
    return check_grounding_output


In [None]:
result = qa_with_check_grounding.invoke("what was google cloud revenue in 2023 ?")
print(result)

TypeError: 2579 has type int, but expected one of: bytes, unicode

## Clean Up

If you no longer need the Document AI processor, you can delete it using the following code.

Alternatively, you can use the Cloud Console to delete the processor as outlined in [Creating and managing processors > Delete a processor](https://cloud.google.com/document-ai/docs/create-processor#documentai_delete_processor-web).

In [None]:
def delete_processor(processor_name: str) -> None:
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Delete a processor
    operation = client.delete_processor(name=processor_name)
    # Print operation details
    print(operation.operation.name)
    # Wait for operation to complete
    operation.result()


delete_processor(processor_name)

PermissionDenied: 403 Cloud Document AI API has not been used in project 522309567947 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/documentai.googleapis.com/overview?project=522309567947 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. [links {
  description: "Google developers console API activation"
  url: "https://console.developers.google.com/apis/api/documentai.googleapis.com/overview?project=522309567947"
}
, reason: "SERVICE_DISABLED"
domain: "googleapis.com"
metadata {
  key: "consumer"
  value: "projects/522309567947"
}
metadata {
  key: "service"
  value: "documentai.googleapis.com"
}
]