# Setup

In [2]:
# %pip install -r requirements.txt

# Environment Variables

In [1]:
# Get environment variables
# Later cells read PROJECT_ID, GOOGLE_API_KEY, LOCATION_ID and DATA_STORE_ID
# from os.environ; presumably these are supplied by the .env file loaded here.

from dotenv import load_dotenv

# Load environment variables from .env file.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
load_dotenv()

True

# Google Colab Auth

In [2]:
import sys

# Detect Colab by checking whether the "google.colab" module is already loaded.
if "google.colab" not in sys.modules:
	# Nothing is authenticated by this cell when running locally: set
	# GOOGLE_APPLICATION_CREDENTIALS to the service account credentials file.
	print("Running locally")
else:
	# In Colab, use the permissions of the currently authenticated user.
	print('Running in Google Colab')

	from google.colab import auth

	auth.authenticate_user()

Running locally


# Utilities

In [3]:
import utils

# Environment Checks

In [4]:
# Check GCP permissions
# Smoke test: obtain default credentials and list the storage buckets of the
# configured project to confirm the credentials actually work.

import os

import google.auth
from google.cloud import storage

# Fails fast with KeyError when PROJECT_ID is not set in the environment.
PROJECT_ID = os.environ["PROJECT_ID"]
print(PROJECT_ID)

# Default credentials, with PROJECT_ID passed as the quota project.
creds, _ = google.auth.default(quota_project_id=PROJECT_ID)

# Pin the client to PROJECT_ID explicitly. Without `project`, the client falls
# back to whatever project it can infer from the environment, which may be a
# different project than the one printed above, or none at all.
client = storage.Client(project=PROJECT_ID, credentials=creds)

# List all buckets in the project
buckets = list(client.list_buckets())
print(buckets)

era4-447717
[<Bucket: 788677989752_us_import_content_with_faq_csv>, <Bucket: 788677989752_us_import_document>, <Bucket: docs-10k>]


In [5]:
# Check Google API key
# Smoke test: send one prompt to the Gemini REST endpoint and print the raw
# response body.

# Imported here as well so the cell does not depend on an earlier cell's import.
import os

import requests

# Fails fast with KeyError when GOOGLE_API_KEY is not set in the environment.
GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent'

# Send the key in the x-goog-api-key header instead of the URL query string:
# URLs end up in logs, tracebacks and printed output, headers normally do not.
headers = {
	"Content-Type": "application/json",
	"x-goog-api-key": GOOGLE_API_KEY,
}

data = {"contents":[{"parts":[{"text":"What was Uber's annual revenue for 2022?"}]}]}

# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.post(url, headers=headers, json=data, timeout=60)
print(response.text)

{
  "candidates": [
    {
      "content": {
        "parts": [
          {
            "text": "Uber's annual revenue for 2022 was **$31.9 billion**.\n"
          }
        ],
        "role": "model"
      },
      "finishReason": "STOP",
      "avgLogprobs": -0.0030171158058302744
    }
  ],
  "usageMetadata": {
    "promptTokenCount": 14,
    "candidatesTokenCount": 21,
    "totalTokenCount": 35
  },
  "modelVersion": "gemini-1.5-flash-latest"
}



In [6]:
# Check Vertex AI Agent Builder data store

import json

import google.auth
from google.auth.transport.requests import Request
import requests

def query_chunks(query: str,
				page_size: int,
				access_token: str,
				timeout: float = 30.0) -> dict:
	"""Search the Vertex AI Agent Builder data store and return matching chunks.

	The target data store is taken from the PROJECT_ID, LOCATION_ID and
	DATA_STORE_ID environment variables.

	Args:
		query: Search text sent as the request's "query".
		page_size: Value sent as the request's "pageSize".
		access_token: OAuth access token placed in the Authorization header.
		timeout: Seconds to wait for the HTTP request before giving up.

	Returns:
		The parsed JSON response body. On a non-200 status the error is
		printed and the parsed body is still returned.

	Raises:
		KeyError: If one of the required environment variables is missing.
	"""
	project_id = os.environ['PROJECT_ID']
	location_id = os.environ['LOCATION_ID']
	data_store_id = os.environ['DATA_STORE_ID']

	# The global location uses the plain host; regional/multi-regional
	# locations (e.g. "us", "eu") use a location-prefixed host.
	if location_id == 'global':
		api_endpoint = 'discoveryengine.googleapis.com'
	else:
		api_endpoint = f'{location_id}-discoveryengine.googleapis.com'

	# Built once and reused for both the URL and the request body.
	serving_config = (
		f"projects/{project_id}/locations/{location_id}"
		f"/collections/default_collection/dataStores/{data_store_id}"
		"/servingConfigs/default_search"
	)
	url = f"https://{api_endpoint}/v1alpha/{serving_config}:search"
	print(url)

	headers = {
		"Authorization": f"Bearer {access_token}",
		"Content-Type": "application/json",
	}

	post_data = {
		"servingConfig": serving_config,
		"pageSize": page_size,
		"query": query,
		# Ask for chunk-level results rather than whole documents.
		"contentSearchSpec": {"searchResultMode": "CHUNKS"},
	}

	# The timeout prevents the notebook from hanging on a stalled connection.
	response = requests.post(url, headers=headers, json=post_data, timeout=timeout)

	if response.status_code != 200:
		print(
			f"Error retrieving search results: {response.status_code} -"
			f" {response.text}"
		)

	return response.json()

#####

def test_query_chunks():
	"""Smoke-test query_chunks against the configured data store.

	Obtains default credentials, refreshes them, runs one sample query with
	the resulting access token and pretty-prints the JSON result.
	"""
	creds, _ = google.auth.default()
	# Refresh before reading creds.token so a current access token is used.
	creds.refresh(Request())
	# The token is deliberately NOT printed: it is a live bearer credential and
	# notebook outputs are saved, shared and committed with the file.
	response = query_chunks(query="What is the annual revenue of Uber?",
							page_size=3,
							access_token=creds.token)
	print(json.dumps(response, indent=4))

test_query_chunks()

[access token redacted]
https://discoveryengine.googleapis.com/v1alpha/projects/era4-447717/locations/global/collections/default_collection/dataStores/docs-10k_1736800282057/servingConfigs/default_search:search
{
    "results": [
        {
            "chunk": {
                "name": "projects/788677989752/locations/global/collections/default_collection/dataStores/docs-10k_1736800282057/branches/0/documents/2/chunks/c134",
                "id": "c134",
                "content": "# Highlights for 2023\n\nIn the fourth quarter of 2023, our MAPCs were 150 million, growing 8 million, or 6%, quarter-over-quarter, and growing 15% compared to the same period in 2022. Overall Gross Bookings increased by $22.5 billion in 2023, up 19%, or 20% on a constant currency basis, compared to 2022. Mobil

# Test 2 - Gemini and Vertex AI

In [9]:
# Build RAG chain using Vertex AI Agent Builder datastore

# The retriever and the chat prompt template both come from the utils helpers.
# NOTE: the spelling `retreiver` is kept as-is; it is a notebook-level name.
retreiver = utils.create_retriever_vertexai()
chat_prompt_template = utils.create_chat_prompt_template()

# Combine model name, prompt template and retriever into the chain used below.
chain = utils.create_chain_vertexai(
	retriever=retreiver,
	prompt_template=chat_prompt_template,
	model_name='gemini-1.5-flash',
)



In [10]:
# Test the chain with a few questions

questions = [
	"What is the annual revenue of Uber?",
	"What is the annual revenue of Lyft?",
	"How does Uber's revenue compare to Lyft's revenue?",
]

for question in questions:
	# Echo the question, then run it through the chain.
	print(question)
	result = chain.invoke({"question": question})

	# Show the raw result first, then only the text of the model's response.
	print(result)
	print(result["response"].content)

	# Visual separator between questions.
	print("\n*****")

What is the annual revenue of Uber?
{'response': AIMessage(content="Uber's revenue in 2023 was $37.3 billion.  This is stated in the provided text from the Uber 10-K report.", additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run-9df9adbe-ae5b-48f4-9612-191e30403254-0', usage_metadata={'input_tokens': 1849, 'output_tokens': 37, 'total_tokens': 1886, 'input_token_details': {'cache_read': 0}}), 'context': [Document(metadata={'id': '2', 'source': 'gs://docs-10k/pdf/uber.pdf', 'company': 'Uber', 'type': '10-K', 'year': '2023', 'url': 'https://storage.cloud.google.com/docs-10k/pdf/uber.pdf', 'previous_segments': [], 'next_segments': [], 'relevance_score': 0.7851943373680115}, page_content='# Highlights for 2023\n\nIn the fourth quarter of 2023, our MAPCs were 150 million, growing 8 million, or 6%, quarter-over-quarter, and growing 15% compared to the same period in 2022. Overall Gross Bo

In [11]:
# Evaluate the chain using Ragas

import time
from pathlib import Path

import pandas as pd
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness)

# Create the output directory up front: the evaluation below is long-running,
# and its results would be lost if the final write failed on a missing folder.
output_dir = Path("evaluations")
output_dir.mkdir(parents=True, exist_ok=True)

# Get the questions and ground truths from the test set
testset_df = pd.read_csv("testsets/10k_testset.csv")

# Every value is coerced to str so the evaluation only ever receives strings.
questions = [str(question) for question in testset_df["user_input"].values.tolist()]
groundtruths = [str(ground_truth) for ground_truth in testset_df["reference"].values.tolist()]

eval_metrics = [
	answer_correctness,
	answer_relevancy,
	context_precision,
	context_recall,
	faithfulness
]

ragas_results, ragas_results_df = utils.run_ragas_evaluation(chain,
														questions,
														groundtruths,
														eval_metrics)

# Write the results to disk; the timestamp keeps earlier runs from being overwritten
timestr = time.strftime("%Y%m%d%H%M%S")
ragas_results_df.to_csv(output_dir / f"10x_test2_testset_evaluation_{timestr}.csv")

# Show the results
print(ragas_results)

What kind of information does the Form 10-K provide about a company's future performance and risks?
{'response': AIMessage(content='The provided Lyft (2023) and Uber (2023) 10-K reports offer information about future performance and risks primarily through their "Forward-Looking Statements" sections.  These sections don\'t predict specific future outcomes but rather outline the *types* of uncertainties and risks that could materially affect their future financial performance and operations.\n\nBoth reports list numerous factors that could impact their future, including:\n\n* **Financial Performance:**  Uncertainty around revenue, costs (including cost of revenue and operating expenses), profitability, capital expenditures, and liquidity.  They acknowledge the difficulty in predicting future demand for their services and the impact of pricing strategies on their competitive position and financial results.\n\n* **Competition:**  The highly competitive nature of their markets is highlight

Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

{'answer_correctness': 0.5411, 'answer_relevancy': 0.3699, 'context_precision': 0.9314, 'context_recall': 0.6913, 'faithfulness': 0.7673}
