# Setup

In [1]:
# %pip install -r requirements.txt

# Environment Variables

In [None]:
# Get environment variables
#
# Later cells read their settings (PROJECT_ID, GOOGLE_API_KEY, LOCATION_ID,
# DATA_STORE_ID) from os.environ, so this cell has to run before them.

from dotenv import load_dotenv

# Load environment variables from .env file
# NOTE(review): the return value is ignored, so a missing .env file is
# presumably not reported here and would only surface as a KeyError when a
# later cell reads os.environ - confirm.
load_dotenv()

# Google Colab Auth

In [None]:
import sys

# The presence of "google.colab" in sys.modules is treated as the signal that
# this notebook is running inside Google Colab.
if "google.colab" not in sys.modules:
    # Nothing is authenticated in this branch: set
    # GOOGLE_APPLICATION_CREDENTIALS to the service account credentials file.
    print("Running locally")
else:
    print('Running in Google Colab')

    # Use the permissions of the currently authenticated Colab user.
    from google.colab import auth

    auth.authenticate_user()

# Utilities

In [4]:
import utils

# Environment Checks

In [None]:
# Check GCP permissions
#
# Smoke test: resolve the default credentials and list the Cloud Storage
# buckets of PROJECT_ID. A failure here points at missing credentials or
# permissions rather than at anything further down in the notebook.

import os

import google.auth
from google.cloud import storage

PROJECT_ID = os.environ["PROJECT_ID"]
creds, _ = google.auth.default(quota_project_id=PROJECT_ID)

# Pass the project explicitly: without it the client falls back to whatever
# project the environment infers (possibly none), so the listing below would
# not necessarily target PROJECT_ID.
client = storage.Client(project=PROJECT_ID, credentials=creds)

# List all buckets in the project
buckets = list(client.list_buckets())
print(buckets)

In [None]:
# Check Google API key
#
# Smoke test: send one generateContent request to the Gemini API and print
# the raw response body.

import os

import requests

GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent'

# Send the key in the x-goog-api-key header instead of the URL query string,
# so it does not end up in logged URLs or tracebacks.
headers = {
    "Content-Type": "application/json",
    "x-goog-api-key": GOOGLE_API_KEY,
}

data = {"contents":[{"parts":[{"text":"What was Uber's annual revenue for 2022?"}]}]}

# requests waits indefinitely by default; bound the call so the cell cannot
# hang a full notebook run.
response = requests.post(url, headers=headers, json=data, timeout=30)
print(response.text)

In [None]:
# Check Vertex AI Agent Builder data store

import json

import google.auth
from google.auth.transport.requests import Request
import requests

def query_chunks(query: str,
                 page_size: int,
                 access_token: str,
                 timeout: float = 30.0) -> dict:
    """Run a chunk-mode search against the configured data store.

    The target data store is read from the PROJECT_ID, LOCATION_ID and
    DATA_STORE_ID environment variables on every call.

    Args:
        query: Search query text, sent as ``query``.
        page_size: Value sent as ``pageSize``.
        access_token: Token sent in the ``Authorization: Bearer`` header.
        timeout: Seconds to wait for the HTTP request before giving up
            (requests would otherwise wait indefinitely).

    Returns:
        The parsed JSON body of the response. For a non-200 status the
        status code and body are printed first, and the body is still
        parsed and returned.

    Raises:
        KeyError: If one of the required environment variables is not set.
    """
    project_id = os.environ['PROJECT_ID']
    location_id = os.environ['LOCATION_ID']
    data_store_id = os.environ['DATA_STORE_ID']

    # NOTE(review): only 'us' is mapped to a regional host; other
    # multi-regions (e.g. 'eu') presumably need their own prefix - confirm.
    if location_id == 'us':
        api_endpoint = 'us-discoveryengine.googleapis.com'
    else:
        api_endpoint = 'discoveryengine.googleapis.com'

    # The same resource path is used in the URL and in the request body, so
    # build it once.
    serving_config = (
        f"projects/{project_id}/locations/{location_id}"
        f"/collections/default_collection/dataStores/{data_store_id}"
        f"/servingConfigs/default_search"
    )

    url = f"https://{api_endpoint}/v1alpha/{serving_config}:search"
    print(url)

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    post_data = {
        "servingConfig": serving_config,
        "pageSize": page_size,
        "query": query,
        "contentSearchSpec": {"searchResultMode": "CHUNKS"},
    }

    response = requests.post(url, headers=headers, json=post_data, timeout=timeout)

    if response.status_code != 200:
        print(
            f"Error retrieving search results: {response.status_code} -"
            f" {response.text}"
        )

    return response.json()

#####

def test_query_chunks():
    """Smoke-test query_chunks with the application default credentials.

    Refreshes the default credentials to obtain an access token, runs one
    sample query and pretty-prints the JSON result.
    """
    creds, _ = google.auth.default()
    creds.refresh(Request())

    # Deliberately not printed: the access token is a live credential and
    # notebook outputs get saved and shared along with the file.
    access_token = creds.token

    response = query_chunks(query="What is the annual revenue of Uber?", page_size=3, access_token=access_token)
    print(json.dumps(response, indent=4))

test_query_chunks()

# Test 2 - Gemini and Vertex AI

In [None]:
# Build the RAG chain on top of the Vertex AI Agent Builder datastore:
# a retriever and a chat prompt template are created first, then handed to
# utils.create_chain together with the model name.

retreiver = utils.create_retriever_vertexai()
chat_prompt_template = utils.create_chat_prompt_template()

chain = utils.create_chain(
    model_name='gemini-1.5-flash',
    prompt_template=chat_prompt_template,
    retriever=retreiver,
)

In [None]:
# Smoke-test the chain with a handful of sample questions

questions = [
    "What is the annual revenue of Uber?",
    "What is the annual revenue of Lyft?",
    "How does Uber's revenue compare to Lyft's revenue?",
]

for question in questions:
    # Echo the question before invoking, so a failing call is attributable.
    print(question)

    result = chain.invoke({"question": question})

    # First the full chain output, then only the content of its "response"
    # entry, followed by a separator line.
    print(result)
    print(result["response"].content)
    print("\n*****")

In [None]:
# Evaluate the chain using Ragas
#
# Reads the testset CSV, runs the chain over every question, scores the
# answers with the Ragas metrics below and writes the scores to a
# timestamped CSV.

import os
import time

import pandas as pd

# These metric objects were referenced without ever being imported, so the
# cell raised NameError on a fresh kernel.
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

# Get the questions and ground truths from the testset
testset_df = pd.read_csv("testsets/10k_testset.csv")

# Force plain strings so that non-string cells (e.g. NaN) cannot break the
# evaluation.
questions = [str(question) for question in testset_df["user_input"].tolist()]
groundtruths = [str(ground_truth) for ground_truth in testset_df["reference"].tolist()]

eval_metrics = [
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
]

# NOTE(review): run_ragas_evaluation was called as a bare name that no cell
# defines or imports; this assumes it lives in utils next to create_chain -
# confirm.
ragas_results, ragas_results_df = utils.run_ragas_evaluation(
    chain,
    questions,
    groundtruths,
    eval_metrics,
)

# Write the results to disk (create the output directory on first run)
# NOTE(review): the file prefix says "10x" while the input testset is "10k" -
# confirm which one is intended before renaming.
os.makedirs("evaluations", exist_ok=True)
timestr = time.strftime("%Y%m%d%H%M%S")
ragas_results_df.to_csv(f"evaluations/10x_test2_testset_evaluation_{timestr}.csv")

# Show the results
print(ragas_results)