In this notebook, we will use GCP to evaluate the performance of a RAG system using a vertexai index and Gemini.

This notebook does not cover how GCP works and does not go through the steps needed to create the Vertex AI index (we will, however, show how to generate our embeddings). For more details, you can check [this resource](https://thenewstack.io/how-to-store-embeddings-in-vector-search-and-implement-rag/) or the GCP documentation.

# Import libraries

In [None]:
import hashlib
import json
import os
import time

from dotenv import load_dotenv
from google.cloud import aiplatform
from google.cloud import storage
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AzureOpenAI
import pandas as pd
from tqdm import tqdm
import vertexai
from vertexai.generative_models import GenerativeModel, Tool
from vertexai.language_models import TextEmbeddingModel
from vertexai.preview import rag

In [None]:
# Load environment variables (e.g. from a .env file) so that the os.environ lookups in this notebook can find them.
load_dotenv()

In [None]:
# GCP settings taken from the environment; each one is None when its variable is not set.
PROJECT_ID = os.getenv("PROJECT_ID")
BUCKET_NAME = os.getenv("BUCKET_NAME")

# Helper functions to generate our embeddings

In [None]:
def generate_text_embeddings(sentences, embedding_model = "text-embedding-004", location = "us-central1"):
    """
    Generates a list of embeddings from a list of texts to embed and the name of the embedding model to use.

    input:
        sentences (list): texts to embed
        embedding_model (str): model name given to TextEmbeddingModel.from_pretrained
        location (str): GCP region given to aiplatform.init

    output:
        vectors (list): one embedding vector per input text, in input order;
            an empty list when `sentences` is empty
    """
    # Nothing to embed: return early instead of initialising the SDK and
    # sending a request with no content.
    if len(sentences) == 0:
        return []

    aiplatform.init(project=PROJECT_ID, location=location)
    model = TextEmbeddingModel.from_pretrained(embedding_model)
    embeddings = model.get_embeddings(sentences)
    # Keep only the raw vector of each returned embedding object.
    vectors = [embedding.values for embedding in embeddings]
    return vectors

In [None]:
def _stable_chunk_id(sentence):
    """Returns a signed 64-bit integer id derived from the SHA-256 digest of the text."""
    digest = hashlib.sha256(sentence.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big", signed=True)


def generate_and_save_embeddings(document_list, sentence_file_path, embed_file_path, batch_size = 100):
    """
    Takes a list of sentences to embed, embeds them and saves the text chunks and embeddings to the specified files.

    Both files are written with one JSON object per line:
        sentence file: {"id": ..., "sentence": ...}
        embed file:    {"id": ..., "embedding": ...}
    The same id is used for a chunk in both files, so an id found in one file can be looked up in the other.

    input:
        document_list (list): text chunks to embed
        sentence_file_path (str): output path for the id/sentence records
        embed_file_path (str): output path for the id/embedding records
        batch_size (int): maximum number of texts sent per embedding request

    output:
        None

    raises:
        ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # GCP embedding has a maximum number of texts you can send at once, that is why we send them in batches.
    # Stepping by batch_size never yields an empty trailing batch, even when
    # len(document_list) is zero or an exact multiple of batch_size.
    embeddings = []
    for start in range(0, len(document_list), batch_size):
        embeddings += generate_text_embeddings(document_list[start:start + batch_size])

    with open(embed_file_path, 'w') as embed_file, open(sentence_file_path, 'w') as sentence_file:
        for sentence, embedding in zip(document_list, embeddings):
            # The built-in hash() of a str is salted per Python process, so it would
            # give different ids after a kernel restart; a digest-based id is reproducible.
            chunk_id = _stable_chunk_id(sentence)

            embed_item = {"id": chunk_id, "embedding": embedding}
            sentence_item = {"id": chunk_id, "sentence": sentence}

            json.dump(sentence_item, sentence_file)
            sentence_file.write('\n')
            json.dump(embed_item, embed_file)
            embed_file.write('\n')

In [None]:
def upload_file(bucket_name, file_path, location = "us-central1"):
    """
    Uploads a file to a GCP bucket, creating the bucket first if it does not exist yet.

    input:
        bucket_name (str): name of the target bucket
        file_path (str): local path of the file; also used as the blob name in the bucket
        location (str): location used only when the bucket has to be created

    output:
        None
    """
    storage_client = storage.Client(project=PROJECT_ID)
    # Reuse the bucket when it already exists so that the cell can be re-run;
    # lookup_bucket returns None (instead of raising) when the bucket is not found.
    bucket = storage_client.lookup_bucket(bucket_name)
    if bucket is None:
        bucket = storage_client.create_bucket(bucket_name, location=location)
    blob = bucket.blob(file_path)
    blob.upload_from_filename(file_path)

In [None]:
def load_file(sentence_file_path):
    """
    Reads a file that holds one JSON document per line and returns the parsed documents.

    input:
        sentence_file_path (str): path of the file to read

    output:
        data (list): the parsed value of every line, in file order
    """
    with open(sentence_file_path, 'r') as handle:
        # Iterating over the handle yields the file line by line.
        return [json.loads(raw_line) for raw_line in handle]

In [None]:
def find_all_corresponding_document(list_index, sentence_file_list):
    """
    Given a list of ids to look for and a list of records holding an id and its text chunk, returns the matching text chunks.

    Chunks are returned in the order in which they appear in `sentence_file_list`.
    Each requested id contributes at most one chunk (its first occurrence), and the
    scan stops as soon as every requested id has been found.

    input:
        list_index (list): ids to look for
        sentence_file_list (list): dicts with an "id" key and a "sentence" key
    output:
        document_context (list): the "sentence" value of each matching record
    """
    # A set gives constant-time membership tests and collapses repeated ids.
    wanted_ids = set(list_index)
    found_ids = set()
    document_context = []
    for element in sentence_file_list:
        # Track found ids rather than counting matched rows, so that a repeated id
        # in the file cannot trigger the early exit before all ids were seen.
        if len(found_ids) == len(wanted_ids):
            break
        element_id = element["id"]
        if element_id in wanted_ids and element_id not in found_ids:
            found_ids.add(element_id)
            document_context.append(element["sentence"])
    return document_context

We chunk our documents, embed the chunks and upload the embeddings to a GCP bucket. GCP has no built-in option for chunking, so we use langchain to do it.

In [None]:
# We retrieve our documents

list_doc = []
# os.listdir returns names in arbitrary order; sorting keeps the document
# (and therefore chunk) order identical from one run to the next.
for document in sorted(os.listdir("sample_data")):
    document_path = os.path.join("sample_data", document)
    # Skip sub-directories (for instance .ipynb_checkpoints), which cannot be opened as text files.
    if not os.path.isfile(document_path):
        continue
    with open(document_path, encoding='utf-8') as f:
        list_doc.append(f.read())

In [None]:
# Chunking is done with langchain

# Wrap every raw text in a langchain Document.
langchain_docs = [LangchainDocument(page_content=raw_text) for raw_text in tqdm(list_doc)]

# Splitter configuration: chunk size 1024 with an overlap of 100, trying the
# separators in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=1024,
    chunk_overlap=100,
    add_start_index=True,
)

# Split the documents one at a time and gather all resulting chunks in one list.
docs_processed = []
for langchain_doc in langchain_docs:
    docs_processed.extend(text_splitter.split_documents([langchain_doc]))

# Only the text of each chunk is needed for embedding.
chunk_list = [chunk.page_content for chunk in docs_processed]

In [None]:
# Embed every chunk and write the id/sentence and id/embedding files locally.
generate_and_save_embeddings(
    chunk_list,
    sentence_file_path="sentence_file.json",
    embed_file_path="embed_file.json",
)
# Only the embeddings file is uploaded to the bucket.
upload_file(BUCKET_NAME, "embed_file.json")

# Querying the documents with our RAG
---

**Important**: Before running the cells below, we created a vector index in Vertex AI on GCP, deployed this index and created an endpoint for it.

In [None]:
# Connect to GCP: initialise the SDKs first, then create the model and index endpoint handles.
aiplatform.init(
    project=PROJECT_ID,
    location="us-central1",
)
vertexai.init()

# Generative model used to answer the questions.
rag_model = GenerativeModel("gemini-1.5-flash-001")

# Endpoint of the deployed vector index; replace the placeholder with your own endpoint.
index_test_rag = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name="<Enter Index of your Index Endpoint>",
)

In [None]:
# List of text chunks: one {"id": ..., "sentence": ...} record per line of sentence_file.json,
# used to map the ids returned by the index back to their text.
sentence_file_list = load_file("sentence_file.json")

In [None]:
# Prompt template sent to the generative model. The {document} and {question}
# placeholders are filled in with str.format when a question is asked.
GEMINI_PROMPT = """
Your task is to answer a complicated factoid question given documents at your disposal.
The questions demand a good understanding of the documents.
Your factoid answer should be specific, concise informations retrieved from the documents.

Documents:::
{document}

Question:::
{question}

Answer:::"""

In [None]:
# Change name of file depending on the name you chose for your question dataset
# The file is read as a ';'-separated CSV; its "question" and "answer" columns are used later in the notebook.
question_dataset = pd.read_csv("comparison.csv", sep=";")

In [None]:
# Ask questions to our LLM

list_gemini_answer = []

for my_question in question_dataset["question"].values:
    # Embed the question with the same helper (and default model) used for the chunks.
    question_embedding = generate_text_embeddings([my_question])
    # Retrieve the 5 nearest chunks from the deployed index.
    response = index_test_rag.find_neighbors(
        deployed_index_id = "<Enter Deployed Index Name here>",
        queries = [question_embedding[0]],
        num_neighbors = 5
    )
    # response[0] holds the neighbors of our single query; their ids are converted
    # to int to compare them with the integer ids stored in sentence_file_list.
    index_to_search = [int(element.id) for element in response[0]]
    # The lookup needs both the ids and the list of id/sentence records.
    document_context = find_all_corresponding_document(index_to_search, sentence_file_list)
    document_str = "".join(
        f"document number {i}: {chunk}\n"
        for i, chunk in enumerate(document_context)
    )

    llm_response = rag_model.generate_content(
        GEMINI_PROMPT.format(document=document_str, question=my_question)
    )
    list_gemini_answer.append(llm_response.text)

# Evaluation of the answers

---

We use a GPT-4 model to evaluate the answers produced by Gemini.

In [None]:
# Azure OpenAI connection settings, read from the environment (None when unset).
OPENAI_API_ENDPOINT = os.getenv("OPENAI_API_ENDPOINT")
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Client used to call the judge model.
aoai_client = AzureOpenAI(
    api_version=OPENAI_API_VERSION,
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_API_ENDPOINT,
)

In [None]:
# Grading prompt for the judge model. {instruction}, {response} and {reference_answer}
# are filled in with str.format; the doubled braces {{...}} come out as literal braces.
# The requested output format contains a [RESULT] marker, which is later used to
# separate the feedback from the grade.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

In [None]:
all_questions = question_dataset["question"].values
all_answer = question_dataset["answer"].values
evaluation_gemini = []

for i in tqdm(range(len(all_questions))):
    # Fill the grading prompt with question i, Gemini's answer to it and the reference answer.
    judge_prompt = EVALUATION_PROMPT.format(
        instruction=all_questions[i],
        response=list_gemini_answer[i],
        reference_answer=all_answer[i],
    )
    completion = aoai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0.2,
        top_p=1,
        max_tokens=800,
    )
    # Keep only the text of the first choice.
    evaluation_gemini.append(completion.choices[0].message.content)
    # Pause between two calls (presumably to stay under the API rate limit).
    time.sleep(2)

In [None]:
gemini_critique = []
gemini_grade = []

for evaluation in tqdm(evaluation_gemini):
    # Everything before the first [RESULT] marker is the critique and the text right
    # after it is the grade. When the judge returned no text or left the marker out,
    # the grade is stored as an empty string instead of raising an error.
    parts = (evaluation or "").split("[RESULT]")
    gemini_critique.append(parts[0])
    gemini_grade.append(parts[1] if len(parts) > 1 else "")

# NOTE(review): the "gemini_anwer" column name is kept as is; confirm nothing relies on it before renaming.
question_dataset["gemini_anwer"] = list_gemini_answer
question_dataset["gemini_critique"] = gemini_critique
question_dataset["gemini_grade"] = gemini_grade

In [None]:
# Save the dataset, including the answer, critique and grade columns, to comparison.csv
# (the same file name the questions were read from, so that file is overwritten).
question_dataset.to_csv("comparison.csv", index=False, sep=";")