# LlamaIndex evaluation
Based on: https://www.llamaindex.ai/blog/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5

In [71]:
import nest_asyncio

# Patch asyncio so that an already-running event loop (such as the one owned
# by the Jupyter kernel) can be re-entered. Presumably needed by the
# llama-index calls further down — TODO confirm it is still required.
nest_asyncio.apply()

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    Document
)
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

import time
from llama_index.core import Settings

In [17]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
# The Azure OpenAI settings read through os.environ[...] in later cells are
# presumably defined there — verify against your .env file.
load_dotenv()

True

In [42]:
# Base directory of the dataset. Later cells read the pickled, pre-split
# documents from `path + '/chunks/...'` and write `eval_questions.csv` here.
path = '../data/Constitución Española'

### Create questions

Load the documents, already split by article.

In [47]:
import pickle

file_path = path + '/chunks/documents_spanisharticlesplitter.pkl'

# Load the pre-split documents from the pickle file.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by a trusted source.
with open(file_path, "rb") as file:
    langdocs = pickle.load(file)

# Convert each loaded document (an object exposing `page_content` and
# `metadata`; presumably LangChain documents — confirm) into a llama-index
# Document. The loop iterates over `langdocs`, the name the pickle was loaded
# into; the previous `docs` name is not defined anywhere in this notebook.
llamadocs = [
    Document(text=doc.page_content, metadata=doc.metadata)
    for doc in langdocs
]

In [48]:
# LLM used to generate the evaluation questions (GPT-3.5-turbo deployment,
# going by the environment variable name).
# NOTE(review): unlike the other AzureOpenAI constructions in this notebook,
# no api_key is passed here; presumably the client picks it up from the
# environment — confirm.
llm = AzureOpenAI(
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_GPT3TURBO_DEPLOYMENT"],
)

In [52]:
from llama_index.core.prompts.base import PromptTemplate

# Prompt template for question generation. It exposes two placeholders,
# {context_str} and {query_str}, and instructs the model to write the
# generated questions in Spanish.
QUESTION_GENERATION_PROMPT = PromptTemplate("""\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge.
generate only questions based on the below query.
Generate the questions in Spanish.
{query_str}
""")

In [56]:
# Build a question generator over the llama-index documents, using the
# question-generation LLM and the Spanish prompt template defined above.
data_generator = DatasetGenerator.from_documents(documents=llamadocs, llm=llm, text_question_template=QUESTION_GENERATION_PROMPT)
# Generate the evaluation questions; `num=300` presumably caps the total
# number of questions returned — TODO confirm against the llama-index docs.
eval_questions = data_generator.generate_questions_from_nodes(num=300)

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [58]:
import pandas as pd

# Put the generated questions into a single-column DataFrame ('Questions').
df = pd.DataFrame(eval_questions, columns=['Questions'])

# Persist the questions next to the dataset so they can be reused without
# regenerating them; index=False keeps the row index out of the CSV.
df.to_csv(path + '/eval_questions.csv', index=False)

### Eval Questions

In [76]:
# We will use GPT-4 for evaluating the responses
gpt4 = AzureOpenAI(
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_GPT4TURBO_DEPLOYMENT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"]
)

# Embedding model; it is also read as a notebook-level global by
# evaluate_response_time_and_accuracy when building the vector index.
embed_model = AzureOpenAIEmbedding(
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_ADA2_DEPLOYMENT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"]
)

# Define service context for GPT-4 for evaluation
# NOTE(review): the notebook output echoes this line back as a warning,
# which looks like a deprecation warning for ServiceContext. Consider passing
# `llm=gpt4` to the evaluators directly — confirm against the installed
# llama-index version.
service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4, embed_model=embed_model)

# Define Faithfulness and Relevancy Evaluators which are based on GPT-4
faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)
relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)

  service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4, embed_model=embed_model)


In [85]:
def evaluate_response_time_and_accuracy(eval_documents, eval_questions):
    """
    Evaluate the average response time, faithfulness, and relevancy of responses generated by GPT-3.5-turbo for given documents.

    A vector index is built over ``eval_documents`` and every question is
    answered through a query engine on that index. Each answer is then graded
    by the notebook-level ``faithfulness_gpt4`` and ``relevancy_gpt4``
    evaluators; embeddings come from the notebook-level ``embed_model``.
    Only the query call is timed; index construction and grading are not.

    Parameters:
    eval_documents (list): The list of pre-split documents to be evaluated.
    eval_questions (list): The list of questions to evaluate responses for.

    Returns:
    tuple: A tuple containing the average response time in seconds, and the
        fractions of responses that passed the faithfulness and relevancy
        checks.

    Raises:
    ValueError: If ``eval_questions`` is empty.
    """
    num_questions = len(eval_questions)
    if num_questions == 0:
        # Fail before any embedding/LLM request is issued; otherwise the index
        # would be built only to hit a division by zero when averaging.
        raise ValueError("eval_questions must contain at least one question")

    # LLM that answers the questions.
    llm = AzureOpenAI(
        openai_api_version=os.environ["OPENAI_API_VERSION"],
        azure_deployment=os.environ["AZURE_GPT3TURBO_DEPLOYMENT"],
        api_key=os.environ["AZURE_OPENAI_API_KEY"]
    )

    # Create the vector index and query engine. The embedding model and LLM
    # are passed directly instead of through the deprecated ServiceContext.
    vector_index = VectorStoreIndex.from_documents(
        eval_documents, embed_model=embed_model
    )
    query_engine = vector_index.as_query_engine(llm=llm)

    total_response_time = 0.0
    total_faithfulness = 0
    total_relevancy = 0

    # Iterate over each question in eval_questions to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # we're using a loop here to specifically measure response time for different chunk sizes.
    for question in eval_questions:
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        response_vector = query_engine.query(question)
        total_response_time += time.perf_counter() - start_time

        # `.passing` is a boolean verdict; summing counts the passes.
        total_faithfulness += faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing
        total_relevancy += relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

In [None]:
# Run the evaluation on the first 10 generated questions only, which keeps
# the number of LLM calls small for this trial run.
avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(llamadocs, eval_questions[:10])

In [86]:
# Display the metrics: (average response time, faithfulness, relevancy).
avg_response_time, avg_faithfulness, avg_relevancy

(0.6059009552001953, 0.9, 0.9)