# Llama index evaluation
https://www.llamaindex.ai/blog/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5

In [7]:
import nest_asyncio
import tqdm

nest_asyncio.apply()

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
    Document
)
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    ContextRelevancyEvaluator
)
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

import time
from llama_index.core import Settings
import pickle

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
path = '../data/Constitución Española'

### Create questions

load documents splitted by articles

In [5]:
file_path = path + '/chunks/documents_spanisharticlesplitter.pkl'
# Load the texts from the pickle file
with open(file_path, "rb") as file:
    langdocs = pickle.load(file)

# Documents to llama-index docs
llamadocs = [Document(text=doc.page_content, metadata=doc.metadata) for doc in langdocs]

In [48]:
llm = AzureOpenAI(
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_GPT3TURBO_DEPLOYMENT"],
)

In [52]:
from llama_index.core.prompts.base import PromptTemplate

QUESTION_GENERATION_PROMPT = PromptTemplate("""\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge.
generate only questions based on the below query.
Generate the questions in Spanish.
{query_str}
""")

In [56]:
data_generator = DatasetGenerator.from_documents(documents=llamadocs, llm=llm, text_question_template=QUESTION_GENERATION_PROMPT)
eval_questions = data_generator.generate_questions_from_nodes(num=300)

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [None]:
import pandas as pd

# Create a DataFrame from the eval_questions list
df = pd.DataFrame(eval_questions, columns=['Questions'])

# Save the DataFrame to a CSV file
df.to_csv(path + '/eval_questions.csv', index=False)

### Eval Questions

In [4]:
# We will use GPT-4 for evaluating the responses
gpt4 = AzureOpenAI(
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_GPT4TURBO_DEPLOYMENT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"]
)

embed_model = AzureOpenAIEmbedding(
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_ADA2_DEPLOYMENT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"]
)

# Define service context for GPT-4 for evaluation
service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4, embed_model=embed_model)

# Define Faithfulness and Relevancy Evaluators which are based on GPT-4
faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)
relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)
contextrelevancy_gpt4 = ContextRelevancyEvaluator(service_context=service_context_gpt4)

  service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4, embed_model=embed_model)


In [5]:
async def evaluate_response_time_and_accuracy(eval_documents, eval_questions):
    """
    Evaluate the average response time, faithfulness, and relevancy of responses generated by GPT-3.5-turbo for given documents.
    
    Parameters:
    eval_documents (list): The list of pre-split documents to be evaluated.
    eval_questions (list): The list of questions to evaluate responses for.
    
    Returns:
    tuple: A tuple containing the average response time, faithfulness, and relevancy metrics.
    """

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0
    total_contextrelevancy = 0.0

    # Create vector index
    llm = AzureOpenAI(
        openai_api_version=os.environ["OPENAI_API_VERSION"],
        azure_deployment=os.environ["AZURE_GPT3TURBO_DEPLOYMENT"],
        api_key=os.environ["AZURE_OPENAI_API_KEY"]
    )
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
    vector_index = VectorStoreIndex.from_documents(
        eval_documents, service_context=service_context
    )
    # Build query engine
    query_engine = vector_index.as_query_engine()
    num_questions = len(eval_questions)

    # Iterate over each question in eval_questions to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # we're using a loop here to specifically measure response time for different chunk sizes.

    df = []

    for question in tqdm.tqdm(eval_questions):

        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time
        
        faithfulness_result = faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing
        
        relevancy_result = relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

        contextrelevancy_result = await contextrelevancy_gpt4.aevaluate_response(question, response_vector)

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result
        total_contextrelevancy += contextrelevancy_result.score


        data = {'question': question, 
                'response': response_vector.response,
                'context': [{'text': c.text, 'metadata': c.metadata} for c in response_vector.source_nodes],
                'faithfulness': faithfulness_result, 
                'relevancy': relevancy_result,
                'context_relevancy_score': contextrelevancy_result.score,
                'context_relevancy_feedback': contextrelevancy_result.feedback,
                'response_time': elapsed_time}

        df.append(data.copy())

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions
    average_contextrelevancy = total_contextrelevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy, average_contextrelevancy, df

In [8]:
import os
import json
import pandas as pd

df = pd.read_csv( path + '/eval_questions.csv')
eval_questions = df['Questions'].tolist()

results = {}
directory = path + '/chunks'
for filename in os.listdir(directory):
    print("-----------------------------------")
    print("CHUNKERIZATION FILE: ", filename)
    file_path = os.path.join(directory, filename)
    with open(file_path, "rb") as file:
        langdocs = pickle.load(file)
    # Documents to llama-index docs
    llamadocs = [Document(text=doc.page_content, metadata=doc.metadata) for doc in langdocs]
    avg_response_time, avg_faithfulness, avg_relevancy, average_contextrelevancy, df = await evaluate_response_time_and_accuracy(llamadocs, eval_questions)
    results[filename] = { "Average response time": avg_response_time, "Average faithfulness": avg_faithfulness, "Average relevancy": avg_relevancy , "Average context relevancy": average_contextrelevancy}

    file_path = os.path.join(path, 'eval_chunks', filename.replace('.pkl', '_answers.json'))
    # Save the metrics dictionary as a JSON file
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(df, file, ensure_ascii=False, indent=4)


file_path = os.path.join(path, 'metrics.json')
# Save the metrics dictionary as a JSON file
with open(file_path, 'w') as file:
    json.dump(results, file)

-----------------------------------
CHUNKERIZATION FILE:  documents_spanisharticlesplitter.pkl


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
100%|██████████| 300/300 [56:04<00:00, 11.21s/it] 
  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)


-----------------------------------
CHUNKERIZATION FILE:  documents_charactersplitter_400.pkl


100%|██████████| 300/300 [44:01<00:00,  8.81s/it]


-----------------------------------
CHUNKERIZATION FILE:  documents_recursivecharactersplitter_400.pkl


100%|██████████| 300/300 [44:34<00:00,  8.92s/it] 


-----------------------------------
CHUNKERIZATION FILE:  documents_recursivecharactersplitter_200.pkl


100%|██████████| 300/300 [42:37<00:00,  8.53s/it]


-----------------------------------
CHUNKERIZATION FILE:  documents_charactersplitter_300.pkl


100%|██████████| 300/300 [34:42<00:00,  6.94s/it]


-----------------------------------
CHUNKERIZATION FILE:  documents_recursivecharactersplitter_300.pkl


100%|██████████| 300/300 [38:06<00:00,  7.62s/it]


-----------------------------------
CHUNKERIZATION FILE:  documents_charactersplitter_200.pkl


100%|██████████| 300/300 [36:58<00:00,  7.40s/it]


## Metrics

In [3]:
import json
import pandas as pd
import os

In [5]:

path = '../data/Constitución Española'
metrics_file_path = os.path.join(path,'metrics.json')

# Read the metrics from the JSON file
with open(metrics_file_path, 'r') as file:
    metrics = json.load(file)

# Create a DataFrame from the metrics dictionary
df = pd.DataFrame(metrics).T

In [7]:
def highlight_best_metrics(s):
    styles = []
    for val in s:
        if val == s.max():
            styles.append('text-decoration: underline; color: green')
        else:
            styles.append('')
    return styles

# Apply the style function to the DataFrame and display it
df.style.apply(highlight_best_metrics)

Unnamed: 0,Average response time,Average faithfulness,Average relevancy,Average context relevancy
documents_spanisharticlesplitter.pkl,1.559313,0.943333,0.943333,0.856458
documents_charactersplitter_400.pkl,1.521924,0.95,0.906667,0.717917
documents_recursivecharactersplitter_400.pkl,1.504329,0.926667,0.866667,0.690417
documents_recursivecharactersplitter_200.pkl,1.432389,0.946667,0.85,0.647917
documents_charactersplitter_300.pkl,1.298296,0.91,0.826667,0.66625
documents_recursivecharactersplitter_300.pkl,1.387602,0.94,0.896667,0.689583
documents_charactersplitter_200.pkl,1.360128,0.913333,0.816667,0.645833
