https://www.llamaindex.ai/blog/building-and-evaluating-a-qa-system-with-llamaindex-3f02e9d87ce1

1. Question Generation

In [1]:
import os

import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

from llama_index.core.evaluation import DatasetGenerator
# from llama_index.core.llama_dataset.generator import RAGDatasetGenerator

In [7]:
LLAMAPARSE_API_KEY = os.environ.get('LLAMAPARSE_API_KEY')
if LLAMAPARSE_API_KEY is not None:
    print('API key found')
else:
    print('Check for API key in environment variable')

API key found


In [8]:
# instantiate parser
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown", # or text
    # num_workers=4 # for multiple files
    verbose=True,
    language="en", # default is english
)

In [9]:
# load document and parse it 
documents = parser.load_data('../data/axis-press-release-q3fy24.pdf')

Started parsing the file under job_id 50404232-fc90-4e48-a203-fab55680c4c5


#### Generate Questions

In [48]:
llm = OpenAI(temperature=0.3, model="gpt-3.5-turbo")
data_generator = DatasetGenerator.from_documents(documents, llm=llm)
questions = data_generator.generate_questions_from_nodes(20) 
# number in bracket to determine number of questions

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [49]:
len(questions)

20

#### Generate Answers from source nodes (Context)

In [12]:
from llama_index.core import VectorStoreIndex, load_index_from_storage, StorageContext

In [8]:
# build index
index = VectorStoreIndex.from_documents(documents)

# save to disk
index.set_index_id("axis_pr_vector_index")
index.storage_context.persist('../data/storage')

In [13]:
# rebuild storage
storage_context = StorageContext.from_defaults(persist_dir='../data/storage')

# load index
index = load_index_from_storage(storage_context, index_id="axis_pr_vector_index")

In [38]:
# generate query engine
query_engine = index.as_query_engine(similarity_top_k=3)

https://docs.llamaindex.ai/en/stable/module_guides/evaluating/

https://docs.llamaindex.ai/en/stable/examples/evaluation/retrieval/retriever_eval/

##### Measuring RAG App performance

* Response Evaluation - Does the response match the 
    * Retrieved context - Faithfulness; i.e. checks for hallucination
    * Query - context relevancy and answer relevancy
    * Reference Answer or guidelines

* Retrieval Evaluation - Are the retrieved sources relevant to the query

Evaluation answers 3 questions
* Response and source nodes (retrieved context) match - Response + Source Nodes (Retrieved Context) - Hallucination check - `FaitfulnessEvaluator`
* Response, source nodes (context) and query match? - query + response + source nodes (context) `RelevancyEvaluator`
* Which of the retrieved source nodes used to generate a response? - query + response + individual source nodes (context)

##### 1. Response and source nodes match - Response + Source Nodes (Context) - Hallucination

* response object for a query returns both source nodes and response
* we evaluate here without taking into account query
* Checks for model hallucination

https://docs.llamaindex.ai/en/stable/understanding/evaluating/evaluating/

In [15]:
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import FaithfulnessEvaluator # outputs an EvaluationResult

In [35]:
# set llm and load evaluator
llm = OpenAI(model="gpt-4", temperature=0.0)
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)

In [17]:
# index and query engine already defined above
response = query_engine.query("What was Axis Bank's operating profit for the quarter ended 31st December 2023?")

# evalute_response takes in response object that has retrieved context and response both
# outputs an EvaluationResult object
eval_result = faithfulness_evaluator.evaluate_response(response=response)
# print(eval_result)
print(str(eval_result.passing))
print(str(eval_result.score))
print(str(eval_result.feedback))

True
1.0
YES


#### Evaluate each source context individually

In [18]:
response = query_engine.query("What was Axis Bank's operating profit for the quarter ended 31st December 2023?")
response_str = response.response
for source_node in response.source_nodes:
    eval_result = faithfulness_evaluator.evaluate(
        response=response_str, contexts=[source_node.get_content()]
    )
    print(str(eval_result.passing))

True
True
False


##### Display results as a dataframe

In [19]:

# to get response to output as a DF
import pandas as pd
from llama_index.core.evaluation import EvaluationResult
from llama_index.core import Response

In [24]:
# https://docs.llamaindex.ai/en/stable/examples/evaluation/faithfulness_eval/
# define jupyter display function
def display_eval_df(query: query, response: Response, eval_result: EvaluationResult) -> None:
    if response.source_nodes == []:
        print('no response received')
        return
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000],
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)

In [25]:
# index and query engine already defined above
query = "What was Axis Bank's operating profit for the quarter ended 31st December 2023?"
response = query_engine.query(query)

# evalute_response takes in response object that has retrieved context and response both
# outputs an EvaluationResult object
eval_result = faithfulness_evaluator.evaluate_response(response=response)

In [27]:
display_eval_df(query, response, eval_result)

Unnamed: 0,Query,Response,Source,Evaluation Result,Reasoning
0,What was Axis Bank's operating profit for the quarter ended 31st December 2023?,"Axis Bank's operating profit for the quarter ended 31st December 2023 was `9,141 crores.","## AXIS BANK ## PRESS RELEASE ## AXIS BANK ANNOUNCES FINANCIAL RESULTS ## FOR THE QUARTER AND NINE MONTHS ENDED 31st DECEMBER 2023 Q3FY24 Operating profit at `9,141 crores up 6% QOQ, PAT at `6,071 crores up 4% QOQ; Consolidated ROE at 18.61%, aided by a balanced sequential deposit and loan growth of 5% and 4% respectively 9MFY24 PAT at `17,732 crores, up 16% YOY; Consolidated ROE at 18.86%, up 82 bps YOY - Consolidated ROA at 1.84%, with 9 bps contributed by subsidiaries - Net Interest Income grew 9% YOY and 2% QOQ, Net Interest Margin at 4.01% - Fee income grew 29% YOY and 4% QOQ, Retail fee grew 36% YOY and 6% QOQ, granular fees at 93% of total fees - Core Operating revenue up 14% YOY and 2% QOQ - Bank’s total business grew 20% | 5% of which advances grew 22% | 4% and MEB1 deposits grew 18% | 5% on YOY | QOQ basis - On a MEB1, retail term deposits grew 17% YOY & 2% QOQ, CASA grew 12% YOY with CASA ratio at 42% - Retail loans up 27% | 5%, SME up 26% | 4%, Corporate loans (gross",Pass,YES


##### Run benchmarks on generated questions

In [28]:
# questions are gneerated above

In [29]:
import asyncio

In [36]:
def evaluate_query_engine(query_engine, questions):    
    # evaluate generated questions
    
    # limit simultaneous calls with asyncio to avoid rate limit error
    semaphore = asyncio.Semaphore(2)
    
    
    # initialize questions in list of query engine
    c = [query_engine.aquery(semaphore, q) for q in questions]

    # get results; execute concurrently using asyncio
    results = asyncio.run(asyncio.gather(*c))
    print('got the results')
    
    # evaluate using gpt4


In [None]:
# generate query engine
query_engine = index.as_query_engine(similarity_top_k=3)

In [39]:
evaluate_query_engine(query_engine, questions)

TypeError: BaseQueryEngine.aquery() takes 2 positional arguments but 3 were given

In [None]:
# calculate results
de

#### 2. Evaluate Query + Response Relvancy

Evaluates if retrieved context and answer is relevant and consistent with query `RelevancyEvaluator`

In [43]:
from llama_index.core.evaluation import RelevancyEvaluator

In [44]:
# build index and query engine - done above

# define llm for eval and instantiate relevancy evaluator
llm = OpenAI(model="gpt-4", temperature=0.0)
relevancy_evaluator = RelevancyEvaluator(llm=llm)


In [25]:
query = "What was Axis Bank's operating profit for the quarter ended 31st December 2023?"
response = query_engine.query(query)
eval_result = relevancy_evaluator.evaluate_response(query=query, response=response)
print(str(eval_result.passing))
print(str(eval_result.score))
print(str(eval_result.feedback))
# print(str(eval_result))

True
1.0
YES


In [32]:
dict(eval_result).keys()

dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source', 'invalid_result', 'invalid_reason'])

#### Evaluate on a specific source node

In [18]:
query = "What was Axis Bank's operating profit for the quarter ended 31st December 2023?"
response = query_engine.query(query)
response_str=response.response
for source_node in response.source_nodes:
    eval_result = relevancy_evaluator.evaluate(
        query=query,
        response=response_str,
        contexts=[source_node.get_content()],
    )
    print(str(eval_result.passing))

True
True
False


#### Batch Evaluation

* Run a set of evaluators across many questions

In [40]:
from llama_index.core.evaluation import BatchEvalRunner

In [50]:
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
    workers=1,
)
# https://docs.llamaindex.ai/en/stable/api_reference/evaluation/#llama_index.core.evaluation.BatchEvalRunner.aevaluate_queries
eval_results = await runner.aevaluate_queries(
    query_engine, queries=questions,
)

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.9240047099385316 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-lKk0TW9l7dSYVsQccT9CIjmj on tokens per min (TPM): Limit 40000, Used 39575, Requested 2811. Please try again in 3.579s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.


CancelledError: 

In [21]:
def get_eval_results(key, eval_results):
    results = eval_results[key]
    correct = 0
    for result in results:
        if result.passing:
            correct += 1
    score = correct / len(results)
    print(f"{key} Score: {score}")
    return score

In [22]:
score = get_eval_results("correctness", eval_results)
score

KeyError: 'correctness'

In [None]:
score = get_eval_results("relevancy", eval_results)

#### 2. Retrieval Evaluation
https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

* Are the retrieved sources relevant to the query

In [33]:
from llama_index.core.evaluation import RetrieverEvaluator

In [34]:
# define retriever and evaluator
retriever = index.as_retriever(similarity_top_k=3)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ['mrr', 'hit_rate'],
    retriever=retriever,
)

In [36]:
# perform evaluation
# retriever_evaluator.evaluate(
    # query="query",
    # expected_ids=['node_id1', 'node_id2', 'node_id3']
# )