https://www.llamaindex.ai/blog/building-and-evaluating-a-qa-system-with-llamaindex-3f02e9d87ce1

1. Question Generation

In [25]:
import os

import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

from llama_index.core.evaluation import DatasetGenerator
# from llama_index.core.llama_dataset.generator import RAGDatasetGenerator

In [3]:
LLAMAPARSE_API_KEY = os.environ.get('LLAMAPARSE_API_KEY')
if LLAMAPARSE_API_KEY is not None:
    print('API key found')
else:
    print('Check for API key in environment variable')

API key found


In [4]:
# instantiate parser
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown", # or text
    # num_workers=4 # for multiple files
    verbose=True,
    language="en", # default is english
)

In [5]:
# load document and parse it 
documents = parser.load_data('../data/axis-press-release-q3fy24.pdf')

Started parsing the file under job_id c478fefa-cb4d-4efe-88eb-2df1d6dccd6c


#### Generate Questions

In [6]:
data_generator = DatasetGenerator.from_documents(documents)
questions = data_generator.generate_questions_from_nodes()

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [7]:
questions

["What was Axis Bank's operating profit for the quarter ended 31st December 2023?",
 "How much did Axis Bank's PAT increase by quarter on quarter for Q3FY24?",
 'What was the consolidated ROE for Axis Bank for the nine months ended 31st December 2023?',
 'How much did the fee income grow year on year for Axis Bank?',
 'What was the growth percentage of retail loans for Axis Bank on a year on year basis?',
 'What was the GNPA% for Axis Bank in Q3FY24 and how did it change year on year?',
 'How many credit cards were issued by Axis Bank in Q3FY24?',
 'What is the market share of Axis Bank in the CIF market for credit cards?',
 'What new digital banking solution was launched by Axis Bank in the quarter?',
 'What awards did Axis Bank win during the quarter, according to the press release?',
 "What was the focus of Axis Bank's 'Sparsh Week' initiative?",
 "How did Axis Bank celebrate 'Sparsh Week' and how many employees were reached through this initiative?",
 'What was the growth rate of N

#### Generate Answers from source nodes (Context)

In [8]:
from llama_index.core import VectorStoreIndex, load_index_from_storage, StorageContext

In [9]:
# build index
index = VectorStoreIndex.from_documents(documents)

# save to disk
index.set_index_id("axis_pr_vector_index")
index.storage_context.persist('../data/storage')

In [10]:
# rebuild storage
storage_context = StorageContext.from_defaults(persist_dir='../data/storage')
# load index
index = load_index_from_storage(storage_context, index_id="axis_pr_vector_index")

In [11]:
# generate query engine
query_engine = index.as_query_engine(similarity_top_k=3)

#### Evaluate answers
https://docs.llamaindex.ai/en/stable/module_guides/evaluating/

##### Measuring RAG App performance

* Response Evaluation - Does the response match the 
    * Retrieved context - Faithfulness; i.e. checks for hallucination
    * Query - context relevancy and answer relevancy
    * Reference Answer or guidelines

* Retrieval Evaluation - Are the retrieved sources relevant to the query

Evaluation answers 3 questions
* Response and source nodes match - Response + Source Nodes (Context) - Hallucination - `FaitfulnessEvaluator`
* Response, source nodes (context) and query match? - query + response + source nodes (context) `RelevancyEvaluator`
* Which of the retrieved source nodes used to generate a response? - query + response + individual source nodes (context)

##### 1. Response and source nodes match - Response + Source Nodes (Context) - Hallucination

* response object for a query returns both source nodes and response
* we evaluate here without taking into account query
* Checks for model hallucination

https://docs.llamaindex.ai/en/stable/understanding/evaluating/evaluating/

In [12]:
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import FaithfulnessEvaluator

In [28]:
# set llm and load evaluator
llm = OpenAI(model="gpt-4", temperature=0.0)
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)

In [29]:
# index and query engine already defined above
response = query_engine.query("What was Axis Bank's operating profit for the quarter ended 31st December 2023?")
eval_result = faithfulness_evaluator.evaluate_response(response=response)
# print(eval_result)
print(str(eval_result.passing))

True


#### Evaluate each source context individually

In [30]:
response = query_engine.query("What was Axis Bank's operating profit for the quarter ended 31st December 2023?")
response_str = response.response
for source_node in response.source_nodes:
    eval_result = faithfulness_evaluator.evaluate(
        response=response_str, contexts=[source_node.get_content()]
    )
    print(str(eval_result.passing))

True
True
False


#### 2. Evaluate Query + Response Relvancy

Evaluates if retrieved context and answer is relevant and consistent with query `RelevancyEvaluator`

In [20]:
from llama_index.core.evaluation import RelevancyEvaluator

In [31]:
# build index and query engine - done above
relevancy_evaluator = RelevancyEvaluator(llm=llm)


In [32]:
query = "What was Axis Bank's operating profit for the quarter ended 31st December 2023?"
response = query_engine.query(query)
eval_result = relevancy_evaluator.evaluate_response(query=query, response=response)
print(str(eval_result))

query="What was Axis Bank's operating profit for the quarter ended 31st December 2023?" contexts=["## AXIS BANK\n\n## PRESS RELEASE\n\n## AXIS BANK ANNOUNCES FINANCIAL RESULTS\n\n## FOR THE QUARTER AND NINE MONTHS ENDED 31st DECEMBER 2023\n\nQ3FY24 Operating profit at `9,141 crores up 6% QOQ, PAT at `6,071 crores up 4% QOQ; Consolidated ROE at 18.61%, aided by a balanced sequential deposit and loan growth of 5% and 4% respectively\n\n9MFY24 PAT at `17,732 crores, up 16% YOY; Consolidated ROE at 18.86%, up 82 bps YOY\n\n- Consolidated ROA at 1.84%, with 9 bps contributed by subsidiaries\n- Net Interest Income grew 9% YOY and 2% QOQ, Net Interest Margin at 4.01%\n- Fee income grew 29% YOY and 4% QOQ, Retail fee grew 36% YOY and 6% QOQ, granular fees at 93% of total fees\n- Core Operating revenue up 14% YOY and 2% QOQ\n- Bank’s total business grew 20% | 5% of which advances grew 22% | 4% and MEB1 deposits grew 18% | 5% on YOY | QOQ basis\n- On a MEB1, retail term deposits grew 17% YOY & 2

#### Evaluate on a specific source node

In [33]:
query = "What was Axis Bank's operating profit for the quarter ended 31st December 2023?"
response = query_engine.query(query)
response_str=response.response
for source_node in response.source_nodes:
    eval_result = relevancy_evaluator.evaluate(
        query=query,
        response=response_str,
        contexts=[source_node.get_content()],
    )
    print(str(eval_result.passing))

True
True
False


#### Batch Evaluation

* Run a set of evaluators across many questions

In [26]:
from llama_index.core.evaluation import BatchEvalRunner

In [38]:
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
    workers=1,
)

eval_results = await runner.aevaluate_queries(
    query_engine, queries=questions,
)

In [35]:
def get_eval_results(key, eval_results):
    results = eval_results[key]
    correct = 0
    for result in results:
        if result.passing:
            correct += 1
    score = correct / len(results)
    print(f"{key} Score: {score}")
    return score

In [36]:
score = get_eval_results("correctness", eval_results)
score

NameError: name 'eval_results' is not defined

In [None]:
score = get_eval_results("relevancy", eval_results)

#### 2. Retrieval Evaluation
https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

* Are the retrieved sources relevant to the query

In [15]:
from llama_index.core.evaluation import RetrieverEvaluator

In [16]:
# define retriever and evaluator
retriever = index.as_retriever(similarity_top_k=3)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ['mrr', 'hit_rate'],
    retriever=retriever,
)

In [18]:
# perform evaluation
retriever_evaluator.evaluate(
    query="query",
    expected_ids=['node_id1', 'node_id2', 'node_id3']
)

RetrievalEvalResult(query='query', expected_ids=['node_id1', 'node_id2', 'node_id3'], expected_texts=None, retrieved_ids=['e3bba795-3145-409c-b341-45cc1e0dcbc3', '4d0117bd-8f10-4e59-afe8-df2763a962e6', '3254b367-5984-4d6a-8bc1-c542020f2193'], retrieved_texts=['Amitabh Chaudhry, MD&CEO, Axis Bank said, “The conversations on India are buoyant and it’s being looked upon as an important investment destination, evident in discussions at global platforms like the World Economic Forum. The Indian economic momentum has been strong in FY24, and we believe the trend will continue well into FY25. At Axis Bank, our focus has been on sustainable and inclusive growth, with customer taking the centerstage in every discussion. This quarter we celebrated ‘Sparsh Week’, a week-long agenda focused on educative customer centric activities, with 15 events covering 5000+ branches and retail asset centers, reaching out to 95000+ employees.”\n\n1 Monthly End balances; 2 Inter Bank Participation Certificates 3