## Document loading

In [1]:
from pprint import pp
%reload_ext dotenv
%dotenv

In [2]:
import pandas as pd

pd.set_option(
    "display.max_colwidth", None
)

In [3]:
from llama_index.core import Document
import os
from elasticsearch import Elasticsearch

def get_documents_from_es_sources():
    client = Elasticsearch(
        cloud_id=os.getenv("es_cloud_id"),  # found within the deployment page
        basic_auth=(os.getenv("es_user"), os.getenv("es_password"))
    )

    index_name = "paper"
    query = {
        "match_all": {}
    }

    response = client.search(index=index_name, query=query, _source=["content", "metadata"], size=1000)
    print(len(response['hits']['hits']))

    documents = []

    for hit in response['hits']['hits']:
        doc = Document(
            metadata=hit['_source']['metadata'],
            text=hit['_source']['content']
        )
        documents.append(doc)
    return documents

In [4]:
from indexing.components.loading import DocumentReader
from indexing.components.indexing import ChunkStrategyModule

def get_documents_from_simple_chunking():
    rag_documents = DocumentReader(file_path=["indexing/data/papers/rag_survey.pdf"]).pdf_reader()
    eval_documents = DocumentReader(file_path=["indexing/data/papers/eval_survey.pdf"]).pdf_reader()

    documents = rag_documents + eval_documents

    print(f"Document counts: {len(documents)}")

    nodes = ChunkStrategyModule(documents=documents).base_parser(chunk_size=512, chunk_overlap=32)

    print(f"Node counts: {len(nodes)}")
    return nodes

In [5]:
documents = get_documents_from_simple_chunking()

Document counts: 41
Node counts: 112


In [6]:
doc_id_2_gen_questions = {}

for document in documents:
    doc_id_2_gen_questions[document.id_] = []

## Setup agents for question generation

In [7]:
from inferences.components.models import get_bedrock_li_text_model, get_openai_model, ModelEnum


def call_llm(prompt: str):
    llm = get_openai_model(
        temperature=0.5,
        max_tokens=1000
    )

    return llm.complete(
        prompt
    ).text

In [20]:
# Your factoid question and answer should be unique in meaning to any of the given previous (question, answer) list.
# Now here is the previous questions.
# Previous questions list: {previous_questions}

In [9]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [10]:
from tqdm import tqdm
import random

N_GENERATIONS = 150

outputs = []

for document in tqdm(random.choices(documents, k=N_GENERATIONS)):
    output_QA_couple = call_llm(QA_generation_prompt.format(context=document.text, previous_questions=doc_id_2_gen_questions[document.id_]))
    try:
        question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
        answer = output_QA_couple.split("Answer: ")[-1]
        assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "question": question.strip(),
                "context": document.text.strip(),
                "answer": answer.strip()
            }
        )
        doc_id_2_gen_questions[document.id_].append((question.strip(), answer.strip()))
    except Exception:
        continue

100%|██████████| 150/150 [02:52<00:00,  1.15s/it]


In [11]:
display(pd.DataFrame(outputs))

Unnamed: 0,question,context,answer
0,What is reranking in the RAG system?,"10\nintroduces an innovative method for integrating knowledge\ninto white-box models via directive fine-tuning [75]. In this\napproach, the retriever module is directly substituted to gen-\nerate relevant documents according to a query. This method\nassists in addressing the difficulties encountered during the\nfine-tuning process and enhances model performance.\nIV. G ENERATION\nAfter retrieval, it is not a good practice to directly input all\nthe retrieved information to the LLM for answering questions.\nFollowing will introduce adjustments from two perspectives:\nadjusting the retrieved content and adjusting the LLM.\nA. Context Curation\nRedundant information can interfere with the final gener-\nation of LLM, and overly long contexts can also lead LLM\nto the “Lost in the middle” problem [98]. Like humans, LLM\ntends to only focus on the beginning and end of long texts,\nwhile forgetting the middle portion. Therefore, in the RAG\nsystem, we typically need to further process the retrieved\ncontent.\n1) Reranking: Reranking fundamentally reorders document\nchunks to highlight the most pertinent results first, effectively\nreducing the overall document pool, severing a dual purpose\nin information retrieval, acting as both an enhancer and a\nfilter, delivering refined inputs for more precise language\nmodel processing [70]. Reranking can be performed using\nrule-based methods that depend on predefined metrics like\nDiversity, Relevance, and MRR, or model-based approaches\nlike Encoder-Decoder models from the BERT series (e.g.,\nSpanBERT), specialized reranking models such as Cohere\nrerank or bge-raranker-large, and general large language mod-\nels like GPT [12], [99].\n2) Context Selection/Compression: A common misconcep-\ntion in the RAG process is the belief that retrieving as many\nrelevant documents as possible and concatenating them to form\na lengthy retrieval prompt is beneficial. However, excessive\ncontext can introduce more noise, diminishing the LLM’s\nperception of key information .","Reranking fundamentally reorders document chunks to highlight the most pertinent results first, effectively reducing the overall document pool."
1,What is the main focus of the survey on RAG evaluation?,"Evaluation of Retrieval-Augmented Generation: A Survey 3\nmethods, indicators, and tools, particularly given the black-box LLM generation. Eval-\nuating RAG systems thus involves considering quite a few specific components and\nthe complexity of overall system assessment. On the other hand, the complexity of\nRAG systems is further compounded by the dynamic external database and the various\ndownstream tasks, such as content creation or open domain question answering [14,53].\nThese challenges necessitate the development of comprehensive evaluation metrics that\ncan effectively capture the interplay between retrieval accuracy and generative qual-\nity [2,6]. To clarify the elements further, we conducted this survey on RAG evalua-\ntion to address the current gaps in the area, which differs from the prior RAG surveys\n[57,14,21] that predominantly collected specific RAG methods or data. We have com-\npiled 12 distinct evaluation frameworks, encompassing a range of aspects of the RAG\nsystem. We conduct a comparative analysis and synthesize the specific evaluation meth-\nods of various components, focusing on aspects such as accuracy, faithfulness, and rel-\nevance. We also discuss the constraints of the existing methodology and the prospects\nfor future RAG evaluations. We hope to provide the readers with a comprehensive un-\nderstanding of the RAG evaluation.\nFor this paper, we contribute in the following aspects:\n1.Challenge of Evaluation : This is the first work that summarize and classify the\nchallenges in evaluating RAG systems through the structure of RAG systems, in-\ncluding three parts retrieval, generation, and the whole system.\n2.Analysis Framework : Based on the challenges, we propose an analysis framework\n(RGAG ) for RAG benchmarks, which is designed to navigate the unique complex-\nities inherent to RAG systems, offering a fundamental methodology for assessing\ntheir efficacy across many facets.\n3.RAG Benchmark Analysis : With the help of the RGAG framework, we provide a\ncomprehensive analysis of existing RAG benchmarks, highlighting their strengths\nand limitations and proposing recommendations for future developments in RAG\nsystem evaluation.\n2 Challenges in Evaluating RAG Systems\nEvaluating hybrid RAG systems entails evaluating retrieval, generation and the RAG\nsystem as a whole. These evaluations are multifaceted, requiring careful consideration\nand analysis.",The main focus of the survey on RAG evaluation is to address the current gaps in the area and provide a comprehensive understanding of the RAG evaluation.
2,What is the core task of RAG models?,"12\nprobability of generated terms [24]. When the probability falls\nbelow a certain threshold would activates the retrieval system\nto collect relevant information, thus optimizing the retrieval\ncycle. Self-RAG [25] introduces “reflection tokens” that allow\nthe model to introspect its outputs. These tokens come in\ntwo varieties: “retrieve” and “critic”. The model autonomously\ndecides when to activate retrieval, or alternatively, a predefined\nthreshold may trigger the process. During retrieval, the gen-\nerator conducts a fragment-level beam search across multiple\nparagraphs to derive the most coherent sequence. Critic scores\nare used to update the subdivision scores, with the flexibility\nto adjust these weights during inference, tailoring the model’s\nbehavior. Self-RAG’s design obviates the need for additional\nclassifiers or reliance on Natural Language Inference (NLI)\nmodels, thus streamlining the decision-making process for\nwhen to engage retrieval mechanisms and improving the\nmodel’s autonomous judgment capabilities in generating ac-\ncurate responses.\nVI. T ASK AND EVALUATION\nThe rapid advancement and growing adoption of RAG\nin the field of NLP have propelled the evaluation of RAG\nmodels to the forefront of research in the LLMs community.\nThe primary objective of this evaluation is to comprehend\nand optimize the performance of RAG models across diverse\napplication scenarios.This chapter will mainly introduce the\nmain downstream tasks of RAG, datasets, and how to evaluate\nRAG systems.\nA. Downstream Task\nThe core task of RAG remains Question Answering (QA),\nincluding traditional single-hop/multi-hop QA, multiple-\nchoice, domain-specific QA as well as long-form scenarios\nsuitable for RAG. In addition to QA, RAG is continuously\nbeing expanded into multiple downstream tasks, such as Infor-\nmation Extraction (IE), dialogue generation, code search, etc.\nThe main downstream tasks of RAG and their corresponding\ndatasets are summarized in Table II.\nB. Evaluation Target\nHistorically, RAG models assessments have centered on\ntheir execution in specific downstream tasks. These evaluations\nemploy established metrics suitable to the tasks at hand.",The core task of RAG models is Question Answering (QA).
3,"What is the title of the paper by X. Li, R. Zhao, Y. K. Chia, B. Ding, L. Bing, S. Joty, and S. Poria?","Yih, “Dense passage retrieval for open-domain question\nanswering,” arXiv preprint arXiv:2004.04906 , 2020.\n[103] Y . Ma, Y . Cao, Y . Hong, and A. Sun, “Large language model is\nnot a good few-shot information extractor, but a good reranker for\nhard samples!” ArXiv , vol. abs/2303.08559, 2023. [Online]. Available:\nhttps://api.semanticscholar.org/CorpusID:257532405\n[104] J. Cui, Z. Li, Y . Yan, B. Chen, and L. Yuan, “Chatlaw: Open-source\nlegal large language model with integrated external knowledge bases,”\narXiv preprint arXiv:2306.16092 , 2023.\n[105] O. Yoran, T. Wolfson, O. Ram, and J. Berant, “Making retrieval-\naugmented language models robust to irrelevant context,” arXiv\npreprint arXiv:2310.01558 , 2023.\n[106] X. Li, R. Zhao, Y . K. Chia, B. Ding, L. Bing, S. Joty, and S. Poria,\n“Chain of knowledge: A framework for grounding large language mod-\nels with structured knowledge bases,” arXiv preprint arXiv:2305.13269 ,\n2023.\n[107] H. Yang, S. Yue, and Y . He, “Auto-gpt for online decision\nmaking: Benchmarks and additional opinions,” arXiv preprint\narXiv:2306.02224 , 2023.\n[108] T. Schick, J. Dwivedi-Yu, R. Dess `ı, R. Raileanu, M. Lomeli, L. Zettle-\nmoyer, N. Cancedda, and T. Scialom, “Toolformer: Language models\ncan teach themselves to use tools,” arXiv preprint arXiv:2302.04761 ,\n2023.",Chain of knowledge: A framework for grounding large language models with structured knowledge bases
4,"What is the title of the paper authored by S. Zhuang, B. Liu, B. Koopman, and G. Zuccon?","Ling, “Corrective retrieval\naugmented generation,” arXiv preprint arXiv:2401.15884 , 2024.\n[68] P. Jain, L. B. Soares, and T. Kwiatkowski, “1-pager: One pass answer\ngeneration and evidence retrieval,” arXiv preprint arXiv:2310.16568 ,\n2023.\n[69] H. Yang, Z. Li, Y . Zhang, J. Wang, N. Cheng, M. Li, and J. Xiao, “Prca:\nFitting black-box large language models for retrieval question answer-\ning via pluggable reward-driven contextual adapter,” arXiv preprint\narXiv:2310.18347 , 2023.\n[70] S. Zhuang, B. Liu, B. Koopman, and G. Zuccon, “Open-source large\nlanguage models are strong zero-shot query likelihood models for\ndocument ranking,” arXiv preprint arXiv:2310.13243 , 2023.\n[71] F. Xu, W. Shi, and E. Choi, “Recomp: Improving retrieval-augmented\nlms with compression and selective augmentation,” arXiv preprint\narXiv:2310.04408 , 2023.\n[72] W. Shi, S. Min, M. Yasunaga, M. Seo, R. James, M. Lewis, L. Zettle-\nmoyer, and W.-t. Yih, “Replug: Retrieval-augmented black-box lan-\nguage models,” arXiv preprint arXiv:2301.12652 , 2023.\n[73] E. Melz, “Enhancing llm intelligence with arm-rag: Auxiliary ra-\ntionale memory for retrieval augmented generation,” arXiv preprint\narXiv:2311.04177 , 2023.\n[74] H. Wang, W. Huang, Y . Deng, R. Wang, Z. Wang, Y . Wang, F. Mi,\nJ. Z. Pan, and K.-F.","""Open-source large language models are strong zero-shot query likelihood models for document ranking"""
...,...,...,...
145,"When was the paper ""The cot collection: Improving zero-shot and few-shot learning of language models via chain-of-thought fine-tuning"" published?","[142] O. Levy, M. Seo, E. Choi, and L. Zettlemoyer, “Zero-shot relation ex-\ntraction via reading comprehension,” arXiv preprint arXiv:1706.04115 ,\n2017.\n[143] R. Zellers, A. Holtzman, Y . Bisk, A. Farhadi, and Y . Choi, “Hel-\nlaswag: Can a machine really finish your sentence?” arXiv preprint\narXiv:1905.07830 , 2019.\n[144] S. Kim, S. J. Joo, D. Kim, J. Jang, S. Ye, J. Shin, and M. Seo,\n“The cot collection: Improving zero-shot and few-shot learning of\nlanguage models via chain-of-thought fine-tuning,” arXiv preprint\narXiv:2305.14045 , 2023.\n[145] A. Saha, V . Pahuja, M. Khapra, K. Sankaranarayanan, and S. Chandar,\n“Complex sequential question answering: Towards learning to converse\nover linked question answer pairs with a knowledge graph,” in Proceed-\nings of the AAAI conference on artificial intelligence , vol. 32, no. 1,\n2018.\n[146] D. Hendrycks, C. Burns, S. Basart, A. Zou, M. Mazeika, D. Song, and\nJ. Steinhardt, “Measuring massive multitask language understanding,”\narXiv preprint arXiv:2009.03300 , 2020.\n[147] S. Merity, C. Xiong, J. Bradbury, and R. Socher, “Pointer sentinel\nmixture models,” arXiv preprint arXiv:1609.07843 , 2016.\n[148] M. Geva, D. Khashabi, E. Segal, T. Khot, D. Roth, and J. Berant,\n“Did aristotle use a laptop? a question answering benchmark with\nimplicit reasoning strategies,” Transactions of the Association for\nComputational Linguistics , vol. 9, pp. 346–361, 2021.",2023
146,"When was the paper ""Best Practices for LLM Evaluation of RAG Applications"" published?","Transactions of the Association for Computational Linguistics 7, 453–\n466 (2019). https://doi.org/10.1162/tacl_a_00276 ,https://doi.org/\n10.1162/tacl_a_00276\n26. Lanchantin, J., Toshniwal, S., Weston, J., Szlam, A., Sukhbaatar, S.: Learning to reason\nand memorize with self-notes (May 2023). https://doi.org/10.48550/ARXIV.\n2305.00833\n27. LangChain: Evaluating rag architectures on benchmark tasks (Nov 2023), https:\n//langchain-ai.github.io/langchain-benchmarks/notebooks/\nretrieval/langchain_docs_qa.html\n28. Leng, Q., Uhlenhuth, K., Polyzotis, A.: Best Practices for LLM Evaluation of\nRAG Applications (Dec 2023), https://www.databricks.com/blog/\nLLM-auto-eval-best-practices-RAG\n29. Lewis, P., Perez, E., Piktus, A., Petroni, F., Karpukhin, V ., Goyal, N., Küttler, H., Lewis,\nM., Yih, W.t., Rocktäschel, T., Riedel, S., Kiela, D.: Retrieval-augmented generation for\nknowledge-intensive NLP tasks. In: Proceedings of the 34th International Conference on\nNeural Information Processing Systems. pp. 9459–9474. NIPS’20, Curran Associates Inc.,\nRed Hook, NY , USA (Dec 2020)",December 2023
147,What is the analysis framework proposed for RAG benchmarks?,"Diverse and compre-\nhensive datasets that accurately reflect real-world scenarios are crucial. Challenges also\narise in the realm of metrics, encompassing generative evaluation criteria for distinct\ndownstream tasks, human preferences, and practical considerations within the RAG\nsystem. Most prior benchmarks predominantly tackle one or several aspects of the RAG\nassessment but lack a comprehensive, holistic analysis.\nTo provide a better understanding of RAG benchmarks, we propose an analysis\nframework named RGAR (Retrieval, Generation, and Additional Requirement). It takes\ninto account the Target ,Dataset , and Metric respectively. The Target module is in-\ntended to determine the evaluation direction. The Dataset module facilitates the com-\nparison of various data constructions in RAG benchmarks. The final module, Metrics,\nintroduces the metrics that correspond to specific targets and datasets used during eval-\nuation. Overall, it is designed to provide a systematic methodology for assessing the\neffectiveness of RAG systems across various aspects by covering all possible pairs be-\ntween the “Evaluable Outputs” (EOs) and “Ground Truths” (GTs). In the following","RGAR (Retrieval, Generation, and Additional Requirement)"
148,What is the title of the paper with the arXiv preprint number arXiv:2310.11511?,"Yang,\nJ. Callan, and G. Neubig, “Active retrieval augmented generation,”\narXiv preprint arXiv:2305.06983 , 2023.\n[25] A. Asai, Z. Wu, Y . Wang, A. Sil, and H. Hajishirzi, “Self-rag:\nLearning to retrieve, generate, and critique through self-reflection,”\narXiv preprint arXiv:2310.11511 , 2023.\n[26] Z. Ke, W. Kong, C. Li, M. Zhang, Q. Mei, and M. Bendersky,\n“Bridging the preference gap between retrievers and llms,” arXiv\npreprint arXiv:2401.06954 , 2024.\n[27] X. V . Lin, X. Chen, M. Chen, W. Shi, M. Lomeli, R. James, P. Ro-\ndriguez, J. Kahn, G. Szilvasy, M. Lewis et al. , “Ra-dit: Retrieval-\naugmented dual instruction tuning,” arXiv preprint arXiv:2310.01352 ,\n2023.\n[28] O. Ovadia, M. Brief, M. Mishaeli, and O. Elisha, “Fine-tuning or\nretrieval? comparing knowledge injection in llms,” arXiv preprint\narXiv:2312.05934 , 2023.\n[29] T. Lan, D. Cai, Y . Wang, H. Huang, and X.-L. Mao, “Copy is all\nyou need,” in The Eleventh International Conference on Learning\nRepresentations , 2022.\n[30] T. Chen, H. Wang, S. Chen, W. Yu, K. Ma, X. Zhao, D. Yu, and\nH. Zhang, “Dense x retrieval: What retrieval granularity should we\nuse?” arXiv preprint arXiv:2312.06648 , 2023.","Self-rag: Learning to retrieve, generate, and critique through self-reflection"


In [12]:
doc_id_2_gen_questions

{'2cccb30f-7b79-427c-81c2-1e224650cdb4': [],
 '34899a27-1a74-4629-9004-a28103f310f7': [('What technology enhances language models by retrieving relevant document chunks from an external knowledge base through semantic similarity calculation?',
   'Retrieval-Augmented Generation (RAG)'),
  ('What technology enhances LLMs by retrieving relevant document chunks from external knowledge bases through semantic similarity calculation?',
   'Retrieval-Augmented Generation (RAG)')],
 '208d02e1-dbf9-4268-ba40-60da18c11304': [('What are the core stages analyzed in the paper in relation to RAG?',
   '"Retrieval," "Generation," and "Augmentation" stages are analyzed in the paper in relation to RAG.')],
 '7c934bad-f6d1-4909-bb8f-15e91fd7421b': [('What are the main components integral to the RAG process?',
   '"Retrieval", "Generation", and "Augmentation"'),
  ('What are the three main components integral to the RAG process?',
   '"Retrieval", "Generation", and "Augmentation" are the three main compo

In [13]:
from pathlib import Path
from datetime import datetime


def get_current_timestamp():
    now = datetime.now()
    return str(now).split(".")[0].replace(" ", "-").replace(":", "-")

cur_dir_str = f"evaluations/{get_current_timestamp()}"
cur_dir = Path(cur_dir_str)
cur_dir.mkdir(parents=True, exist_ok=True)

pd.DataFrame(outputs).to_csv(f"{cur_dir_str}/intermediate_output.csv", index=False)

## Setup critique agents

In [14]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building Retrieval-augmented generation(RAG) application with the RAG survey paper.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [15]:
for output in tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(question_groundedness_critique_prompt.format(
            context=output["context"], question=output["question"]
        )),
        "relevance": call_llm(question_relevance_critique_prompt.format(
            question=output["question"]
        )),
        "standalone": call_llm(question_standalone_critique_prompt.format(
            question=output["question"]
        ))
    }

    try:
        for criterion, evaluation in evaluations.items():
            score, eval = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
            output.update(
                {
                    f"{criterion}_score": score,
                    f"{criterion}_eval": eval
                }
            )
    except Exception as e:
        continue

100%|██████████| 150/150 [09:16<00:00,  3.71s/it]


In [16]:
import pandas as pd
import datasets

pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)

generated_questions = generated_questions.loc[
    (generated_questions["groundedness_score"] >= 4)
    & (generated_questions["relevance_score"] >= 4)
    & (generated_questions["standalone_score"] >= 4)
]
print("============================================")
print("Final evaluation dataset:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,What is reranking in the RAG system?,"Reranking fundamentally reorders document chunks to highlight the most pertinent results first, effectively reducing the overall document pool.",5.0,,
1,What is the main focus of the survey on RAG evaluation?,The main focus of the survey on RAG evaluation is to address the current gaps in the area and provide a comprehensive understanding of the RAG evaluation.,5.0,4.0,5.0
2,What is the core task of RAG models?,The core task of RAG models is Question Answering (QA).,5.0,5.0,5.0
3,"What is the title of the paper by X. Li, R. Zhao, Y. K. Chia, B. Ding, L. Bing, S. Joty, and S. Poria?",Chain of knowledge: A framework for grounding large language models with structured knowledge bases,5.0,1.0,5.0
4,"What is the title of the paper authored by S. Zhuang, B. Liu, B. Koopman, and G. Zuccon?","""Open-source large language models are strong zero-shot query likelihood models for document ranking""",,,
...,...,...,...,...,...
145,"When was the paper ""The cot collection: Improving zero-shot and few-shot learning of language models via chain-of-thought fine-tuning"" published?",2023,5.0,4.0,5.0
146,"When was the paper ""Best Practices for LLM Evaluation of RAG Applications"" published?",December 2023,5.0,4.0,5.0
147,What is the analysis framework proposed for RAG benchmarks?,"RGAR (Retrieval, Generation, and Additional Requirement)",5.0,4.0,5.0
148,What is the title of the paper with the arXiv preprint number arXiv:2310.11511?,"Self-rag: Learning to retrieve, generate, and critique through self-reflection",5.0,3.0,5.0


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
1,What is the main focus of the survey on RAG evaluation?,The main focus of the survey on RAG evaluation is to address the current gaps in the area and provide a comprehensive understanding of the RAG evaluation.,5.0,4.0,5.0
2,What is the core task of RAG models?,The core task of RAG models is Question Answering (QA).,5.0,5.0,5.0
5,"When was the paper ""Judging llm-as-a-judge with mt-bench and chatbot arena"" published?",June 2023,5.0,4.0,5.0
6,What does the Relevance evaluation in the RAG system measure?,It measures the precision and specificity of the retrieval process.,5.0,5.0,5.0
7,"What does the metric ""Single Query Latency"" measure in RAG systems?","The average time taken to process a single query, including both retrieval and generating phases.",5.0,4.0,5.0
...,...,...,...,...,...
143,What are some of the metrics used for evaluating the aspects of RAG models?,"Accuracy, EM, Recall, Precision, R-Rate, Cosine Similarity, Hit Rate, MRR, NDCG, BLEU, ROUGE/ROUGE-L",5.0,5.0,5.0
144,What technology enhances LLMs by retrieving relevant document chunks from external knowledge bases through semantic similarity calculation?,Retrieval-Augmented Generation (RAG),5.0,5.0,5.0
145,"When was the paper ""The cot collection: Improving zero-shot and few-shot learning of language models via chain-of-thought fine-tuning"" published?",2023,5.0,4.0,5.0
146,"When was the paper ""Best Practices for LLM Evaluation of RAG Applications"" published?",December 2023,5.0,4.0,5.0


In [17]:
print(f"eval counts: {len(generated_questions)}/{len(outputs)}")

eval counts: 82/150


In [18]:
generated_questions.to_csv(f"{cur_dir_str}/eval.csv", index=False)

In [19]:
print(cur_dir_str)

evaluations/2024-05-29-13-14-54
