### Generate test data 

In [1]:
import os
import json
from llama_index import download_loader
from ragas.testset import TestsetGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SemanticScholarReader = download_loader("SemanticScholarReader")
loader = SemanticScholarReader()
# narrow down the search space
query_space = "large language models"
# increase limit to get more documents
documents = loader.load_data(query=query_space, limit=10)

In [3]:
testsetgenerator = TestsetGenerator.from_default()
test_size = 5  # Number of samples to generate
testset = testsetgenerator.generate(documents, test_size=test_size)

15it [00:53,  3.55s/it]                                                                 


In [11]:
test_df = testset.to_pandas()

In [13]:
test_df

Unnamed: 0,question,context,answer,question_type,episode_done
0,What is the proposed method for reducing the n...,"- We propose Low-Rank Adaptation, or LoRA, whi...",The proposed method for reducing the number of...,simple,True
1,"What is the purpose of the ""Let's think step b...",- Providing these steps for prompting demonstr...,"The purpose of the ""Let's think step by step"" ...",simple,True
2,What technique improves the performance of lar...,"- ""We explore how generating a chain of though...",The technique that improves the performance of...,simple,True
3,What is Codex's success rate in problem-solvin...,"- On HumanEval, a new evaluation set we releas...",Codex's success rate in problem-solving accord...,reasoning,True
4,What are emergent abilities of large language ...,- Large Language Models are Zero-Shot Reasoner...,The emergent abilities of large language model...,simple,True


## Build your RAG & collect questions, retrieved context and generated answer

In [6]:
# attach to the same event-loop
import nest_asyncio

nest_asyncio.apply()

In [24]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext,OpenAIEmbedding
from langchain.embeddings import HuggingFaceEmbeddings

import pandas as pd

In [29]:
openai_model = OpenAIEmbedding()

flag_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

In [28]:

def build_query_engine(embed_model):
    vector_index = VectorStoreIndex.from_documents(
        documents, service_context=ServiceContext.from_defaults(chunk_size=512),
        embed_model=embed_model,
    )

    query_engine = vector_index.as_query_engine()
    return query_engine

In [18]:
test_questions = test_df['question'].values.tolist()
test_answers = [[item] for item in test_df['answer'].values.tolist()]

In [19]:
from ragas.metrics import (
    context_precision,
    context_recall,
)

metrics = [
    context_precision,
    context_recall,
]

In [20]:
from ragas.llama_index import evaluate
query_engine1 = build_query_engine(openai_model)
result = evaluate(query_engine1, metrics, test_questions, test_answers)

evaluating with [context_precision]


100%|█████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.76s/it]


evaluating with [context_recall]


100%|█████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.45s/it]


In [21]:
result

{'ragas_score': 0.3950, 'context_precision': 0.2622, 'context_recall': 0.8000}

In [22]:
result.to_pandas()

Unnamed: 0,question,contexts,answer,ground_truths,context_precision,context_recall
0,What is the proposed method for reducing the n...,[Training Compute-Optimal Large Language Model...,The proposed method for reducing the number of...,[The proposed method for reducing the number o...,0.076923,0.0
1,"What is the purpose of the ""Let's think step b...",[Chain of Thought Prompting Elicits Reasoning ...,"The purpose of the ""Let's think step by step"" ...","[The purpose of the ""Let's think step by step""...",0.055556,1.0
2,What technique improves the performance of lar...,[Large Language Models are Zero-Shot Reasoners...,"Adding the prompt ""Let's think step by step"" b...",[The technique that improves the performance o...,0.5,1.0
3,What is Codex's success rate in problem-solvin...,[A distinct production version of Codex powers...,Codex's success rate in problem-solving accord...,[Codex's success rate in problem-solving accor...,0.25,1.0
4,What are emergent abilities of large language ...,[Emergent Abilities of Large Language Models S...,Emergent abilities of large language models ar...,[The emergent abilities of large language mode...,0.428571,1.0


In [31]:
query_engine2 = build_query_engine(flag_model)
result = evaluate(query_engine2, metrics, test_questions, test_answers)

evaluating with [context_precision]


100%|█████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.40s/it]


evaluating with [context_recall]


100%|█████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.15s/it]


In [32]:
result

{'ragas_score': 0.3950, 'context_precision': 0.2622, 'context_recall': 0.8000}

In [33]:
result.to_pandas()

Unnamed: 0,question,contexts,answer,ground_truths,context_precision,context_recall
0,What is the proposed method for reducing the n...,[Training Compute-Optimal Large Language Model...,The proposed method for reducing the number of...,[The proposed method for reducing the number o...,0.076923,0.0
1,"What is the purpose of the ""Let's think step b...",[Chain of Thought Prompting Elicits Reasoning ...,"The purpose of the ""Let's think step by step"" ...","[The purpose of the ""Let's think step by step""...",0.055556,1.0
2,What technique improves the performance of lar...,[Large Language Models are Zero-Shot Reasoners...,"Adding the prompt ""Let's think step by step"" b...",[The technique that improves the performance o...,0.5,1.0
3,What is Codex's success rate in problem-solvin...,[A distinct production version of Codex powers...,Codex's success rate in problem-solving accord...,[Codex's success rate in problem-solving accor...,0.25,1.0
4,What are emergent abilities of large language ...,[Emergent Abilities of Large Language Models S...,Emergent abilities of large language models ar...,[The emergent abilities of large language mode...,0.428571,1.0
