## Loading Datasets


In [1]:
from datasets import load_dataset

# Load the validation split of NQ-Open (using the cache directory below)
# and keep only rows 0-9 as a small evaluation sample.
hf_dataset = load_dataset(
    path="google-research-datasets/nq_open",
    split="validation",
    cache_dir="/mnt/d/datasets/nq_open",
).select(range(10))

## Evaluating the `RAG` pipeline


### Preparing dataset

In [None]:
from pathlib import Path

import pandas as pd

from agents.rag import RAG

# Run every sampled question through the RAG pipeline, collect one
# ragas-style record per question, and persist the records as CSV.

# NOTE(review): the saved output of this cell shows
# "ValueError: Documents and their embeddings are not loaded." on the first
# iteration, so `RAG()` presumably needs its documents/embeddings loaded
# before `generate_response` is called -- confirm against agents.rag.
rag = RAG()
ragas_dataset = []

for i, item in enumerate(hf_dataset):
    print(f"{i+1} iteration")
    question = item["question"]
    answer = item["answer"]

    # nq_open stores `answer` as a list of acceptable answer strings, while
    # the ragas `reference` field is a single string: join the aliases so the
    # record validates and no accepted answer is dropped.
    if isinstance(answer, (list, tuple)):
        answer = "; ".join(answer)

    response = rag.generate_response(question)
    ragas_dataset.append(
        {
            "user_input": question,
            # assumes `relevant_docs` is a list of strings, which is what
            # ragas expects for `retrieved_contexts` -- TODO confirm
            "retrieved_contexts": response["relevant_docs"],
            "response": response["content"],
            "reference": answer,
        }
    )

df = pd.DataFrame(ragas_dataset)

# Create the target directory first so a missing folder cannot discard the
# responses generated by the loop above.
generated_path = Path("../data/generated/rag.csv")
generated_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(generated_path, index=False)

1 iteration


ValueError: Documents and their embeddings are not loaded.

### Evaluating dataset

In [None]:
# Metric classes and the LLM wrapper used to score the RAG outputs.
from langchain_ollama import ChatOllama
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    AnswerCorrectness,
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
    Faithfulness,
)

# Build the judge model in this cell. Previously `evaluator_llm` was read here
# but only assigned in a later cell, which raises NameError on a fresh kernel
# (and re-running the cell wrapped an already-wrapped LLM).
evaluator_llm = LangchainLLMWrapper(
    ChatOllama(
        model="mistral:7b",
        temperature=0.1,
    )
)

# NOTE(review): AnswerRelevancy and AnswerCorrectness presumably also need an
# embeddings model; none is configured here, so ragas would fall back to its
# default embeddings -- confirm that this is intended for a local Ollama setup.
metrics = [
    Faithfulness(llm=evaluator_llm),
    AnswerRelevancy(llm=evaluator_llm),
    ContextPrecision(llm=evaluator_llm),
    ContextRecall(llm=evaluator_llm),
    AnswerCorrectness(llm=evaluator_llm),
]

In [None]:
from pathlib import Path

from langchain_ollama import ChatOllama
from ragas import EvaluationDataset, evaluate

# Judge LLM handed to `evaluate`.
evaluator_llm = ChatOllama(
    model="mistral:7b",
    temperature=0.1,
)

# `evaluate` expects an EvaluationDataset (or a Hugging Face Dataset), not a
# plain list of dicts, so convert the collected records first. References that
# are still lists of answer aliases are joined into one string, because a
# ragas sample takes a single string reference; records whose reference is
# already a string pass through unchanged.
evaluation_samples = [
    {**row, "reference": "; ".join(row["reference"])}
    if isinstance(row.get("reference"), (list, tuple))
    else row
    for row in ragas_dataset
]
evaluation_dataset = EvaluationDataset.from_list(evaluation_samples)

result = evaluate(
    metrics=metrics,
    dataset=evaluation_dataset,
    llm=evaluator_llm,
)

df = result.to_pandas()

# Create the output directory first so the scores are not lost after a long
# evaluation run.
scores_path = Path("../data/evaluated/rag_scores.csv")
scores_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(scores_path, index=False)

Running Query Engine:   0%|          | 0/6 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]