## Loading Datasets


In [50]:
import sys
import os

sys.path.append(os.path.join(os.getcwd(), '..'))  # Add project root to path

project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
generated_data_path = os.path.join(project_root, 'data', 'generated', 'hotpot_qa_workflow_plan.csv')

In [51]:
from datasets import load_dataset

hf_dataset = load_dataset(
    "hotpot_qa", 
    "distractor",
    split="validation[:100]",
    cache_dir="/mnt/d/datasets/hotpot_qa"
)

hf_dataset = hf_dataset.select(range(3))

## Evaluating the `QueryEngine`


### Preparing dataset

In [None]:
from contextlib import redirect_stdout
from io import StringIO
from agents.experimental.orchestration_agent import OrchestrationAgent
import pandas as pd

agent = OrchestrationAgent(model="mistral:7b")
ragas_dataset = []

for i, item in enumerate(hf_dataset):
    print(f"{i+1} iteration")
    question = item["question"]
    answer = item["answer"]
    context = item["context"]["sentences"]

    # Silence the agent's output
    with redirect_stdout(StringIO()):
        response = agent.generate_response(question, context)
        
    ragas_dataset.append(
        {
            "user_input": question,
            "retrieved_contexts": [str(item) for item in context],
            "response": response["content"].strip().replace('\n', ' '),
            "reference": answer
        }
    )


df = pd.DataFrame(ragas_dataset)
df.to_csv(generated_data_path, index=False)

1 iteration
2 iteration
3 iteration


### Evaluating dataset

In [53]:
# import metrics
from ragas.metrics import (
    ContextPrecision,
    ContextRecall,
    Faithfulness,
    AnswerRelevancy,
    AnswerCorrectness
)

# init metrics with evaluator LLM
from ragas.llms import LangchainLLMWrapper
from langchain_ollama import ChatOllama

evaluator_llm = ChatOllama(
    model="mistral:7b",
    temperature=0.1,
)

evaluator_llm = LangchainLLMWrapper(evaluator_llm)

metrics = [
    Faithfulness(llm=evaluator_llm),
    AnswerCorrectness(llm=evaluator_llm)
]

In [55]:
import pandas as pd
from ragas import evaluate
from ragas import EvaluationDataset

# Load the CSV you created
df = pd.read_csv(generated_data_path)

# Convert to RAGAS dataset format
ragas_dataset_from_csv = EvaluationDataset.from_pandas(df)

# Evaluate using the CSV data
result = evaluate(
    metrics=metrics,
    dataset=ragas_dataset_from_csv,
    llm=evaluator_llm,
)

# Save evaluation results
df_results = result.to_pandas()
df.to_csv(os.path.join(project_root, 'data', 'generated', 'hotpot_qa_workflow_plan.csv'), index=False)

ValidationError: 1 validation error for SingleTurnSample
retrieved_contexts
  Input should be a valid list [type=list_type, input_value='[\'[\\\'Ed Wood is a 199...g car accident.\\\']\']', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type