## Loading Datasets


In [20]:
from datasets import load_dataset

hf_dataset = load_dataset(
    "hotpot_qa", 
    "distractor",
    split="validation[:100]",
    cache_dir="/mnt/d/datasets/hotpot_qa"
)

hf_dataset = hf_dataset.select(range(10))

In [None]:
import os
from agents.orchestration_agent import OrchestrationAgent
from agents.baseline.cot import ChainOfThoughtAgent
import pandas as pd
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

EXPERIMENT_NAME = "hotpot_qa_orchestrate"

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
generated_data_path = os.path.join(project_root, 'data', 'generated', f'{EXPERIMENT_NAME}.parquet')

llm = ChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
    model="openai/gpt-oss-20b:free",
    temperature=0.7
)

agent = OrchestrationAgent(model="mistral:7b", llm=llm)

## Evaluating the `QueryEngine`


### Preparing dataset

In [22]:
from contextlib import redirect_stdout
from io import StringIO

ragas_dataset = []

for i, item in enumerate(hf_dataset):
    print(f"{i+1} iteration")
    question = item["question"]
    answer = item["answer"]
    context = item["context"]["sentences"]

    # Silence the agent's output
    with redirect_stdout(StringIO()):
        response = agent.generate_response(question, context)
        
    ragas_dataset.append(
        {
            "user_input": question,
            "retrieved_contexts": [str(item) for item in context],
            "response": response["content"].strip(),
            "reference": answer
        }
    )


df = pd.DataFrame(ragas_dataset)
df.to_parquet(generated_data_path, index=False)

1 iteration
2 iteration
3 iteration
4 iteration
5 iteration
6 iteration
7 iteration
8 iteration
9 iteration
10 iteration


### Evaluating dataset

In [None]:
# import metrics
from ragas.metrics import (
    ContextPrecision,
    ContextRecall,
    Faithfulness,
    AnswerRelevancy,
    AnswerCorrectness
)

# init metrics with evaluator LLM
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings

evaluator_llm = ChatOllama(
    model="mistral:7b",
    temperature=0.1,
)

evaluator_llm = LangchainLLMWrapper(evaluator_llm)

evaluator_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

evaluator_embeddings = LangchainEmbeddingsWrapper(evaluator_embeddings)

metrics = [
    Faithfulness(llm=evaluator_llm),
    AnswerCorrectness(llm=evaluator_llm)
]

In [27]:
import pandas as pd
from ragas import evaluate
from ragas import EvaluationDataset

# Load the CSV you created
df = pd.read_parquet(generated_data_path)

# Convert to RAGAS dataset format
ragas_dataset_from_csv = EvaluationDataset.from_pandas(df)

# Evaluate using the CSV data
result = evaluate(
    metrics=metrics,
    dataset=ragas_dataset_from_csv,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings
)

# Save evaluation results
df_results = result.to_pandas()
df_results.to_csv(os.path.join(project_root, 'data', 'evaluated', f'{EXPERIMENT_NAME}_eval.csv'), index=False)

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]Exception raised in Job[0]: TimeoutError()
Exception raised in Job[1]: TimeoutError()
Evaluating:   5%|▌         | 1/20 [03:07<59:15, 187.11s/it]Exception raised in Job[2]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[5]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[7]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[10]: TimeoutError()
Exception raised in Job[11]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[13]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Evaluating:  80%|████████  | 16/20 [04:01<01:00, 15.07s/it]


KeyboardInterrupt: 

Exception raised in Job[16]: TimeoutError()
Exception raised in Job[17]: TimeoutError()
Exception raised in Job[18]: TimeoutError()
Exception raised in Job[19]: TimeoutError()


In [None]:
df_results.head()

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_correctness
0,Were Scott Derrickson and Ed Wood of the same ...,[['Ed Wood is a 1994 American biographical per...,Scott Derrickson and Ed Wood are both American.,yes,0.5,0.768335
1,What government position was held by the woman...,"[[""Meet Corliss Archer, a program from radio's...","Shirley Temple Black, who portrayed Corliss Ar...",Chief of Protocol,0.5,
2,"What science fantasy young adult series, told ...",[['The Andre Norton Award for Young Adult Scie...,The science fantasy young adult series told in...,Animorphs,0.0,
3,Are the Laleli Mosque and Esma Sultan Mansion ...,[['Esma Sultan (21 March 1873 – 7 May 1899) wa...,The Laleli Mosque and Esma Sultan Mansion are ...,no,1.0,0.748326
4,"The director of the romantic comedy ""Big Stone...",[['Just Another Romantic Wrestling Comedy is a...,"Adriana Trigiani, the director of ""Big Stone G...","Greenwich Village, New York City",1.0,0.847043
