## Loading Dataset

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv

# Load variables from whatever .env file find_dotenv() locates
dotenv_file = find_dotenv()
load_dotenv(dotenv_file)

# Experiment identity: the model name is embedded in the experiment name,
# which in turn determines the dataset file name below
EXPERIMENTED_MODEL_NAME = "gemini-2.0-flash"
EXPERIMENT_NAME = f"hotpot_qa_orchestration_{EXPERIMENTED_MODEL_NAME}"

# The project root is taken to be the parent of the current working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
generated_dir = os.path.join(project_root, 'data', 'generated')
generated_data_path = os.path.join(generated_dir, f'{EXPERIMENT_NAME}.parquet')

# Read the generated answers and report what was loaded
df = pd.read_parquet(generated_data_path)
print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

Loaded 7 rows
Columns: ['user_input', 'contexts', 'response', 'ground_truth', 'workflow_plan', 'planner_reasoning', 'custom_prompts', 'latency', 'input_tokens', 'output_tokens', 'total_tokens', 'cost']


## Initialize LLM Judge

In [3]:
from langchain_ollama import ChatOllama

# Build the judge: an Ollama chat model configured with a low temperature
judge_kwargs = {
    "model": "qwen3:8b",
    "temperature": 0.1,
}
judge_llm = ChatOllama(**judge_kwargs)

print("LLM Judge initialized successfully")

  from .autonotebook import tqdm as notebook_tqdm


LLM Judge initialized successfully


## Define Evaluation Prompt and Function

In [4]:
# Define evaluation prompt
# Template sent to the judge model. evaluate_answer() fills the {question},
# {ground_truth} and {response} placeholders via str.format() and then parses
# the reply by looking for the "Verdict:" and "Reasoning:" markers requested at
# the end of the template, so keep that output format in sync with the parser.
EVAL_PROMPT = """You are an impartial judge evaluating the correctness of an AI-generated answer.

Question: {question}

Ground Truth Answer: {ground_truth}

Generated Answer: {response}

Determine if the generated answer is correct by comparing it to the ground truth.
The answer should be considered correct if it conveys the same key information as the ground truth, even if worded differently.

Provide your evaluation in this exact format:
Verdict: [True/False]
Reasoning: [brief explanation of why the answer is correct or incorrect]"""

# Evaluation function
def evaluate_answer(row):
    """Ask the LLM judge whether a generated answer matches the ground truth.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            'user_input', 'ground_truth' and 'response' fields.

    Returns:
        A ``(is_correct, reasoning)`` tuple. ``is_correct`` is True/False when
        a verdict could be parsed from the judge's reply. It is None when the
        judge call failed or no verdict was found, in which case ``reasoning``
        holds the error message instead.
    """
    import re  # local import so this cell does not depend on another cell's imports

    prompt = EVAL_PROMPT.format(
        question=row['user_input'],
        ground_truth=row['ground_truth'],
        response=row['response']
    )

    # Bound before the try so the error handler can always print it, even when
    # invoke() itself raises and no reply was ever received.
    content = None
    try:
        result = judge_llm.invoke(prompt)
        content = result.content

        # Reasoning models may wrap their scratch work in <think>...</think>;
        # drop it so a "Verdict:" mentioned while thinking is not mistaken
        # for the final answer.
        answer_text = re.sub(
            r'<think>.*?</think>', '', content, flags=re.DOTALL | re.IGNORECASE
        )

        # Parse verdict: require an explicit true/false token right after the
        # marker (tolerating brackets or markdown emphasis) rather than a
        # substring test, which would read "False, not true" as correct.
        verdict_match = re.search(
            r'verdict\s*:\W*(true|false)\b', answer_text, flags=re.IGNORECASE
        )
        if verdict_match is None:
            raise ValueError("No 'Verdict: True/False' found in judge output")
        is_correct = verdict_match.group(1).lower() == 'true'

        # Parse reasoning: keep everything after the marker so multi-line
        # explanations are not truncated; a missing explanation does not
        # invalidate an otherwise valid verdict.
        reasoning_match = re.search(
            r'reasoning\s*:\**(.*)', answer_text, flags=re.DOTALL | re.IGNORECASE
        )
        reasoning = reasoning_match.group(1).strip() if reasoning_match else ""

        return is_correct, reasoning
    except Exception as e:
        print(f"Error evaluating row: {e}")
        print(f"Content: {content}")
        return None, str(e)

print("Evaluation function defined")

Evaluation function defined


## Run Evaluation

In [5]:
from tqdm import tqdm

# Run the judge over every row, collecting one verdict and one explanation per row.
print("Starting evaluation...\n")
correctness_results = []
reasonings = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating"):
    is_correct, reasoning = evaluate_answer(row)
    # evaluate_answer returns None when the judge call or parsing failed.
    # Keep that as None instead of scoring it 0, so failed evaluations are
    # not silently counted as incorrect answers downstream.
    correctness_results.append(None if is_correct is None else int(is_correct))
    reasonings.append(reasoning)

# Add results to dataframe
# Nullable Int64 keeps the 0/1 values as integers while allowing missing verdicts.
df['is_correct'] = pd.array(correctness_results, dtype="Int64")
df['reasoning'] = reasonings

print("\nEvaluation complete!")

Starting evaluation...



Evaluating: 100%|██████████| 7/7 [01:20<00:00, 11.43s/it]


Evaluation complete!





## Save Results

In [6]:
# Write the evaluated frame next to the other project data
eval_filename = f'{EXPERIMENT_NAME}_eval.parquet'
output_path = os.path.join(project_root, 'data', 'evaluated', eval_filename)

# Create the destination folder first; it may not exist yet
target_dir = os.path.dirname(output_path)
os.makedirs(target_dir, exist_ok=True)

df.to_parquet(output_path, index=False)
print(f"Results saved to: {output_path}")

Results saved to: /home/mounty-ed/stuff/orchestration_research/data/evaluated/hotpot_qa_orchestration_gemini-2.0-flash_pilot_5_eval.parquet


## Summary

In [7]:
# Score only the entries that carry a verdict (None entries are skipped)
valid_results = [verdict for verdict in correctness_results if verdict is not None]
correct_count = sum(valid_results)

# Guard against division by zero when nothing was evaluated
if valid_results:
    accuracy = correct_count / len(valid_results) * 100
else:
    accuracy = 0

banner = "=" * 60
print(banner)
print("EVALUATION SUMMARY")
print(banner)
print(f"Total Questions: {len(df)}")
print(f"Successfully Evaluated: {len(valid_results)}")
print(f"Correct Answers: {correct_count}")
print(f"Incorrect Answers: {len(valid_results) - correct_count}")
print(f"Accuracy: {accuracy:.2f}%")
print(banner)

# Show distribution
print("\nCorrectness Distribution:")
print(df['is_correct'].value_counts())

df.head()

EVALUATION SUMMARY
Total Questions: 7
Successfully Evaluated: 7
Correct Answers: 6
Incorrect Answers: 1
Accuracy: 85.71%

Correctness Distribution:
is_correct
1    6
0    1
Name: count, dtype: int64


Unnamed: 0,user_input,contexts,response,ground_truth,workflow_plan,planner_reasoning,custom_prompts,latency,input_tokens,output_tokens,total_tokens,cost,is_correct,reasoning
0,Are Toshi and Warrel Dane both in the band San...,[['Sanctuary is an American heavy metal band f...,"Warrel Dane was in Sanctuary, but Toshi was not.",no,"{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,{'debater': {'advocate': 'You are an expert mu...,27.989571,18880,2946,21826,0.003066,1,The generated answer correctly states that War...
1,What location under Charing Cross railway stat...,"[['G-A-Y is a gay nightclub in London.', ' It ...",Heaven,venue Heaven,"{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,{'debater': {'advocate': 'Your task is to dete...,21.944888,14281,2610,16891,0.002472,1,"The generated answer ""Heaven"" directly matches..."
2,What date was the movie originally supposed to...,[['Copyright Alert System (CAS) was a voluntar...,May 2008,The film was originally set to be released in ...,"{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,{'debater': {'advocate': 'You are an expert in...,24.409796,16000,2772,18772,0.002709,1,"The generated answer ""May 2008"" directly match..."
3,David Wayne Hull (born 1962 or 1963) is a lead...,[['Westside High School is a public high schoo...,White Knights of the Ku Klux Klan,White Knights,"{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,{'debater': {'advocate': 'You are an expert on...,30.983602,15163,3801,18964,0.003037,1,"The generated answer ""White Knights of the Ku ..."
4,when was american stunt performer which Stunt ...,"[['Danny Aiello III (January 27, 1957 – May 1,...","October 17, 1938","October 17, 1938","{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,"{'debater': None, 'predictor': 'You are an exp...",13.144304,8557,1483,10040,0.001449,1,The generated answer exactly matches the groun...
