## Loading Dataset

In [17]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

EVALUATER_MODEL_NAME = "gemini-2.5-flash-lite"

EXPERIMENTED_MODEL_NAME = "gemini-2.0-flash"

EXPERIMENT_NAME = f"hotpot_qa_orchestration_{EXPERIMENTED_MODEL_NAME}"

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
generated_data_path = os.path.join(project_root, 'data', 'generated', f'{EXPERIMENT_NAME}.parquet')

# Load dataset
df = pd.read_parquet(generated_data_path)
print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

Loaded 300 rows
Columns: ['user_input', 'contexts', 'response', 'ground_truth', 'workflow_plan', 'planner_reasoning', 'custom_prompts', 'latency', 'input_tokens', 'output_tokens', 'total_tokens', 'cost']


## Initialize LLM Judge

In [18]:
from langchain_ollama import ChatOllama
from langchain_google_genai import ChatGoogleGenerativeAI

# Initialize LLM Judge
judge_llm = ChatGoogleGenerativeAI(
    model=EVALUATER_MODEL_NAME,
    temperature=0.3,
)

print("LLM Judge initialized successfully")

LLM Judge initialized successfully


E0000 00:00:1760076277.525631   55861 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


## Define Evaluation Prompt and Function

In [19]:
# Define evaluation prompt
EVAL_PROMPT = """You are an impartial judge evaluating the correctness of an AI-generated answer.

Question: {question}

Ground Truth Answer: {ground_truth}

Generated Answer: {response}

Determine if the generated answer is correct by comparing it to the ground truth.
The answer should be considered correct if it conveys the same key information as the ground truth, even if worded differently.

Provide your evaluation in this exact format:
Verdict: [True/False]
Reasoning: [brief explanation of why the answer is correct or incorrect]"""

# Evaluation function
def evaluate_answer(row):
    prompt = EVAL_PROMPT.format(
        question=row['user_input'],
        ground_truth=row['ground_truth'],
        response=row['response']
    )
    
    try:
        result = judge_llm.invoke(prompt)
        content = result.content
        
        # Parse verdict
        verdict_line = [line for line in content.split('\n') if 'Verdict:' in line][0]
        verdict_str = verdict_line.split(':')[1].strip().lower()
        is_correct = 'true' in verdict_str
        
        # Parse reasoning
        reasoning_line = [line for line in content.split('\n') if 'Reasoning:' in line][0]
        reasoning = reasoning_line.split(':', 1)[1].strip()
        
        return is_correct, reasoning
    except Exception as e:
        print(f"Error evaluating row: {e}")
        return None, str(e)

## Run Evaluation

In [20]:
from tqdm import tqdm

print("Starting evaluation...\n")
correctness_results = []
reasonings = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating"):
    is_correct, reasoning = evaluate_answer(row)
    correctness_results.append(1 if is_correct == True else 0)
    reasonings.append(reasoning)

# Add results to dataframe
df['is_correct'] = correctness_results
df['reasoning'] = reasonings

print("\nEvaluation complete!")

Starting evaluation...



Evaluating: 100%|██████████| 300/300 [02:36<00:00,  1.92it/s]


Evaluation complete!





## Save Results

In [21]:
# Save results
output_path = os.path.join(project_root, 'data', 'evaluated', f'{EXPERIMENT_NAME}_eval.parquet')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_parquet(output_path, index=False)
print(f"Results saved to: {output_path}")

Results saved to: /home/mounty-ed/stuff/orchestration_research/data/evaluated/hotpot_qa_orchestration_gemini-2.0-flash_eval.parquet


## Summary

In [22]:
# Calculate accuracy (excluding None values)
valid_results = [r for r in correctness_results if r is not None]
correct_count = sum(valid_results)
accuracy = (correct_count / len(valid_results) * 100) if valid_results else 0

print(f"="*60)
print(f"EVALUATION SUMMARY")
print(f"="*60)
print(f"Total Questions: {len(df)}")
print(f"Successfully Evaluated: {len(valid_results)}")
print(f"Correct Answers: {correct_count}")
print(f"Incorrect Answers: {len(valid_results) - correct_count}")
print(f"Accuracy: {accuracy:.2f}%")
print(f"="*60)

# Show distribution
print(f"\nCorrectness Distribution:")
print(df['is_correct'].value_counts())

df.head()

EVALUATION SUMMARY
Total Questions: 300
Successfully Evaluated: 300
Correct Answers: 225
Incorrect Answers: 75
Accuracy: 75.00%

Correctness Distribution:
is_correct
1    225
0     75
Name: count, dtype: int64


Unnamed: 0,user_input,contexts,response,ground_truth,workflow_plan,planner_reasoning,custom_prompts,latency,input_tokens,output_tokens,total_tokens,cost,is_correct,reasoning
0,VIVA Media AG changed it's name in 2004. What ...,[['Constantin Medien AG (formerly EM.Entertain...,GmbH stands for Gesellschaft mit beschränkter ...,Gesellschaft mit beschränkter Haftung,"{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,{'debater': {'advocate': 'You are an expert in...,28.872688,14641,3531,18172,0.002877,0,The ground truth states that VIVA Media AG cha...
1,Which of Jonny Craig and Pete Doherty has been...,[['Peter Doherty (born 12 March 1979) is an En...,"Jonny Craig: Dance Gavin Dance, Emarosa, Ghost...","Jonny"" Craig","{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,"{'debater': None, 'predictor': 'You are an exp...",13.291206,8803,1454,10257,0.001462,1,The generated answer correctly identifies Jonn...
2,Where was the first governor after the The Mis...,[['The Compromise of 1790 was a compromise bet...,The context does not provide information on wh...,"Bath, Maine","{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,{'debater': {'advocate': 'You are a historian ...,30.154152,17485,3887,21372,0.003303,0,The generated answer incorrectly states that t...
3,"The creator of ""Wallace and Gromit"" also creat...",[['Creature Comforts is a stop motion clay ani...,Creature Comforts,Creature Comforts,"{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query requir...,{'debater': {'advocate': 'Your task is to dete...,22.407323,15685,3025,18710,0.002779,1,"The generated answer ""Creature Comforts"" is id..."
4,Woman's Era and Naj are what kind of magazines?,[['Lifestyle changes have been increasing slow...,"Woman's Era is a women's interest magazine, an...",fortnightly women interest magazine,"{'beginning': [], 'end': ['aggregator', 'refin...",1. CONTEXT-QUERY INTERACTION: The query asks a...,"{'debater': None, 'predictor': 'You are a maga...",11.213381,13740,975,14715,0.001764,0,The ground truth specifies that Woman's Era is...
