## Loading Dataset

In [24]:
import os
import re

import pandas as pd
from dotenv import load_dotenv, find_dotenv

# Pull environment variables from the nearest .env file, if one is found.
load_dotenv(find_dotenv())

EXPERIMENT_NAME = "hotpot_qa_orchestration_2"

# The project root is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
generated_dir = os.path.join(project_root, 'data', 'generated')
generated_data_path = os.path.join(generated_dir, EXPERIMENT_NAME + '.parquet')

# Read the generated answers for this experiment and report what was loaded.
df = pd.read_parquet(generated_data_path)
print("Loaded {} rows".format(len(df)))
print("Columns: {}".format(df.columns.tolist()))

Loaded 24 rows
Columns: ['user_input', 'contexts', 'response', 'ground_truth', 'workflow_plan']


## Initialize LLM Judge

In [25]:
from langchain_ollama import ChatOllama

# Build the chat model that acts as the judge; settings are collected in one
# place so they are easy to spot and tweak.
judge_settings = {
    "model": "qwen3:8b",
    "temperature": 0.1,
}
judge_llm = ChatOllama(**judge_settings)

print("LLM Judge initialized successfully")

LLM Judge initialized successfully


## Define Evaluation Prompt and Function

In [26]:
# Prompt template sent to the judge. The {question}, {ground_truth} and
# {response} placeholders are filled in with str.format for each row.
EVAL_PROMPT = "\n".join([
    "You are an impartial judge evaluating the correctness of an AI-generated answer.",
    "",
    "Question: {question}",
    "",
    "Ground Truth Answer: {ground_truth}",
    "",
    "Generated Answer: {response}",
    "",
    "Determine if the generated answer is correct by comparing it to the ground truth.",
    "The answer should be considered correct if it conveys the same key information as the ground truth, even if worded differently.",
    "",
    "Provide your evaluation in this exact format:",
    "Verdict: [True/False]",
    "Reasoning: [brief explanation of why the answer is correct or incorrect]",
])

# Evaluation function
def _parse_judge_output(content):
    """Extract ``(is_correct, reasoning)`` from the judge's raw text reply.

    Raises:
        ValueError: if no ``Verdict:`` line containing the word true/false
            can be found.
    """
    # Some models may prepend a <think>...</think> block to the reply; drop it
    # so a "Verdict:" mentioned while reasoning is not mistaken for the answer.
    visible = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL)

    # Take the first standalone true/false word after "Verdict:" on the same
    # line. Whole-word matching keeps "untrue" or a trailing "... not true"
    # from flipping a False verdict.
    verdict_match = re.search(
        r"Verdict:[^\n]*?\b(true|false)\b", visible, flags=re.IGNORECASE
    )
    if verdict_match is None:
        raise ValueError("No 'Verdict: True/False' line found in judge output")
    is_correct = verdict_match.group(1).lower() == "true"

    # Keep everything after "Reasoning:" so multi-line explanations survive.
    # A missing reasoning section does not invalidate an otherwise clear verdict.
    reasoning_match = re.search(
        r"Reasoning:\s*(.*)", visible, flags=re.IGNORECASE | re.DOTALL
    )
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""

    return is_correct, reasoning


def evaluate_answer(row):
    """Ask the judge LLM whether a generated answer matches the ground truth.

    Args:
        row: Mapping-like record providing the ``'user_input'``,
            ``'ground_truth'`` and ``'response'`` keys.

    Returns:
        ``(is_correct, reasoning)`` where ``is_correct`` is a bool and
        ``reasoning`` is the judge's explanation. If the judge call or the
        parsing fails, returns ``(None, error_message)`` instead of raising.
    """
    prompt = EVAL_PROMPT.format(
        question=row['user_input'],
        ground_truth=row['ground_truth'],
        response=row['response']
    )

    # Bound before the try block so the error handler can always print it,
    # even when the judge call itself is what failed.
    content = None
    try:
        result = judge_llm.invoke(prompt)
        content = result.content
        return _parse_judge_output(content)
    except Exception as e:
        print(f"Error evaluating row: {e}")
        print(f"Content: {content}")
        return None, str(e)

print("Evaluation function defined")

Evaluation function defined


## Run Evaluation

In [27]:
from tqdm import tqdm

print("Starting evaluation...\n")
correctness_results = []
reasonings = []

# Judge every row. A failed evaluation (evaluate_answer returned None as the
# verdict) is recorded as None rather than 0, so it is not silently counted
# as an incorrect answer downstream.
for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating"):
    is_correct, reasoning = evaluate_answer(row)
    correctness_results.append(None if is_correct is None else int(is_correct))
    reasonings.append(reasoning)

# Add results to dataframe. The nullable Int64 dtype keeps 1/0 as integers
# while representing failed evaluations as <NA>.
df['is_correct'] = pd.array(correctness_results, dtype="Int64")
df['reasoning'] = reasonings

print("\nEvaluation complete!")

Starting evaluation...



Evaluating: 100%|██████████| 24/24 [05:31<00:00, 13.83s/it]


Evaluation complete!





## Save Results

In [None]:
# Write the evaluated dataframe to <project_root>/data/evaluated/, creating
# the directory first if it does not exist yet.
evaluated_dir = os.path.join(project_root, 'data', 'evaluated')
output_path = os.path.join(evaluated_dir, f'{EXPERIMENT_NAME}_eval.parquet')
os.makedirs(evaluated_dir, exist_ok=True)
df.to_parquet(output_path, index=False)
print("Results saved to: {}".format(output_path))

Results saved to: /home/mounty-ed/stuff/orchestration_research/data/evaluated/hotpot_qa_orchestration_2_eval.parquet


## Summary

In [29]:
# Accuracy is computed over the entries that are not None.
valid_results = [outcome for outcome in correctness_results if outcome is not None]
correct_count = sum(valid_results)
if valid_results:
    accuracy = correct_count / len(valid_results) * 100
else:
    accuracy = 0

banner = "=" * 60
incorrect_count = len(valid_results) - correct_count

print(banner)
print("EVALUATION SUMMARY")
print(banner)
print(f"Total Questions: {len(df)}")
print(f"Successfully Evaluated: {len(valid_results)}")
print(f"Correct Answers: {correct_count}")
print(f"Incorrect Answers: {incorrect_count}")
print(f"Accuracy: {accuracy:.2f}%")
print(banner)

# Show how many rows fall into each is_correct value
print("\nCorrectness Distribution:")
print(df['is_correct'].value_counts())

df.head()

EVALUATION SUMMARY
Total Questions: 24
Successfully Evaluated: 24
Correct Answers: 19
Incorrect Answers: 5
Accuracy: 79.17%

Correctness Distribution:
is_correct
1    19
0     5
Name: count, dtype: int64


Unnamed: 0,user_input,contexts,response,ground_truth,workflow_plan,is_correct,reasoning
0,Were Scott Derrickson and Ed Wood of the same ...,[['Ed Wood is a 1994 American biographical per...,"Yes, Scott Derrickson (born in Los Angeles, Ca...",yes,"{'beginning': [], 'end': ['aggregator', 'refin...",1,The generated answer correctly confirms that b...
1,What government position was held by the woman...,"[[""Meet Corliss Archer, a program from radio's...",Shirley Temple Black portrayed Corliss Archer ...,Chief of Protocol,"{'beginning': [], 'end': ['aggregator', 'refin...",0,The generated answer incorrectly states that S...
2,"What science fantasy young adult series, told ...",[['The Andre Norton Award for Young Adult Scie...,For science fantasy young adult series narrate...,Animorphs,"{'beginning': [], 'end': ['aggregator', 'refin...",1,"The generated answer correctly identifies ""Ani..."
3,Are the Laleli Mosque and Esma Sultan Mansion ...,[['Esma Sultan (21 March 1873 – 7 May 1899) wa...,The Laleli Mosque and Esma Sultan Mansion are ...,no,"{'beginning': [], 'end': ['aggregator', 'refin...",1,The generated answer correctly states that the...
4,"The director of the romantic comedy ""Big Stone...",[['Just Another Romantic Wrestling Comedy is a...,"Adriana Trigiani, the director of 'Big Stone G...","Greenwich Village, New York City","{'beginning': ['summarizer'], 'end': ['aggrega...",1,The generated answer correctly identifies Gree...
