# Sentinella Evaluation Demo

This notebook demonstrates the evaluation pipeline for comparing large language models (LLMs) on quality, latency, and cost.


In [None]:
import asyncio
import json
import pandas as pd
from src.evaluator.evaluator import Evaluator

# Build the evaluator that the later cells call; it is pointed at the
# gateway address below (localhost, port 8000).
gateway_url = "http://localhost:8000"
evaluator = Evaluator(gateway_url=gateway_url)


## Load Golden Dataset


In [None]:
# Location of the golden dataset; the same path is handed to the evaluator
# calls in the cells below.
dataset_path = "src/evaluator/datasets/golden_dataset.json"

# Decode explicitly as UTF-8 so loading does not depend on the platform's
# locale default encoding.
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# NOTE(review): len() equals the number of test cases only if the file's
# top-level JSON value is a list of cases — confirm against the dataset.
print(f"Loaded {len(dataset)} test cases")


## Evaluate Single Model


In [None]:
# Evaluate GPT-3.5 Turbo
results = await evaluator.evaluate_model(
    model="gpt-3.5-turbo",
    dataset_path=dataset_path,
    output_path="evaluation_results_gpt35.json"
)

print(f"Average Quality Score: {results['average_quality_score']:.2%}")
print(f"Average Latency: {results['average_latency_ms']:.2f}ms")
print(f"Total Cost: ${results['total_cost']:.4f}")


## Compare Multiple Models (A/B Testing)


# Compare GPT-4o-mini vs GPT-3.5-turbo vs Claude Haiku
comparison = await evaluator.compare_models(
    models=["gpt-4o-mini", "gpt-3.5-turbo", "claude-3-haiku"],
    dataset_path=dataset_path
)

# Display comparison
comparison_df = pd.DataFrame({
    model: {
        "Quality Score": comp["average_quality_score"],
        "Latency (ms)": comp["average_latency_ms"],
        "Cost": comp["total_cost"],
    }
    for model, comp in comparison["detailed_results"].items()
})

print("Model Comparison:")
print(comparison_df.T)
print(f"\nBest Quality: {comparison['best_quality']}")
print(f"Fastest: {comparison['fastest']}")
print(f"Cheapest: {comparison['cheapest']}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Create comparison charts: one bar chart per metric, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

detailed = comparison["detailed_results"]
models = list(detailed.keys())
quality_scores = [detailed[m]["average_quality_score"] for m in models]
latencies = [detailed[m]["average_latency_ms"] for m in models]
costs = [detailed[m]["total_cost"] for m in models]

bar_colors = ['#4CAF50', '#2196F3', '#FF9800']

# Each panel is described by:
#   (title, y-axis label, values, value-label format, label offset above bar)
# The offset is computed once per panel rather than once per bar, and
# `default=0` keeps max() from raising when there are no models to plot.
panels = [
    ("Quality Scores", "Score (0-1)", quality_scores, "{:.2%}", 0.02),
    ("Average Latency", "Latency (ms)", latencies, "{:.0f}ms",
     max(latencies, default=0) * 0.02),
    ("Total Cost", "Cost (USD)", costs, "${:.4f}",
     max(costs, default=0) * 0.02),
]

for panel_ax, (title, y_label, values, label_format, label_offset) in zip(axes, panels):
    panel_ax.bar(models, values, color=bar_colors)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)
    # Write each bar's value just above its top edge.
    for i, v in enumerate(values):
        panel_ax.text(i, v + label_offset, label_format.format(v), ha='center', va='bottom')

# The quality axis is labelled as a 0-1 score, so pin it to that full range.
axes[0].set_ylim([0, 1])

plt.tight_layout()
plt.show()


## Detailed Results Analysis


In [None]:
# Display detailed results for each model: a summary header followed by up
# to three sample test cases.
separator = "=" * 60
for model_name, model_results in comparison["detailed_results"].items():
    total_tests = model_results['total_tests']
    successful = model_results['successful']
    # Guard the ratio: a run with zero tests would otherwise raise
    # ZeroDivisionError and abort the report for all remaining models.
    success_rate = f"{successful / total_tests:.2%}" if total_tests else "N/A"

    print(f"\n{separator}")
    print(f"Model: {model_name}")
    print(separator)
    print(f"Total Tests: {total_tests}")
    print(f"Successful: {successful}")
    print(f"Success Rate: {success_rate}")
    print(f"Average Quality Score: {model_results['average_quality_score']:.2%}")
    print(f"Average Latency: {model_results['average_latency_ms']:.2f}ms")
    print(f"Total Cost: ${model_results['total_cost']:.4f}")

    # Show sample results
    print("\nSample Results:")
    for i, result in enumerate(model_results['results'][:3]):
        # Entries containing an 'error' key are skipped. Their test number is
        # still consumed, so numbering follows the position in the list.
        if 'error' in result:
            continue
        print(f"\n  Test {i+1}:")
        print(f"    Prompt: {result['prompt'][:50]}...")
        print(f"    Quality: {result['quality_score']:.2%}")
        print(f"    Latency: {result['latency_ms']:.2f}ms")


## Cost-Performance Analysis


In [None]:
# Bubble chart: latency on the x-axis, quality on the y-axis, and marker size
# scaled from each model's total cost.
cost_to_marker_size = 10000  # multiplier turning a cost value into scatter `s`
fig, ax = plt.subplots(figsize=(10, 6))

for name in models:
    stats = comparison["detailed_results"][name]
    point = (stats["average_latency_ms"], stats["average_quality_score"])
    ax.scatter(
        point[0],
        point[1],
        s=stats["total_cost"] * cost_to_marker_size,
        alpha=0.6,
        label=name,
    )
    # Label the bubble with the model name, nudged 5 points up and right.
    ax.annotate(name, point, xytext=(5, 5), textcoords='offset points')

ax.set_title("Cost-Performance Analysis\n(Bubble size = Cost)", fontsize=14, fontweight='bold')
ax.set_xlabel("Average Latency (ms)", fontsize=12)
ax.set_ylabel("Quality Score", fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend()
plt.tight_layout()
plt.show()


## Export Results


In [None]:
# Save comparison results to JSON (explicit encoding so the file does not
# depend on the platform's locale default).
with open("evaluation_comparison.json", "w", encoding="utf-8") as f:
    json.dump(comparison, f, indent=2)

# Save to CSV for easy analysis: one column per model here, transposed on
# write so each model becomes a CSV row.
comparison_df = pd.DataFrame({
    model: {
        "Quality Score": comp["average_quality_score"],
        "Latency (ms)": comp["average_latency_ms"],
        "Cost": comp["total_cost"],
        # A model with zero tests has no defined success rate; record NaN
        # instead of raising ZeroDivisionError and losing the CSV export.
        "Success Rate": (
            comp["successful"] / comp["total_tests"]
            if comp["total_tests"]
            else float("nan")
        ),
    }
    for model, comp in comparison["detailed_results"].items()
})

comparison_df.T.to_csv("evaluation_comparison.csv")
print("Results saved to:")
print("  - evaluation_comparison.json")
print("  - evaluation_comparison.csv")
