# UQLM Benchmarking Framework Demo

This notebook demonstrates the new benchmarking capabilities:
- **BenchmarkRunner**: Run benchmarks with automatic caching
- **BenchmarkAnalyzer**: Analyze and visualize results
- **FactScoreBenchmark**: Example benchmark implementation

In [None]:
from langchain_google_vertexai import ChatVertexAI
from uqlm.benchmarks import BenchmarkRunner, BenchmarkAnalyzer, FactScoreBenchmark

In [None]:
# Initialize LLMs
test_llm = ChatVertexAI(model="gemini-2.5-flash", temperature=0)
bench_llm = ChatVertexAI(model="gemini-2.5-pro")

# Initialize benchmark runner
runner = BenchmarkRunner(storage_path="~/.uqlm/benchmark_results")

# Initialize benchmark implementation
fs_benchmark = FactScoreBenchmark(judge_llm=bench_llm)

# Run benchmark with automatic caching
results = await runner.run_benchmark(
    benchmark_name="factscore",
    benchmark_implementation=fs_benchmark,
    llm_names=["gemini-2.5-flash"],
    scorer_names=["LongFormUQ"],
    dataset_name="dskar/FActScore",
    sampling_temperature=0.4,
    num_responses=5,
    use_cache=True,  # Check for cached results
    save_results=True,  # Save results to database
)

print(f"Run ID: {results.metadata.run_id}")
print(f"Status: {results.metadata.status}")
print(f"Number of results: {len(results.results)}")

In [None]:
# Create the analyzer over the same storage path string that the runner cell
# uses, so it reads from the location the benchmark run was pointed at.
analyzer_options = {"storage_path": "~/.uqlm/benchmark_results"}
analyzer = BenchmarkAnalyzer(**analyzer_options)

# Request "factscore" runs with limit=10 and print them under a heading.
recent_runs = analyzer.list_runs(benchmark_name="factscore", limit=10)
print("Recent runs:", recent_runs, sep="\n")

In [None]:
# Compare LLMs on the "factscore" benchmark, restricted to the LongFormUQ
# scorer, and print the comparison under a heading (preceded by a blank line).
llm_comparison_args = {"benchmark_name": "factscore", "scorer_names": ["LongFormUQ"]}
comparison = analyzer.compare_llms(**llm_comparison_args)
print("\nLLM Comparison:", comparison, sep="\n")

# Aggregate metrics for the run held in `results`, grouped by "llm".
# `results` comes from the benchmark-run cell, so that cell must run first.
current_run_id = results.metadata.run_id
aggregates = analyzer.aggregate_metrics(run_id=current_run_id, groupby="llm")
print("\nAggregate Metrics:", aggregates, sep="\n")

In [None]:
# Bar plot of the LLM comparison (requires matplotlib). The plot call is
# skipped when `comparison.empty` is truthy.
has_comparison_rows = not comparison.empty
if has_comparison_rows:
    analyzer.plot_results(comparison, plot_type="bar")

# Export a report for the run held in `results` as HTML, written to a path
# relative to the current working directory, then confirm on stdout.
report_options = {
    "run_ids": [results.metadata.run_id],
    "output_path": "./factscore_report.html",
    "format": "html",
}
analyzer.export_report(**report_options)
print("\nReport exported to factscore_report.html")