In [1]:
from data import Database
from data.repositories import PreparedQuestionRepository, ModelResultRepository, MMULQuestionRepository, BatchJobRepository
from benchmarks import BenchmarkRunner, BenchmarkRegistry, BenchmarkSummary
from ai_models import ModelRegistry

# ---------------------------------------------------------------------------
# Session configuration
# ---------------------------------------------------------------------------
# Identifier passed to the benchmark registry and used again when querying
# the summary; later cells read this name as well.
test_session_id = 81020241819

# ---------------------------------------------------------------------------
# Database and repositories
# ---------------------------------------------------------------------------
# create_all_tables() is invoked before any repository is constructed.
db = Database()
db.create_all_tables()

# Every repository is built on the same Database instance.
mmul_question_repository = MMULQuestionRepository(db)
prepared_question_repo = PreparedQuestionRepository(db)
model_result_repo = ModelResultRepository(db)
batch_job_repo = BatchJobRepository(db)
benchmark_summary_repo = BenchmarkSummary(db)

# ---------------------------------------------------------------------------
# Model registry
# ---------------------------------------------------------------------------
model_registry = ModelRegistry()
model_registry.register_test_models()

# ---------------------------------------------------------------------------
# Benchmark registry
# ---------------------------------------------------------------------------
# The four repositories are passed positionally, in this exact order.
benchmark_registry = BenchmarkRegistry(
    mmul_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
    test_session_id=test_session_id,
    max_tests_per_benchmark=10,
)
benchmark_registry.register_mmul_benchmarks()

# ---------------------------------------------------------------------------
# Estimation pass
# ---------------------------------------------------------------------------
# Only the estimate step is triggered in this cell; run_benchmarks() is
# called from a later cell.
runner = BenchmarkRunner(model_registry, benchmark_registry)
runner.estimate_model_results()

# ---------------------------------------------------------------------------
# Summary table
# ---------------------------------------------------------------------------
# NOTE: a second, more detailed report used to be printed here from
# benchmark_summary_repo.get_detailed_summary(test_session_id) with the same
# to_string() options, followed by three blank lines. It is left disabled.
summary = benchmark_summary_repo.get_benchmark_summary(test_session_id)
print(
    summary.to_string(
        index=False,                        # omit the row index column
        justify='center',
        col_space=20,
        float_format=lambda x: f'{x:.4f}',  # floats shown with 4 decimals
    )
)

Filtered test data shape: (570, 9)
Preparing 570 tests for benchmark MMUL-0Shot, session 81020241819, 0-shot
Finished preparing 570 tests for benchmark MMUL-0Shot
Filtered test data shape: (570, 9)
Preparing 570 tests for benchmark MMUL-5Shot, session 81020241819, 5-shot
Finished preparing 570 tests for benchmark MMUL-5Shot
     Benchmark                    Model                     Queries            Avg Score       Total Execution Time      Est Tokens           Act Tokens            Est Cost             Act Cost      
     MMUL-0Shot      test_claude-3-5-sonnet-20240620         570                  None                 None                 65660                NaN                 0.2038                NaN         
     MMUL-0Shot         test_claude-3-haiku-20240307         570                  None                 None                 65660                NaN                 0.0170                NaN         
     MMUL-0Shot          test_claude-3-opus-20240229         570          

In [2]:
# Run the registered benchmarks through the runner built in the first cell,
# passing in_batch=True.
runner.run_benchmarks(in_batch=True)

# NOTE: a detailed report from
# benchmark_summary_repo.get_detailed_summary(test_session_id) used to be
# printed here as well (same to_string() options); it is left disabled.

# Query the benchmark summary for this session again and print it as text.
summary = benchmark_summary_repo.get_benchmark_summary(test_session_id)
print(
    summary.to_string(
        index=False,                        # omit the row index column
        justify='center',
        col_space=20,
        float_format=lambda x: f'{x:.4f}',  # floats shown with 4 decimals
    )
)

Data already exists for benchmark MMUL-0Shot and session 81020241819
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_gpt-4o] created with ID: test_batch_4874
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_gpt-4o-mini] created with ID: test_batch_1911
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_gpt-4-turbo] created with ID: test_batch_1108
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_gpt-4] created with ID: test_batch_2803
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_gpt-3.5-turbo-0125] created with ID: test_batch_9768
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_claude-3-5-sonnet-20240620] created with ID: test_batch_1208
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_claude-3-opus-20240229] created with ID: test_batch_5896
Batch - Benchmark[MMUL-0Shot], Session[81020241819], model[test_claude-3-sonnet-20240229] created with ID: test_batch_2599
Batch - Benchmar