In [1]:
from data import Database
from data.repositories import PreparedQuestionRepository, ModelResultRepository, MMULQuestionRepository, BatchJobRepository
from benchmarks import BenchmarkRunner, BenchmarkRegistry, BenchmarkSummary
from ai_models import ModelRegistry

# --- Persistence ------------------------------------------------------------
# A single Database handle is shared by every repository below;
# create_all_tables() is called before any repository is constructed.
db = Database()
db.create_all_tables()

mmul_question_repository = MMULQuestionRepository(db)
prepared_question_repo = PreparedQuestionRepository(db)
model_result_repo = ModelResultRepository(db)
batch_job_repo = BatchJobRepository(db)
benchmark_summary_repo = BenchmarkSummary(db)

# Session identifier: handed to the benchmark registry and reused further
# down when the summary is fetched.
test_session_id = 61020241334

# --- Models -----------------------------------------------------------------
model_registry = ModelRegistry()
model_registry.register_test_models()

# --- Benchmarks -------------------------------------------------------------
# The four repositories are passed positionally, in the order the original
# call used; the session id and the per-benchmark test cap go by keyword.
benchmark_registry = BenchmarkRegistry(
    mmul_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
    test_session_id=test_session_id,
    max_tests_per_benchmark=10,
)
benchmark_registry.register_mmul_benchmarks()

# --- Estimation -------------------------------------------------------------
# The runner is built from both registries; this cell only calls the
# estimation step (running the benchmarks themselves happens in a later cell).
runner = BenchmarkRunner(model_registry, benchmark_registry)
runner.estimate_model_results()

# --- Report -----------------------------------------------------------------
# NOTE(review): benchmark_summary_repo also appears to offer
# get_detailed_summary(test_session_id), which was previously printed with
# the same formatting options; it is intentionally not called here.
summary = benchmark_summary_repo.get_benchmark_summary(test_session_id)
summary_text = summary.to_string(
    index=False,
    justify='center',
    col_space=20,
    float_format=lambda x: f'{x:.4f}',  # four decimal places for every float
)
print(summary_text)

Data already exists for benchmark MMUL-0Shot and session 61020241334
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_gpt-4o
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_gpt-4o-mini
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_gpt-4-turbo
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_gpt-4
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_gpt-3.5-turbo-0125
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_claude-3-5-sonnet-20240620
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_claude-3-opus-20240229
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_claude-3-sonnet-20240229
Estimation already exists for benchmark MMUL-0Shot, session 61020241334, and model test_claude-3

In [2]:
# Start the benchmark run with in_batch=True, reusing the runner built in the
# setup cell.
runner.run_benchmarks(in_batch=True)

# Fetch the per-session benchmark summary again and print it as a plain-text
# table: no index column, centred headers, 20-character minimum column width,
# floats shown with four decimal places.
# NOTE(review): a get_detailed_summary(test_session_id) variant was printed
# here previously with the same options; it is intentionally not called.
summary = benchmark_summary_repo.get_benchmark_summary(test_session_id)
table_options = dict(
    index=False,
    justify='center',
    col_space=20,
    float_format=lambda x: f'{x:.4f}',
)
print(summary.to_string(**table_options))

Data already exists for benchmark MMUL-0Shot and session 61020241334
Running batch for MMUL-0Shot with model test_gpt-4o
Batch file saved at: d:\Master\Repo\Magtest\src\batch\MMUL-0Shot\test_gpt-4o_batch_requests.jsonl
Batch - Benchmark[MMUL-0Shot], Session[61020241334], model[test_gpt-4o] created with ID: test_batch_5701
Running batch for MMUL-0Shot with model test_gpt-4o-mini
Batch file saved at: d:\Master\Repo\Magtest\src\batch\MMUL-0Shot\test_gpt-4o-mini_batch_requests.jsonl
Batch - Benchmark[MMUL-0Shot], Session[61020241334], model[test_gpt-4o-mini] created with ID: test_batch_3156
Running batch for MMUL-0Shot with model test_gpt-4-turbo
Batch file saved at: d:\Master\Repo\Magtest\src\batch\MMUL-0Shot\test_gpt-4-turbo_batch_requests.jsonl
Batch - Benchmark[MMUL-0Shot], Session[61020241334], model[test_gpt-4-turbo] created with ID: test_batch_3407
Running batch for MMUL-0Shot with model test_gpt-4
Batch file saved at: d:\Master\Repo\Magtest\src\batch\MMUL-0Shot\test_gpt-4_batch_req