In [1]:
import sys
import os
from data import Database
from data.repositories import PreparedQuestionRepository, ModelResultRepository, MMULQuestionRepository
from benchmarks import BenchmarkRunner, BenchmarkRegistry, BenchmarkSummary
from ai_models import ModelFactory, ModelsRegistry

# --- Persistence -------------------------------------------------------------
# The tables are created first; every repository below is then built on the
# same Database handle.
db = Database()
db.create_all_tables()
mmul_question_repository = MMULQuestionRepository(db)
prepared_question_repo = PreparedQuestionRepository(db)
model_result_repo = ModelResultRepository(db)
benchmark_summary_repo = BenchmarkSummary(db)

# Session identifier: handed to the benchmark registry and reused below when
# querying the summary.
test_session_id = "Session7"

# --- Models ------------------------------------------------------------------
models_registry = ModelsRegistry()
models_registry.register_test_openai_models()
# Alternative registration, currently disabled
# (presumably the non-test OpenAI models -- confirm in ai_models):
# models_registry.register_openai_models()

# --- Benchmarks --------------------------------------------------------------
benchmark_registry = BenchmarkRegistry(
    mmul_question_repository,
    prepared_question_repo,
    model_result_repo,
    test_session_id=test_session_id,
    max_tests_per_benchmark=1,
)
benchmark_registry.register_mmul_benchmarks()

# --- Estimation pass ---------------------------------------------------------
# Only estimate_model_results() is invoked in this cell; run_benchmarks() is
# called separately.
runner = BenchmarkRunner(models_registry, benchmark_registry)
runner.estimate_model_results()

# Disabled alternative view of the same session:
# summary = benchmark_summary_repo.get_detailed_summary(test_session_id)
# print(summary.to_string(index=False, justify='center', col_space=20, float_format=lambda x: f'{x:.4f}'))
# print("\n\n\n")

# Print the per-benchmark summary as a centred, fixed-width text table with
# floats rendered to four decimal places.
summary = benchmark_summary_repo.get_benchmark_summary(test_session_id)
print(
    summary.to_string(
        index=False,
        justify="center",
        col_space=20,
        float_format="{:.4f}".format,
    )
)

Filtered test data shape: (57, 9)
Przygotowuję 57 testów dla benchmarka MMUL-0Shot, sesji Session7, 0-shot
Zakończono przygotowywanie 57 testów dla benchmarka MMUL-0Shot
Filtered test data shape: (57, 9)
Przygotowuję 57 testów dla benchmarka MMUL-5Shot, sesji Session7, 5-shot
Zakończono przygotowywanie 57 testów dla benchmarka MMUL-5Shot
     Benchmark                Model                 Queries            Avg Score       Total Execution Time      Est Tokens           Act Tokens            Est Cost             Act Cost      
     MMUL-0Shot      test-gpt-3.5-turbo-0125          57                  None                 None                 6779                 NaN                 0.0034                NaN         
     MMUL-0Shot                   test-gpt-4          57                  None                 None                 6779                 NaN                 0.2051                NaN         
     MMUL-0Shot             test-gpt-4-turbo          57                  None      

In [2]:
# Execute the registered benchmarks using the runner built earlier in the
# notebook.
runner.run_benchmarks()

# Disabled alternative view of the same session:
# summary = benchmark_summary_repo.get_detailed_summary(test_session_id)
# print(summary.to_string(index=False, justify='center', col_space=20, float_format=lambda x: f'{x:.4f}'))
# print("\n\n\n")

# Re-query the summary for this session and print it as a centred,
# fixed-width text table with floats rendered to four decimal places.
summary = benchmark_summary_repo.get_benchmark_summary(test_session_id)
print(
    summary.to_string(
        index=False,
        justify="center",
        col_space=20,
        float_format="{:.4f}".format,
    )
)

Dane już istnieją dla benchmarka MMUL-0Shot i sesji Session7
Dane już istnieją dla benchmarka MMUL-5Shot i sesji Session7
     Benchmark                Model                 Queries             Avg Score       Total Execution Time      Est Tokens           Act Tokens            Est Cost             Act Cost      
     MMUL-0Shot      test-gpt-3.5-turbo-0125          57                 0.2632                0.0001                6779                 6779                0.0034               0.0034       
     MMUL-0Shot                   test-gpt-4          57                 0.2982                0.0001                6779                 6779                0.2051               0.2051       
     MMUL-0Shot             test-gpt-4-turbo          57                 0.1754                0.0001                6779                 6779                0.0689               0.0689       
     MMUL-0Shot                  test-gpt-4o          57                 0.1579                0.0001     