In [None]:
# Derive the test session id from the current local time, rendered as
# yyyymmddhhmmss (year, month, day, hour, minute, second).
import datetime

test_session_id = f"{datetime.datetime.now():%Y%m%d%H%M%S}"
print(f"Test session id: {test_session_id}")

In [None]:
from data import Database
from data.repositories import (
    PreparedQuestionRepository,
    ModelResultRepository,
    MMLUQuestionRepository,
    BatchJobRepository
)
from benchmarks import BenchmarkRunner, BenchmarkRegistry
from benchmarks.benchmark_summary import BenchmarkSummary
from ai_models import ModelRegistry, BatchProgressManager

# Initialize the database handle and create its tables before any repository
# is built on top of it.
db = Database()
db.create_all_tables()

# Repositories and helpers; all of them are constructed from the same `db`
# (the batch manager goes through the batch-job repository).
mmul_question_repository = MMLUQuestionRepository(db)
prepared_question_repo = PreparedQuestionRepository(db)
model_result_repo = ModelResultRepository(db)
batch_job_repo = BatchJobRepository(db)
benchmark_summary = BenchmarkSummary(db)
batch_manager = BatchProgressManager(batch_job_repo)

# --- Run configuration -------------------------------------------------------
# Session to resume. When set, it replaces whatever `test_session_id` an
# earlier cell generated; set it to None to keep the freshly generated id.
# NOTE(review): this literal is an int, whereas the generated id is a str
# produced by strftime -- confirm which type the repositories expect.
RESUME_SESSION_ID = 20241105180815
if RESUME_SESSION_ID is not None:
    test_session_id = RESUME_SESSION_ID
# Make the effective session id visible so an override is never silent.
print(f"Using test session id: {test_session_id}")

# Models to benchmark. Alternatives are kept here for quick switching:
# model_name_list = []  # all models
# model_name_list = ["gpt-4o-mini"]  # only gpt-4o-mini
# model_name_list = ["gpt-4o-mini", "claude-3-haiku-20240307"]
model_name_list = ["gemini-1.5-flash-002"]  # only gemini-1.5-flash

# Value passed to BenchmarkRegistry as `max_tests_per_benchmark`.
MAX_TESTS_PER_BENCHMARK = 10

# Register models
model_registry = ModelRegistry(model_name_list)
model_registry.register_production_models()
model_registry.print_loaded_models()

# Register benchmarks
benchmark_registry = BenchmarkRegistry(
    mmul_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
    test_session_id=test_session_id,
    max_tests_per_benchmark=MAX_TESTS_PER_BENCHMARK,
)

benchmark_registry.register_mmul_benchmarks()

In [None]:
# Build the benchmark runner from the model and benchmark registries, then
# invoke its estimation step.
runner = BenchmarkRunner(model_registry, benchmark_registry)
runner.estimate_model_results()

# Print the benchmark summary for this session.
# Alternative output: benchmark_summary.print_full_summary(test_session_id)
benchmark_summary.print_benchmark_summary(test_session_id)

In [None]:
# Run the registered benchmarks with the runner's `in_batch` flag enabled.
runner.run_benchmarks(in_batch=True)

In [5]:
# Optional: uncomment to call show_batch_progress_from_db for this session.
# batch_manager.show_batch_progress_from_db(test_session_id)

In [None]:
# Check the submitted batches and process whatever results are available.
runner.check_and_process_batch_results()

# Blank separator, then the benchmark summary for this session.
# Alternative output: benchmark_summary.print_full_summary(test_session_id)
print("\n")
benchmark_summary.print_benchmark_summary(test_session_id)