In [1]:
# Build the test session id from the current local time, rendered as
# yyyymmddhhmmss (year, month, day, hour, minute, second).
import datetime

# format() on a datetime applies the given strftime pattern.
test_session_id = format(datetime.datetime.now(), "%Y%m%d%H%M%S")

print(f"Test session id: {test_session_id}")


Test session id: 20241029123135


In [2]:
from data import Database
from data.repositories import (
    PreparedQuestionRepository,
    ModelResultRepository,
    MMULQuestionRepository,
    BatchJobRepository
)
from benchmarks import BenchmarkRunner, BenchmarkRegistry
from benchmarks.benchmark_summary import BenchmarkSummary
from ai_models import ModelRegistry, BatchProgressManager

# --- Run configuration -------------------------------------------------------
# NOTE(review): this fixed integer replaces the test_session_id generated by the
# first cell (which is a string) -- presumably to resume an earlier session; confirm.
test_session_id = 20241027212309

# Models to load. Alternatives are kept for quick switching:
# model_name_list = [] # all models
# model_name_list = ["gpt-4o-mini", "claude-3-haiku-20240307"]
model_name_list = ["gpt-4o-mini"] # only gpt-4o-mini

# --- Database and repositories ----------------------------------------------
db = Database()
db.create_all_tables()
mmul_question_repository = MMULQuestionRepository(db)
prepared_question_repo = PreparedQuestionRepository(db)
model_result_repo = ModelResultRepository(db)
batch_job_repo = BatchJobRepository(db)
benchmark_summary = BenchmarkSummary(db)
batch_manager = BatchProgressManager(batch_job_repo)

# --- Model registry, restricted to model_name_list ----------------------------
model_registry = ModelRegistry(model_name_list)
model_registry.register_production_models()
model_registry.print_loaded_models()

# --- Benchmark registry for this session --------------------------------------
benchmark_registry = BenchmarkRegistry(
    mmul_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
    test_session_id=test_session_id,
    max_tests_per_benchmark=10,
)
benchmark_registry.register_mmul_benchmarks()

# --- Runner: this cell only produces estimates; run_benchmarks is a later cell -
runner = BenchmarkRunner(model_registry, benchmark_registry)
runner.estimate_model_results()

# Show the per-benchmark summary for the session (the full summary stays disabled).
# benchmark_summary.print_full_summary(test_session_id)
benchmark_summary.print_benchmark_summary(test_session_id)


Skipping model claude-3-5-sonnet-20240620 as it's not in the model_name_list
Skipping model claude-3-opus-20240229 as it's not in the model_name_list
Skipping model claude-3-sonnet-20240229 as it's not in the model_name_list
Skipping model claude-3-haiku-20240307 as it's not in the model_name_list
Skipping model gpt-4o as it's not in the model_name_list
Skipping model gpt-4-turbo as it's not in the model_name_list
Skipping model gpt-4 as it's not in the model_name_list
Skipping model gpt-3.5-turbo-0125 as it's not in the model_name_list
Loaded Models:
- gpt-4o-mini

Total loaded models: 1

Filtered by model list: gpt-4o-mini
Data already exists for benchmark MMUL-0Shot and session 20241027212309
Estimation already exists for benchmark MMUL-0Shot, session 20241027212309, and model gpt-4o-mini
Data already exists for benchmark MMUL-5Shot and session 20241027212309
Estimation already exists for benchmark MMUL-5Shot, session 20241027212309, and model gpt-4o-mini
Benchmark Summary
     Benc

In [3]:
# Run the benchmarks held by the runner with in_batch=True (batch submission).
# NOTE(review): the output recorded for this cell contains "Error adding BatchJob":
# a list of batch ids was bound to the single batch_id column, which sqlite3
# rejects. The fix presumably belongs in the batch-job persistence code rather
# than in this cell -- confirm before relying on batch tracking.
runner.run_benchmarks(in_batch=True)

Data already exists for benchmark MMUL-0Shot and session 20241027212309
Created 4 batches
Error adding BatchJob: (sqlite3.ProgrammingError) Error binding parameter 4: type 'list' is not supported
[SQL: INSERT INTO batch_jobs (test_session_id, benchmark_name, model_name, batch_id, status, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?)]
[parameters: (20241027212309, 'MMUL-0Shot', 'gpt-4o-mini', ['batch_6720c79c632881908c1a1de4237d0659', 'batch_6720c79db52881909cef2a24a002fe87', 'batch_6720c79ed5f08190b47ec18998afeebc', 'batch_6720c79fd64c81909f73290d86de3319'], 'pending', '2024-10-29 11:31:43.953196', '2024-10-29 11:31:43.953196')]
(Background on this error at: https://sqlalche.me/e/20/f405)
Batch - Benchmark[MMUL-0Shot], Session[20241027212309], model[gpt-4o-mini] created with ID: ['batch_6720c79c632881908c1a1de4237d0659', 'batch_6720c79db52881909cef2a24a002fe87', 'batch_6720c79ed5f08190b47ec18998afeebc', 'batch_6720c79fd64c81909f73290d86de3319']
Data already exists for benchmark 

In [7]:
# Show batch progress for the current session, read from the database (going by
# the method name; no output was captured for this cell).
# NOTE(review): this cell's execution count (7) is higher than the next cell's (5),
# so the notebook was not last run top to bottom -- re-run in order to confirm.
batch_manager.show_batch_progress_from_db(test_session_id)

In [5]:
# Check the batch jobs and process whatever results they have produced.
runner.check_and_process_batch_results()

# Emit two newlines to separate the processing log from the summary table.
print(end="\n\n")

# Per-benchmark summary for the session (the full summary stays disabled).
# benchmark_summary.print_full_summary(test_session_id)
benchmark_summary.print_benchmark_summary(test_session_id)

Data already exists for benchmark MMUL-0Shot and session 20241027212309
Data already exists for benchmark MMUL-5Shot and session 20241027212309


Benchmark Summary
     Benchmark              Model                Queries            Avg Score       Total Execution Time      Est Tokens           Act Tokens            Est Cost             Act Cost      
     MMUL-0Shot          gpt-4o-mini              570                  None                 NaN                  64722                NaN                 0.0100                NaN         
     MMUL-5Shot          gpt-4o-mini              570                  None                 NaN                 342723                NaN                 0.0517                NaN         


