In [1]:
# Derive this run's test session id from the current local time, rendered as
# a 14-digit yyyymmddhhmmss string (year, month, day, hour, minute, second).
import datetime

# format() on a datetime with a non-empty spec is equivalent to strftime().
test_session_id = format(datetime.datetime.now(), "%Y%m%d%H%M%S")
print(f"Test session id: {test_session_id}")

Test session id: 20241107224505


In [2]:
from data import Database
from data.repositories import (
    PreparedQuestionRepository,
    ModelResultRepository,
    MMLUQuestionRepository,
    BatchJobRepository,
    GSM8KQuestionRepository,
    BBHQuestionRepository
)
from benchmarks import BenchmarkRunner, BenchmarkRegistry
from benchmarks.benchmark_summary import BenchmarkSummary
from ai_models import ModelRegistry, BatchProgressManager

# Initialize database
# A single Database handle is created here and shared by every repository
# below. create_all_tables() runs before any repository is constructed.
# NOTE(review): presumably safe to call on an already-populated database,
# since this cell is re-run when resuming a session — confirm.
db = Database()
db.create_all_tables()
# Question repositories, one per benchmark family (MMLU, GSM8K, BBH).
mmlu_question_repository = MMLUQuestionRepository(db)
gsm8k_question_repository = GSM8KQuestionRepository(db)
bbh_question_repository = BBHQuestionRepository(db)
# Repositories for prepared questions, model results and batch jobs; these
# are handed to BenchmarkRegistry later in this cell.
prepared_question_repo = PreparedQuestionRepository(db)
model_result_repo = ModelResultRepository(db)
batch_job_repo = BatchJobRepository(db)
# Helpers used by later cells: summary printing and batch progress display.
benchmark_summary = BenchmarkSummary(db)
batch_manager = BatchProgressManager(batch_job_repo)

# Session to operate on. The first cell generates a fresh id (a string from
# strftime) on every run; pin a previous id here to resume that session, for
# example to poll a batch that was submitted earlier. Set RESUME_SESSION_ID to
# None to keep the freshly generated id.
# The pinned id is kept as a string so fresh and resumed runs pass the same
# type to the repositories.
# NOTE(review): if the schema stores the session id as an integer, confirm the
# repositories coerce it; the previous code pinned an int literal here.
RESUME_SESSION_ID = "20241107224505"
if RESUME_SESSION_ID is not None:
    test_session_id = str(RESUME_SESSION_ID)
# Model selection. Exactly one assignment below should be active; the
# commented lines are alternative selections kept for quick switching.
# model_name_list = [] # all models
# model_name_list = ["gpt-4o-mini"] # only gpt-4o-mini
# model_name_list = ["gpt-4o-mini", "claude-3-haiku-20240307"]
# model_name_list = ["gemini-1.5-flash-002"]  # only gemini-1.5-flash
# model_name_list = ["claude-3-haiku-20240307"]
model_name_list = ["gemini-1.5-flash-002"]
# model_name_list = ["gemini-1.5-flash-002", "gpt-4o-mini", "claude-3-haiku-20240307"]

# Register models
# The registry is given the selection above; per the recorded output, models
# not in model_name_list are skipped with a message and the loaded ones are
# then listed.
model_registry = ModelRegistry(model_name_list)
model_registry.register_production_models()
model_registry.print_loaded_models()


# Benchmarks to run in this session. Alternatives kept for quick switching:
#   benchmark_name_list = ["MMLU-0Shot", "MMLU-5Shot"]
#   benchmark_name_list = ["GSM8K-0Shot"]
benchmark_name_list = ["BBH-3Shot"]

# Build the benchmark registry from the question/result/batch repositories,
# the active session id and the benchmark selection, then register and list
# the benchmarks.
# NOTE(review): the recorded output reports 6511 prepared tests for BBH-3Shot
# although max_tests_per_benchmark is 20 — confirm the cap is applied, or that
# the saved output is simply stale.
benchmark_registry = BenchmarkRegistry(
    mmlu_question_repository,
    gsm8k_question_repository,
    bbh_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
    test_session_id=test_session_id,
    max_tests_per_benchmark=20,
    benchmark_name_list=benchmark_name_list,
)
benchmark_registry.register_benchmarks()
benchmark_registry.print_loaded_benchmarks()

Skipping model claude-3-5-sonnet-20240620 as it's not in the model_name_list
Skipping model claude-3-opus-20240229 as it's not in the model_name_list
Skipping model claude-3-sonnet-20240229 as it's not in the model_name_list
Skipping model claude-3-haiku-20240307 as it's not in the model_name_list
Skipping model gpt-4o as it's not in the model_name_list
Skipping model gpt-4o-mini as it's not in the model_name_list
Skipping model gpt-4-turbo as it's not in the model_name_list
Skipping model gpt-4 as it's not in the model_name_list
Skipping model gpt-3.5-turbo-0125 as it's not in the model_name_list
Skipping model gemini-1.5-flash-001 as it's not in the model_name_list
Skipping model gemini-1.5-pro-002 as it's not in the model_name_list
Skipping model gemini-1.5-pro-001 as it's not in the model_name_list
Skipping model gemini-1.0-pro-002 as it's not in the model_name_list
Skipping model gemini-1.0-pro-001 as it's not in the model_name_list
Loaded Models:
- gemini-1.5-flash-002

Total loa

In [3]:
# Create the benchmark runner from the model and benchmark registries, then
# estimate results for the session.
# NOTE(review): judging by the recorded output, estimate_model_results() also
# downloads/extracts the benchmark data and prepares the tests — confirm.
runner = BenchmarkRunner(model_registry, benchmark_registry)
runner.estimate_model_results()

# Print the per-benchmark summary for the active session. The full-summary
# call is kept as a commented alternative.
# benchmark_summary.print_full_summary(test_session_id)
benchmark_summary.print_benchmark_summary(test_session_id)

Data file already exists. Skipping download.
Extracting data...


Extraction completed.
Data file already exists. Skipping download.
Extracting data...
Extraction completed.
Preparing 6511 tests for benchmark BBH-3Shot, session 20241107223249, 3-shot
Finished preparing 6511 tests for benchmark BBH-3Shot
Benchmark Summary
     Benchmark              Model                Queries            Avg Score       Total Execution Time      Est Tokens           Act Tokens            Est Cost             Act Cost      
     BBH-3Shot       gemini-1.5-flash-002         6511                 None                 NaN                11986566               NaN                 5.9099                NaN         




In [4]:
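# Submission step, intentionally left commented out in this saved notebook.
# NOTE(review): presumably disabled so that re-running the notebook against a
# pinned session does not submit the batch again — confirm, then uncomment to
# actually run the benchmarks in batch mode.
# runner.run_benchmarks(in_batch=True)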

In [5]:
# Show batch progress for the active session, read from the database.
batch_manager.show_batch_progress_from_db(test_session_id)

In [6]:
# Check for batch results and process them, then print the per-benchmark
# summary again for the active session.
# NOTE(review): in the recorded output the score and actual token/cost
# columns are still None/NaN after this step, so no results appear to have
# been processed at the time the notebook was saved.
runner.check_and_process_batch_results()
# benchmark_summary.print_full_summary(test_session_id)
# Emit blank lines to separate the runner's messages from the summary table.
print("\n")
benchmark_summary.print_benchmark_summary(test_session_id)

Data already exists for benchmark BBH-3Shot and session 20241107223249


Benchmark Summary
     Benchmark              Model                Queries            Avg Score       Total Execution Time      Est Tokens           Act Tokens            Est Cost             Act Cost      
     BBH-3Shot       gemini-1.5-flash-002         6511                 None                 NaN                11986566               NaN                 5.9099                NaN         


