In [1]:
# Derive a session identifier from the current local time, laid out as
# yyyymmddhhmmss (year, month, day, hour, minute, second).
import datetime
# format() on a datetime applies the strftime pattern given as the format spec.
test_session_id = format(datetime.datetime.now(), "%Y%m%d%H%M%S")
print(f"Test session id: {test_session_id}")

Test session id: 20241110104819


In [2]:
from data import Database
from data.repositories import (
    PreparedQuestionRepository,
    ModelResultRepository,
    MMLUQuestionRepository,
    BatchJobRepository,
    GSM8KQuestionRepository,
    BBHQuestionRepository
)
from benchmarks import BenchmarkRunner, BenchmarkRegistry
from benchmarks.benchmark_summary import BenchmarkSummary
from ai_models import ModelRegistry, BatchProgressManager

# Open the project database; create_all_tables() is called before any
# repository is built on top of it.
db = Database()
db.create_all_tables()

# One repository object per question set / result store, all sharing `db`.
# The right-hand tuple is evaluated left to right, so the constructors run in
# the same order as the names on the left.
(
    mmlu_question_repository,
    gsm8k_question_repository,
    bbh_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
) = (
    MMLUQuestionRepository(db),
    GSM8KQuestionRepository(db),
    BBHQuestionRepository(db),
    PreparedQuestionRepository(db),
    ModelResultRepository(db),
    BatchJobRepository(db),
)

# BenchmarkSummary is handed the database directly; BatchProgressManager is
# handed the batch-job repository instead.
benchmark_summary = BenchmarkSummary(db)
batch_manager = BatchProgressManager(batch_job_repo)

# NOTE(review): this rebinds test_session_id. The first cell generates a
# timestamp *string*; here it is replaced by a fixed *integer* literal,
# presumably to re-open a previously recorded session — confirm that both the
# override and the str/int difference are intended.
test_session_id = 20241108214342

# Model filter handed to ModelRegistry; an empty list means all models.
# Alternative filters that can be swapped in:
#   ["gpt-4o-mini"]
#   ["claude-3-haiku-20240307"]
#   ["gemini-1.5-flash-002"]
#   ["gpt-4o-mini", "claude-3-haiku-20240307"]
#   ["gemini-1.5-flash-002", "gpt-4o-mini", "claude-3-haiku-20240307"]
model_name_list = list()

# Build the registry with the filter, register the production models, then
# print which models ended up loaded.
model_registry = ModelRegistry(model_name_list)
model_registry.register_production_models()
model_registry.print_loaded_models()


# Benchmark filter handed to BenchmarkRegistry; an empty list means all
# benchmarks. Alternative filters that can be swapped in:
#   ["MMLU-0Shot", "MMLU-5Shot"]
#   ["GSM8K-0Shot"]
#   ["BBH-3Shot"]
benchmark_name_list = list()

# Build the registry from the six repositories (positional, in this order)
# plus the session id and the benchmark filter (by keyword).
benchmark_registry = BenchmarkRegistry(
    mmlu_question_repository,
    gsm8k_question_repository,
    bbh_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
    test_session_id=test_session_id,
    benchmark_name_list=benchmark_name_list,
)

# Register the benchmarks, then print which ones ended up loaded.
benchmark_registry.register_benchmarks()
benchmark_registry.print_loaded_benchmarks()

Loaded Models:
- gpt-4o
- gpt-4o-mini
- gpt-4-turbo
- gpt-4
- gpt-3.5-turbo-0125
- gemini-1.5-flash-002
- gemini-1.5-flash-001
- gemini-1.5-pro-002
- gemini-1.5-pro-001
- gemini-1.0-pro-002
- gemini-1.0-pro-001

Total loaded models: 11

No model list filter applied.
Loaded Benchmarks:
- mmlu-0shot
- mmlu-5shot
- gsm8k-0shot
- gsm8k-4shot
- bbh-0shot
- bbh-3shot

Total loaded benchmarks: 6

No benchmark list filter applied.


In [3]:
# Create the benchmark runner from the model and benchmark registries.
# Only the constructor is active in this cell: the estimate call below and the
# run_benchmarks call in the next cell are both commented out.
runner = BenchmarkRunner(model_registry, benchmark_registry)
# runner.estimate_model_results()

# Optional summaries for the current test_session_id (disabled):
# benchmark_summary.print_full_summary(test_session_id)
# benchmark_summary.print_benchmark_summary(test_session_id)

In [4]:
#runner.run_benchmarks(in_batch=True)

In [5]:
#batch_manager.show_batch_progress_from_db(test_session_id)

In [6]:
#runner.check_and_process_batch_results()
# benchmark_summary.print_full_summary(test_session_id)
#print("\n")
#benchmark_summary.print_benchmark_summary(test_session_id)

In [7]:
# Reporting calls on benchmark_summary for the current test_session_id.
# Only the last call is active; the others are kept as disabled alternatives.
#
# NOTE(review): the output recorded for this cell is
#   AttributeError: 'BenchmarkSummary' object has no attribute '_get_model_performance_data'
# so the active call reaches code that expects a helper BenchmarkSummary does
# not define (presumably inside plot_openai_vs_google_comparison). The fix
# most likely belongs in benchmarks.benchmark_summary rather than in this
# notebook — TODO confirm and re-run this cell afterwards.
# benchmark_summary.print_full_summary(test_session_id)
# benchmark_summary.save_full_summary_to_excel(test_session_id, "full_summary.xlsx")
# benchmark_summary.create_all_plots(test_session_id)
# benchmark_summary.plot_cost_analysis(test_session_id)
# benchmark_summary.plot_cost_effectiveness(test_session_id)
# benchmark_summary.analyze_few_shot_impact(test_session_id)
# benchmark_summary.plot_top_cost_effective_models(test_session_id)
# benchmark_summary.analyze_openai_models_comparison(test_session_id)
benchmark_summary.plot_openai_vs_google_comparison(test_session_id)

AttributeError: 'BenchmarkSummary' object has no attribute '_get_model_performance_data'