In [1]:
from data import Database
from data.repositories import (
    PreparedQuestionRepository,
    ModelResultRepository,
    MMULQuestionRepository,
    BatchJobRepository
)
from benchmarks import BenchmarkRunner, BenchmarkRegistry
from benchmarks.benchmark_summary import BenchmarkSummary
from ai_models import ModelRegistry

# --- Persistence ------------------------------------------------------------
# One Database handle is created, its tables are created, and every
# repository (plus the summary helper) is built on top of that same handle.
db = Database()
db.create_all_tables()
mmul_question_repository = MMULQuestionRepository(db)
prepared_question_repo = PreparedQuestionRepository(db)
model_result_repo = ModelResultRepository(db)
batch_job_repo = BatchJobRepository(db)
benchmark_summary = BenchmarkSummary(db)

# --- Run configuration ------------------------------------------------------
# Session id passed to the benchmark registry and to the summary printer.
test_session_id = 17112010002

# Names of the models to load.
# Alternative (all models): model_name_list = []
model_name_list = ["gpt-4o-mini"]  # only gpt-4o-mini

# --- Models -----------------------------------------------------------------
model_registry = ModelRegistry(model_name_list)
model_registry.register_production_models()
model_registry.print_loaded_models()

# --- Benchmarks -------------------------------------------------------------
benchmark_registry = BenchmarkRegistry(
    mmul_question_repository,
    prepared_question_repo,
    model_result_repo,
    batch_job_repo,
    test_session_id=test_session_id,
    max_tests_per_benchmark=10,
)
benchmark_registry.register_mmul_benchmarks()

# --- Estimation -------------------------------------------------------------
# Build the runner from both registries and produce the estimates before
# anything is actually executed (execution happens in a later cell).
runner = BenchmarkRunner(model_registry, benchmark_registry)
runner.estimate_model_results()

# Alternative report:
# benchmark_summary.print_full_summary(test_session_id)
benchmark_summary.print_benchmark_summary(test_session_id)


Skipping model claude-3-5-sonnet-20240620 as it's not in the model_name_list
Skipping model claude-3-opus-20240229 as it's not in the model_name_list
Skipping model claude-3-sonnet-20240229 as it's not in the model_name_list
Skipping model claude-3-haiku-20240307 as it's not in the model_name_list
Skipping model gpt-4o as it's not in the model_name_list
Skipping model gpt-4-turbo as it's not in the model_name_list
Skipping model gpt-4 as it's not in the model_name_list
Skipping model gpt-3.5-turbo-0125 as it's not in the model_name_list
Loaded Models:
- gpt-4o-mini

Total loaded models: 1

Filtered by model list: gpt-4o-mini
Data already exists for benchmark MMUL-0Shot and session 17112010001
Estimation already exists for benchmark MMUL-0Shot, session 17112010001, and model gpt-4o-mini
Data already exists for benchmark MMUL-5Shot and session 17112010001
Estimation already exists for benchmark MMUL-5Shot, session 17112010001, and model gpt-4o-mini
Benchmark Summary
     Benchmark       

In [2]:
# Step 1: run the benchmarks in batch mode.
print("\n")
runner.run_benchmarks(in_batch=True)

# Step 2: check the batch jobs and process any results.
print("\n")
runner.check_and_process_batch_results()

# Step 3: print the per-benchmark summary for this session.
# Alternative report:
# benchmark_summary.print_full_summary(test_session_id)
print("\n")
benchmark_summary.print_benchmark_summary(test_session_id)



Data already exists for benchmark MMUL-0Shot and session 17112010001
Batch job already exists for Benchmark[MMUL-0Shot], Session[17112010001], model[gpt-4o-mini]
Data already exists for benchmark MMUL-5Shot and session 17112010001
Batch job already exists for Benchmark[MMUL-5Shot], Session[17112010001], model[gpt-4o-mini]


Data already exists for benchmark MMUL-0Shot and session 17112010001
Batch status: completed
Data already exists for benchmark MMUL-5Shot and session 17112010001
Batch status: completed


Benchmark Summary
     Benchmark              Model                Queries             Avg Score       Total Execution Time      Est Tokens           Act Tokens            Est Cost             Act Cost      
     MMUL-0Shot          gpt-4o-mini              570                 0.7754                0.0000                61935                65688               0.0095               0.0051       
     MMUL-5Shot          gpt-4o-mini              570                 0.7526          

In [3]:
from openai import OpenAI
import os
from dotenv import load_dotenv
from colorama import Fore, Back, Style, init

# Initialise colorama.
init()

# Load environment variables from the .env file.
load_dotenv()

# Create the OpenAI client; the API key is read from the OPENAI_API_KEY
# environment variable (None is passed through if it is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def get_batch_info(batch_id):
    """Retrieve one batch through the module-level OpenAI ``client``.

    Args:
        batch_id: Identifier handed to ``client.batches.retrieve``.

    Returns:
        Whatever ``client.batches.retrieve`` returns, or ``None`` when the
        call raises any ``Exception``. In the failure case the error is
        reported on stdout instead of being propagated.
    """
    try:
        batch = client.batches.retrieve(batch_id)
    except Exception as e:
        # Best effort by design: report the failure and let the caller skip
        # this batch rather than aborting the whole loop.
        print(f"Wystąpił błąd podczas pobierania informacji o batchu {batch_id}: {e}")
        return None
    else:
        return batch

def display_batch_progress(batch_info):
    """Print a text progress report for a single batch.

    The report contains the batch id and status, the total request count, a
    50-character colour bar (green = completed, red = failed, yellow = the
    remainder), the raw counts and the same counts as percentages, followed
    by a 60-character separator line.

    When the batch reports no requests (``request_counts`` is ``None`` or its
    ``total`` is 0) the bar and the percentages are skipped, because they
    would require dividing by zero; the header, the raw counts and the
    separator are still printed.

    Args:
        batch_info: Object exposing ``id``, ``status`` and ``request_counts``
            (with ``total``, ``completed`` and ``failed``). A falsy value
            makes the function return immediately without printing.

    Returns:
        None. All output goes to stdout.
    """
    if not batch_info:
        return

    counts = batch_info.request_counts
    if counts is None:
        # NOTE(review): presumably possible for a freshly created batch;
        # treated as "nothing counted yet" instead of raising AttributeError.
        total = completed = failed = 0
    else:
        total = counts.total
        completed = counts.completed
        failed = counts.failed
    in_progress = total - completed - failed

    print(f"Batch ID: {batch_info.id}")
    print(f"Status: {batch_info.status}")
    print(f"Total requests: {total}")

    if total <= 0:
        # No denominator for the bar or the percentages; show what we have.
        print("Progress: n/a (no requests counted yet)")
        print(f"Completed: {completed} | Failed: {failed} | In Progress: {in_progress}")
        print("-" * 60)
        return

    bar_length = 50
    completed_length = int(completed / total * bar_length)
    failed_length = int(failed / total * bar_length)
    # The in-progress segment takes the remainder so the bar is always
    # exactly bar_length wide despite the truncation above.
    in_progress_length = bar_length - completed_length - failed_length

    progress_bar = (
        Fore.GREEN + Back.GREEN + " " * completed_length +
        Fore.RED + Back.RED + " " * failed_length +
        Fore.YELLOW + Back.YELLOW + " " * in_progress_length +
        Style.RESET_ALL
    )

    print(f"Progress: [{progress_bar}]")
    print(f"Completed: {completed} | Failed: {failed} | In Progress: {in_progress}")

    # Percentages are added for readability.
    completed_percent = (completed / total) * 100
    failed_percent = (failed / total) * 100
    in_progress_percent = (in_progress / total) * 100

    print(f"Completed: {completed_percent:.2f}% | Failed: {failed_percent:.2f}% | In Progress: {in_progress_percent:.2f}%")
    print("-" * 60)

# Batch IDs to report on.
batch_ids = [
    "batch_67152006844c8190ac2d4f9a677d7fd1",
    "batch_671520098fdc8190828ed4e7f2aa5465",
]

# Fetch each batch in turn and print its progress report; batches that
# could not be retrieved (get_batch_info returned a falsy value) are skipped.
for batch_id in batch_ids:
    batch_info = get_batch_info(batch_id)
    if not batch_info:
        continue
    display_batch_progress(batch_info)


Batch ID: batch_67152006844c8190ac2d4f9a677d7fd1
Status: completed
Total requests: 570
Progress: [                                                  ]
Completed: 570 | Failed: 0 | In Progress: 0
Completed: 100.00% | Failed: 0.00% | In Progress: 0.00%
------------------------------------------------------------
Batch ID: batch_671520098fdc8190828ed4e7f2aa5465
Status: completed
Total requests: 570
Progress: [                                                  ]
Completed: 570 | Failed: 0 | In Progress: 0
Completed: 100.00% | Failed: 0.00% | In Progress: 0.00%
------------------------------------------------------------
