This notebook is useful when a batch stopped midway and you know its ID, or when you want to check the status of all your batches.

Note: It may make sense to retrieve the latest batch first and then use this notebook with its ID.

In [2]:
import json
from openai import OpenAI
from os import getenv
from dotenv import load_dotenv
from pathlib import Path

from wikipedia_markdown.utils.yaml import load_yaml
from wikipedia_markdown.utils.database import update_llm_cleaned_row

# Load variables from a local .env file into the process environment.
load_dotenv()
openai_token = getenv("OPENAI_TOKEN")

# Hand the token read above to the client explicitly; previously it was read
# but never used. `or None` turns an unset or empty OPENAI_TOKEN into None, in
# which case the client falls back to its default OPENAI_API_KEY lookup.
client = OpenAI(api_key=openai_token or None)

## Check batches

In [3]:
# Fetch a single page of up to 50 batches.
batches = client.batches.list(limit=50)

# Print one compact line per batch so this cell actually reports something.
# `.data` holds only the page fetched above, so no further pages are requested.
for listed_batch in batches.data:
    description = (listed_batch.metadata or {}).get("description")
    print(listed_batch.id, listed_batch.status, description)



## Check status single batch

In [9]:
# ID of the batch to inspect; the download and DB-insert cells below reuse it.
batch_id = "batch_679a7779e2988190b956489961393a5e"

# Retrieve the batch object for that ID and print its full representation.
batch = client.batches.retrieve(batch_id)
print(batch)

Batch(id='batch_679a7779e2988190b956489961393a5e', completion_window='24h', created_at=1738176378, endpoint='/v1/chat/completions', input_file_id='file-QKsAqMeY5MCzZHTTX3U75S', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1738220649, error_file_id=None, errors=None, expired_at=None, expires_at=1738262778, failed_at=None, finalizing_at=1738220382, in_progress_at=1738176380, metadata={'description': 'Batch 2'}, output_file_id='file-FSPQ2qFc7XWnA4frf7MVrS', request_counts=BatchRequestCounts(completed=2067, failed=0, total=2067))


## Download single batch

In [10]:
# Everything is resolved relative to the directory one level above this notebook
base_path = Path("../")

# Project settings are read from config.yaml in that directory
config_path = base_path / "config.yaml"
config = load_yaml(config_path)

# Folder that receives downloaded batch outputs; created if it does not exist yet
results_path = base_path / config["openai_batch_job_results_path"]
results_path.mkdir(parents=True, exist_ok=True)

# Fetch the batch again so `output_file_id` reflects its current state
batch = client.batches.retrieve(batch_id)
if batch.output_file_id:
    # Pull the raw content of the batch output file
    file_response = client.files.content(batch.output_file_id)

    # The target file is named after the batch ID
    result_file_name = f"batch_result_{batch_id}.jsonl"
    result_file_path = results_path / result_file_name

    # Write the bytes in one go (overwrites any previous download)
    result_file_path.write_bytes(file_response.content)

    print(f"Batch job result saved to {result_file_path}")
else:
    print(f"No output file found for batch {batch_id}. Skipping.")

Batch job result saved to ../openai_batch_job_results/batch_result_batch_679a7779e2988190b956489961393a5e.jsonl


## Insert in DB results of single batch

In [11]:
def process_jsonl_and_update_db(
    jsonl_file_path: Path, db_path: Path, debug: bool = False
):
    """
    Process a `.jsonl` file of batch results and update the database.

    Each non-blank line is parsed as a JSON object. For every row that holds a
    completion, the first choice's message content, the completion token count
    and the model name are written to the database row whose ID is the row's
    `custom_id`, via `update_llm_cleaned_row`.

    Blank lines are ignored. Rows without a usable completion (missing
    `response`, a `status_code` other than 200, or no `choices`) are skipped
    instead of aborting the whole file; a one-line summary is printed if any
    row was skipped.

    Args:
        jsonl_file_path (Path): Path to the `.jsonl` file containing batch job results.
        db_path (Path): Path to the SQLite database file.
        debug (bool): If True, print debug messages, including one message per
            skipped row (default: False).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If a processed row's `custom_id` is not an integer string.
    """
    skipped = 0

    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            line = line.strip()
            if not line:
                continue

            # Parse each line as a JSON object
            row = json.loads(line)

            # `response` may be null or lack a completion for a failed request;
            # normalise to empty containers so the check below cannot raise.
            response = row.get("response") or {}
            response_body = response.get("body") or {}
            choices = response_body.get("choices") or []

            # A row without `status_code` is treated as successful so that
            # input accepted before this check existed is still processed.
            if response.get("status_code", 200) != 200 or not choices:
                skipped += 1
                if debug:
                    print(
                        f"Skipping line {line_number} "
                        f"(custom_id={row.get('custom_id')}): no successful completion."
                    )
                continue

            # Extract the required data
            custom_id = int(row["custom_id"])  # The ID of the row in the database
            assistant_content = choices[0]["message"]["content"]  # Assistant's response
            completion_tokens = response_body["usage"]["completion_tokens"]  # Token count
            model = response_body["model"]  # Model used for processing

            # Update the database row
            update_llm_cleaned_row(
                db_path=db_path,
                id=custom_id,
                model=model,
                llm_cleaned_text=assistant_content,
                llm_cleaned_text_tokens=completion_tokens,
                debug=debug,
            )

    # Always surface skipped rows so failures are not silently lost
    if skipped:
        print(f"Skipped {skipped} row(s) without a successful completion.")


In [12]:
# Construct the full path to the database file
data_folder = base_path / config["data_folder"]
db_file = config["db_file"]
db_path = data_folder / db_file

# Rebuild the result path from the batch ID so this cell does not rely on
# which branch the download cell took
result_file_name = f"batch_result_{batch_id}.jsonl"
result_file_path = results_path / result_file_name

# If the batch had no output file, nothing was downloaded; report that
# instead of failing with FileNotFoundError when opening the file.
if not result_file_path.is_file():
    print(f"Result file {result_file_path} not found. Skipping database update.")
else:
    # Process the `.jsonl` file and update the database
    process_jsonl_and_update_db(
        jsonl_file_path=result_file_path,
        db_path=db_path,
        debug=True,  # Set to True to print debug messages (one line per updated row)
    )

Row with id 73401 updated successfully with LLM-cleaned data.
Row with id 73407 updated successfully with LLM-cleaned data.
Row with id 73415 updated successfully with LLM-cleaned data.
Row with id 73423 updated successfully with LLM-cleaned data.
Row with id 73424 updated successfully with LLM-cleaned data.
Row with id 73425 updated successfully with LLM-cleaned data.
Row with id 73429 updated successfully with LLM-cleaned data.
Row with id 73430 updated successfully with LLM-cleaned data.
Row with id 73440 updated successfully with LLM-cleaned data.
Row with id 73441 updated successfully with LLM-cleaned data.
Row with id 73443 updated successfully with LLM-cleaned data.
Row with id 73449 updated successfully with LLM-cleaned data.
Row with id 73454 updated successfully with LLM-cleaned data.
Row with id 73455 updated successfully with LLM-cleaned data.
Row with id 73458 updated successfully with LLM-cleaned data.
Row with id 73462 updated successfully with LLM-cleaned data.
Row with

In [13]:
# Request cancellation of a batch. NOTE: this ID is hard-coded and is a
# different batch from `batch_id` used in the cells above; the returned Batch
# object is shown as the cell output.
client.batches.cancel("batch_679b3785eea08190954b1e082303d987")

Batch(id='batch_679b3785eea08190954b1e082303d987', completion_window='24h', created_at=1738225542, endpoint='/v1/chat/completions', input_file_id='file-A3WUqMj3VuWssfM78AJ7qE', object='batch', status='cancelling', cancelled_at=None, cancelling_at=1738226848, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1738311942, failed_at=None, finalizing_at=None, in_progress_at=1738225543, metadata={'description': 'Batch 1'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=2054))