In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import time

# Load settings via python-dotenv before the os.getenv lookups used to build the client.
load_dotenv()

# Batch Files Directory for a given Category.
# Must be filled in before running the cells below: paths are built as
# f'{BATCH_DIR}/<name>', so leaving this empty makes every path start with '/'
# (i.e. resolve against the filesystem root).
BATCH_DIR = ""

In [2]:
# Organization / project identifiers come from the environment (None when unset).
org_id = os.environ.get('OPENAI_ORG_ID')
project_id = os.environ.get('OPENAI_PROJECT_ID')

# Shared API client used by every upload / batch call in this notebook.
client = OpenAI(organization=org_id, project=project_id)

In [3]:
# Upload every per-year / per-gap batch input file and record the returned
# file IDs in a JSON-lines details file (one file per year).
years = ['2019', '2020', '2021', '2022', '2023']
gaps = ['2', '4', '6', '8', '10']

for year in years:
    batch_details_file = f'{BATCH_DIR}/{year}_batch_details.json'
    # Start from a clean details file so re-running this cell does not append
    # duplicate entries to a previous run's output.
    if os.path.exists(batch_details_file):
        os.remove(batch_details_file)
    for gap in gaps:
        file_name = f'{BATCH_DIR}/{year}_{gap}_batch.json'
        # Use a context manager so the input file handle is closed as soon as
        # the upload call returns (previously the handle was never closed).
        with open(file_name, 'rb') as batch_input:
            batch_file = client.files.create(
                file=batch_input,
                purpose='batch'
            )

        # NOTE: despite the "_batch_id" key suffix, the stored value is the ID
        # of the uploaded *file*; later cells pass it as `input_file_id`.
        batch_details = {
            f"{year}_{gap}_batch_id": batch_file.id
        }
        # Append one JSON object per line, in the same order as `gaps`.
        with open(batch_details_file, 'a') as f:
            f.write(json.dumps(batch_details) + '\n')

### Manually queuing batches

In [None]:
# Choose the year to inspect; the details file path is derived from it.
year = '2023'
batch_details_file = f'{BATCH_DIR}/{year}_batch_details.json'

# The details file is JSON-lines: decode each line into its own dict.
batch_details = []
with open(batch_details_file, 'r') as f:
    batch_details.extend(map(json.loads, f))

# Display the parsed entries as the cell output.
batch_details

In [10]:
# Queue a single batch job for one (year, gap) pair.
year = '2023'
gap = '10'

# Look the uploaded file ID up by its key instead of by a hard-coded list
# position, so changing `gap` (or `year`) above cannot silently point at the
# wrong entry or fail with an unrelated index/key error.
batch_key = f'{year}_{gap}_batch_id'
batch_file_id = next(
    (entry[batch_key] for entry in batch_details if batch_key in entry),
    None,
)
if batch_file_id is None:
    raise KeyError(
        f"{batch_key} not found in the loaded batch details; "
        f"re-run the loading cell for year {year}"
    )

# Submit the batch against the chat completions endpoint.
batch_job = client.batches.create(
    input_file_id=batch_file_id,
    endpoint='/v1/chat/completions',
    completion_window="24h"
)

### Automatically queuing batches

In [4]:
def wait_for_batch_completion(client, batch_job_id, poll_interval=60):
    """
    Polls the status of the batch job until it reaches a terminal state.

    Besides "completed" and "failed", the Batch API can also end a job as
    "expired" or "cancelled"; these are treated as terminal too, so the loop
    cannot spin forever on a batch that will never complete.

    :param client: API client to interact with batch jobs
    :param batch_job_id: ID of the batch job to monitor
    :param poll_interval: Time (in seconds) between each status check
    :return: Terminal status of the batch job ("completed", "failed",
             "expired" or "cancelled")
    """
    while True:
        batch_status = client.batches.retrieve(batch_job_id).status

        if batch_status == 'completed':
            print(f"Batch {batch_job_id} completed successfully.")
            return 'completed'
        elif batch_status == 'failed':
            print(f"Batch {batch_job_id} failed.")
            return 'failed'
        elif batch_status in ('expired', 'cancelled'):
            # Terminal, non-successful states: stop polling and report them.
            print(f"Batch {batch_job_id} ended with status '{batch_status}'.")
            return batch_status
        elif batch_status == 'finalizing':
            print(f"Batch {batch_job_id} is finalizing, waiting for completion...")

        # Any other status (e.g. still validating / in progress / cancelling)
        # is non-terminal: wait and poll again.
        time.sleep(poll_interval)


In [3]:
# This cell queues all the batches at once (all years and year gaps). Set
# WAIT_FOR_EACH_YEAR to True to block on wait_for_batch_completion after each
# year's jobs are submitted, which limits how many batches are in flight.
# NOTE: The OpenAI API limits how many tokens can be enqueued through batch
# processing, so adjust as needed.

years = ['2019', '2020', '2021', '2022', '2023']
gaps = ['2', '4', '6', '8', '10']

# Opt-in throttle: False submits everything without waiting.
WAIT_FOR_EACH_YEAR = False

for year in years:
    batch_details_file = f'{BATCH_DIR}/{year}_batch_details.json'

    # The details file is JSON-lines with one {key: file_id} object per line.
    with open(batch_details_file, 'r') as f:
        batch_details = [json.loads(line) for line in f if line.strip()]

    # Merge the per-line objects so file IDs are looked up by key rather than
    # by their position in the file.
    file_ids = {key: value for entry in batch_details for key, value in entry.items()}

    # Submit one batch job per gap for this year.
    batch_jobs = []
    for gap in gaps:
        batch_file_id = file_ids[f'{year}_{gap}_batch_id']
        batch_jobs.append(
            client.batches.create(
                input_file_id=batch_file_id,
                endpoint='/v1/chat/completions',
                completion_window="24h"
            )
        )

    if WAIT_FOR_EACH_YEAR:
        # Block until every job of this year has reached a terminal state
        # before queuing the next year's batches.
        for batch_job in batch_jobs:
            wait_for_batch_completion(client, batch_job.id)
