In [21]:
# 1. Create jsonl file with requests for batch completion

import json
import tiktoken
import os
from generate_stories import create_simple_story_prompt, iterate_params

MODEL_PARAMETERS = {"top_p": 0.07}

def get_batch_dataset(num_completions, model, offset=0):
    """Yield successive lists of JSON-encoded batch request lines, forever.

    Args:
        num_completions: Number of request lines in each yielded list.
        model: Model name; used to pick the tiktoken encoding and written
            into every request body.
        offset: Number of items from ``iterate_params()`` to discard before
            the first request is built.

    Yields:
        A list of ``num_completions`` strings, each a JSON object with
        ``custom_id``, ``method``, ``url`` and ``body`` keys.

    Side effects:
        Prints the summed prompt token count once per yielded list.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    param_stream = iterate_params()

    # Discard the first `offset` parameter sets one at a time.
    skipped = 0
    while skipped < offset:
        next(param_stream)
        skipped += 1

    while True:
        token_count = 0
        requests = []
        for index in range(num_completions):
            params = next(param_stream)
            # A copy is handed over so `params` itself is what ends up in custom_id.
            prompt, expected_stories = create_simple_story_prompt(params.copy())
            token_count += len(tokenizer.encode(prompt))

            # custom_id = position within this batch + JSON dump of the parameters,
            # so the parameters can be parsed back out of the batch output later.
            id_payload = params | {"expected_num_stories_in_completion": expected_stories}
            request = {
                "custom_id": f"{index}{json.dumps(id_payload)}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    **MODEL_PARAMETERS,
                },
            }
            requests.append(json.dumps(request))

        print(f"Total input tokens: {token_count}")

        yield requests

def write_batch_completion_file(num_completions, model, base_filename, offset=0):
    """Write one batch request file per iteration and yield its path.

    Args:
        num_completions: Number of request lines per file (forwarded to
            ``get_batch_dataset``).
        model: Model name (forwarded to ``get_batch_dataset``).
        base_filename: Path prefix; files are named
            ``{base_filename}_{n}.jsonl`` with ``n`` counting from 1.
        offset: Number of parameter sets to skip (forwarded to
            ``get_batch_dataset``).

    Yields:
        The path of the file that was just written.
    """
    batches = get_batch_dataset(num_completions, model, offset)
    for counter, lines in enumerate(batches, start=1):
        # Always derive the name from base_filename. Reusing the previous
        # filename as the prefix produced "batch_1.jsonl_2.jsonl", ... .
        filename = f"{base_filename}_{counter}.jsonl"
        with open(filename, "w") as fp:
            fp.write("\n".join(lines))
        yield filename
    
# Make sure the output root exists. exist_ok=True replaces the separate
# existence check (no check-then-create race) and matches how the run
# directories are created further down in this notebook.
os.makedirs("data", exist_ok=True)

Total input tokens: 1663194
{"custom_id": "0{\"theme\": \"Dreams\", \"topic\": \"royal kingdoms\", \"style\": \"tragic\", \"feature\": \"a MacGuffin\", \"grammar\": \"anaphora\", \"num_paragraphs\": 1, \"expected_num_stories_in_completion\": 13}", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "Write a short story (1 paragraphs) using very basic words that a preschool child could understand. \nThe story should be about Dreams, include royal kingdoms, be tragic in its writing style and ideally feature a MacGuffin. The most important thing is to write an engaging easy story, but where it makes sense, demonstrate the use of anaphora. If you need to use proper names, make them from space-separated common words. Either don't give characters a name, or select from Mia, Alex, Jean, Samuel, Lily, Leo, Jose, Kim, Alice, Lena, Rita, Emmanuel, Anne, Peter, Maria or Luis. Complex story structure is great, but please rememb

In [22]:
# 2. Execute Batch Jobs

import os
import time
from datetime import datetime
from openai import OpenAI
from tqdm import tqdm
        
# Target number of completions for the whole run; the submission loop below
# keeps creating batch jobs until this many have completed.
NUM_COMPLETIONS = 100_000
# Number of requests written into each batch file / submitted per batch job.
NUM_COMPLETIONS_PER_REQUEST = 10_000 # Calculate this based on rate limits, to be checked at https://platform.openai.com/settings/organization/limits
# Model name written into every request body and used to select the tokenizer.
MODEL = "gpt-4o-mini"
# The submission loop gives up after this many consecutive failed batches.
MAX_RETRIES = 3

# The API key is read from the environment; a missing variable raises KeyError here.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY_SIMPLESTORIES"])

def check_batch_status(batch_id, batch_number, directory):
    """Poll a batch job until it reaches a terminal state.

    The job is retrieved through the module-level ``client`` every 30
    seconds. Each poll prints the raw status followed by a human-readable
    explanation.

    Args:
        batch_id: Identifier of the batch job to poll.
        batch_number: Passed through to ``download_batch_results``.
        directory: Passed through to ``download_batch_results``.

    Returns:
        True once the job is "completed" (after the results were downloaded),
        False if it ends as "failed", "expired" or "cancelled".
    """
    # States in which the job is still moving: report and poll again.
    waiting_messages = {
        "validating": "The input file is being validated. Please wait...",
        "in_progress": "The batch is currently being processed. Please wait...",
        "finalizing": "The batch is completed and the results are being prepared.",
        "cancelling": "The batch is being cancelled. Please wait...",
    }
    # Terminal states without results: report and give up.
    failure_messages = {
        "failed": "The input file has failed validation.",
        "expired": "The batch was not completed within the 24-hour time window.",
        "cancelled": "The batch was cancelled.",
    }
    poll_interval_seconds = 30

    while True:
        batch = client.batches.retrieve(batch_id)
        state = batch.status
        print(f"Batch status: {state}")

        if state == "completed":
            print("The batch is complete, downloading the results...")
            download_batch_results(batch.output_file_id, batch_number, directory)
            return True

        if state in failure_messages:
            print(failure_messages[state])
            return False

        # Anything not recognised is reported as unknown and polled again.
        print(waiting_messages.get(state, "Unknown status encountered."))
        time.sleep(poll_interval_seconds)
        
def download_batch_results(output_file_id, batch_number, directory):
    """Record a batch output file id and save that file's content locally.

    Args:
        output_file_id: Id of the output file to fetch via the module-level
            ``client``.
        batch_number: Used in the local file name ``batch_data_{n}.jsonl``.
        directory: Directory receiving both the id log and the results file.

    Side effects:
        Appends the id to ``output_file_ids.txt`` and (over)writes
        ``batch_data_{batch_number}.jsonl`` inside ``directory``.
    """
    # Log the id first so it is kept even if the download below fails.
    id_log_path = os.path.join(directory, "output_file_ids.txt")
    with open(id_log_path, "a") as id_log:
        id_log.write(output_file_id + "\n")

    response = client.files.content(output_file_id)

    results_path = f"{directory}/batch_data_{batch_number}.jsonl"
    with open(results_path, 'w') as results_file:
        results_file.write(response.text)
# Running state for the submission loop below.
total_completions = 0
batch_number = 0
consecutive_failures = 0

# Every run writes into its own timestamped directory under data/.
directory = os.path.join("data", f"batches_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}")
os.makedirs(directory, exist_ok=True)
os.makedirs(os.path.join(directory, "prompts"), exist_ok=True)

base_filename = os.path.join(directory, "prompts", "batch")
# offset=10_000 skips the first 10,000 parameter sets.
# NOTE(review): presumably those were consumed by an earlier run - confirm before re-running.
batch_writer_iter = write_batch_completion_file(NUM_COMPLETIONS_PER_REQUEST, MODEL, base_filename, offset=10_000)
with tqdm(total=NUM_COMPLETIONS, desc="Batch Generation") as pbar:
    # Batches are submitted one at a time; stop when the target is reached
    # or after MAX_RETRIES failures in a row.
    while total_completions < NUM_COMPLETIONS and consecutive_failures < MAX_RETRIES:
        try:
            # 1. Write the batch completion file
            batch_number += 1
            filename = next(batch_writer_iter)

            # 2. Upload the batch file
            # The handle is closed as soon as the upload call returns;
            # previously it was opened inline and never closed.
            with open(filename, "rb") as batch_file:
                batch_input_file = client.files.create(
                    file=batch_file,
                    purpose="batch"
                )
            batch_input_file_id = batch_input_file.id
            with open(os.path.join(directory, "input_file_ids.txt"), "a") as f:
                f.write(batch_input_file_id + "\n")

            # 3. Create the batch job
            batch_info = client.batches.create(
                input_file_id=batch_input_file_id,
                endpoint="/v1/chat/completions",
                completion_window="24h",
                metadata={
                    "description": f"Simple Stories Story Generation - batch {batch_number}, n={NUM_COMPLETIONS_PER_REQUEST}"
                }
            )

            batch_id = batch_info.id
            with open(os.path.join(directory, "batch_job_ids.txt"), "a") as f:
                f.write(batch_id + "\n")

            # 4. Check the status and download the results
            if check_batch_status(batch_id, batch_number, directory):
                total_completions += NUM_COMPLETIONS_PER_REQUEST
                pbar.update(NUM_COMPLETIONS_PER_REQUEST)
                consecutive_failures = 0
            else:
                consecutive_failures += 1

        except Exception as e:
            # Any error counts as a failed attempt; the loop moves on to the
            # next batch until MAX_RETRIES consecutive failures are reached.
            print(f"An error occurred: {e}")
            consecutive_failures += 1

    if consecutive_failures >= MAX_RETRIES:
        print(f"Stopping due to {MAX_RETRIES} consecutive failures.")

Batch Generation:   0%|          | 0/100000 [00:00<?, ?it/s]

Total input tokens: 1663233
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  10%|█         | 10000/100000 [49:24<7:24:37,  3.37it/s]

Total input tokens: 1663038
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  20%|██        | 20000/100000 [1:37:17<6:28:06,  3.44it/s]

Total input tokens: 1663496
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  30%|███       | 30000/100000 [2:26:42<5:42:27,  3.41it/s]

Total input tokens: 1663234
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  40%|████      | 40000/100000 [3:31:14<5:30:30,  3.03it/s]

Total input tokens: 1663117
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  50%|█████     | 50000/100000 [4:17:09<4:18:54,  3.22it/s]

Total input tokens: 1663300
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  60%|██████    | 60000/100000 [4:59:29<3:14:16,  3.43it/s]

Total input tokens: 1663337
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  70%|███████   | 70000/100000 [5:48:54<2:26:32,  3.41it/s]

Total input tokens: 1663055
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  80%|████████  | 80000/100000 [6:37:47<1:37:42,  3.41it/s]

Total input tokens: 1663122
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation:  90%|█████████ | 90000/100000 [7:21:42<47:18,  3.52it/s]  

Total input tokens: 1663111
Batch status: validating
The input file is being validated. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch status: in_progress
The batch is currently being processed. Please wait...
Batch 

Batch Generation: 100%|██████████| 100000/100000 [8:07:07<00:00,  3.42it/s]


In [13]:
# 3. Format batch output
import re
import json
import os

from generate_stories import process_completion

def format_jsonl(input_files, output_file, input_files_nucleus=None):
    """Convert raw batch output files into one JSONL file of processed stories.

    Every input line is expected to be a batch result whose ``custom_id``
    embeds the generation parameters as a JSON object. The completion text is
    passed to ``process_completion`` and each returned item that has a
    ``story`` key is written as one JSON line, tagged with a ``nucleus`` flag.

    Args:
        input_files: Paths of batch output files; their stories get
            ``nucleus=False``.
        output_file: Destination path; must not exist yet.
        input_files_nucleus: Optional paths of batch output files whose
            stories get ``nucleus=True``. These are processed first.

    Raises:
        AssertionError: If ``output_file`` already exists.
    """
    assert not os.path.isfile(output_file), "output file already exists"

    # Pair every path with its flag explicitly. The previous index comparison
    # was inverted and flagged the non-nucleus files as nucleus.
    nucleus_files = list(input_files_nucleus or [])
    tagged_inputs = [(path, True) for path in nucleus_files]
    tagged_inputs += [(path, False) for path in input_files]

    for input_file, is_nucleus in tagged_inputs:
        with open(input_file, 'r') as infile, open(output_file, 'a') as outfile:
            for line in infile:
                if not line.strip():
                    # Tolerate blank lines in the input.
                    continue
                data = json.loads(line)

                # custom_id is "<index><json params>"; pull out the JSON part.
                match = re.search(r'{.*}', data['custom_id'])
                if not match:
                    continue
                params = json.loads(match.group(0))

                body = data['response']['body']
                completion = body['choices'][0]['message']['content']
                gen_model = body['model']

                # The key must match the one embedded in custom_id when the
                # requests were created; the old key never matched, so the
                # expected count was always None.
                json_struct = process_completion(
                    gen_model,
                    completion,
                    params,
                    expected_num_stories=params.get("expected_num_stories_in_completion", None),
                )
                lines = [
                    json.dumps(item | {"nucleus": is_nucleus})
                    for item in json_struct
                    if 'story' in item
                ]
                if lines:
                    # Skip the write when nothing qualified, so no empty
                    # line ends up in the output.
                    outfile.write("\n".join(lines) + "\n")

input_dir = os.path.join('data', 'batches_2024-09-03-11-24-05')
# Collect every .jsonl file directly inside input_dir. os.listdir returns
# entries in arbitrary order, so sort to make the output order reproducible.
input_files = sorted(
    os.path.join(input_dir, file)
    for file in os.listdir(input_dir)
    if file.endswith('.jsonl') and os.path.isfile(os.path.join(input_dir, file))
)
# Paths are built with os.path.join so they resolve on every platform, not
# only where "\" is the path separator.
input_files.append(os.path.join('data', 'batches_2024-09-02-19-54-39', 'batch_data_1.jsonl'))
output_file = os.path.join(input_dir, 'processed.jsonl')

format_jsonl(
    input_files,
    output_file,
    input_files_nucleus=[os.path.join('data', 'batches_2024-09-02-16-47-53', 'batch_data_1.jsonl')],
)

In [14]:
# 3.1: Optionally convert to parquet
import pandas as pd
import json

# Swap only the trailing ".jsonl" extension. str.replace would rewrite every
# ".jsonl" occurrence in the path and, when the suffix is missing, return the
# input path unchanged, so the parquet write would overwrite the source file.
parquet_file_path = output_file.removesuffix(".jsonl") + ".parquet"

df = pd.read_json(output_file, lines=True)
df.to_parquet(parquet_file_path, engine='pyarrow', compression='snappy')


File successfully converted to data\batches_2024-09-03-11-24-05\processed.parquetl


4. Proceed to either analyse_dataset.ipynb or embeddings.ipynb