In [None]:
# Copyright (C) 2024  Edion Management Systems

### Prepare example file

In [None]:
import os

# Directory that will hold the fine-tuning dataset files.
dataset_dir = "fine_tune_dataset"
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
# of testing os.path.exists() before calling makedirs().
os.makedirs(dataset_dir, exist_ok=True)

In [2]:
import json

# Load the example conversations; create_batch_file() later turns each of them into a request.
# The encoding is pinned to UTF-8 so the file is read the same way on every platform
# instead of depending on the locale's default encoding.
with open("conversation_examples.json", 'r', encoding="utf-8") as file:
    conversations = json.load(file)

### Send Batch API requests (faster and 50% cheaper than individual requests) to create synthetic conversations

Because of the maximum token limit (2,000,000), we divide the requests into batches. Each batch contains 600 requests, each capped at 3,000 output tokens (3,000 × 600 = 1,800,000).

In [122]:
num_batches = 10 # Number of batches to submit; with 600 requests per batch this targets 600 x 10 = 6,000 conversations

In [None]:
# Complete progress
import ast
import os
import time

from openai import OpenAI
from prompt import *


# Never embed the API key in the notebook: anything saved in this file is readable by
# everyone who can see the notebook. The key is read from the OPENAI_API_KEY environment
# variable instead.
# NOTE: the key that used to be hardcoded on this line must be treated as leaked and rotated.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")
client = OpenAI(api_key=OPENAI_API_KEY)


def create_batch_file(index, repeats=100):
    """
    Create a batch input file in which every line is one JSON-encoded API request.

    Every entry of the module-level ``conversations`` list is wrapped in a Chat
    Completions request (using ``system_prompt`` and ``conversation_sys_prompt``),
    and the whole list is repeated ``repeats`` times.

    Args:
        index: i'th batch; it is embedded in the name of the generated file.
        repeats: how many times the full list of examples is repeated. The default
            of 100 keeps the original behaviour (the notebook uses 6 examples,
            i.e. 600 requests per batch).
    Returns:
        The path of the saved jsonl file.
    """
    batch_file = f'batch_request/batch_synthetic{index}.jsonl'
    # open() does not create missing directories, so make sure the target folder exists.
    os.makedirs(os.path.dirname(batch_file), exist_ok=True)

    with open(batch_file, 'w', encoding='utf-8') as f:
        # Running request number across all repeats; it keeps every custom_id unique.
        count = 0
        for _ in range(repeats):
            for i, c in enumerate(conversations):
                # The example is sent in its Python repr form, appended to the instruction prompt.
                conversation_str = str(c)
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"{conversation_sys_prompt}\n{conversation_str}"}
                ]

                task = {
                    "custom_id": f"synthetic{count}_from_example{i}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        # Chat Completions API call
                        "model": "gpt-4o-mini",
                        "temperature": 1,
                        "max_tokens": 3000,
                        "messages": messages
                    }
                }

                f.write(json.dumps(task) + '\n')
                count += 1
    return batch_file


def _parse_synthetic_reply(reply):
    """Convert one model reply into a Python object; raise ValueError if it cannot be parsed."""
    # Strip the markdown code fence the model may wrap around its answer.
    cleaned = reply.replace("```json\n", "").replace("\n```", "")
    try:
        # Proper JSON is parsed as JSON, so null/true/false are handled correctly and
        # the word "null" inside a string value is left untouched.
        return json.loads(cleaned)
    except ValueError:
        pass
    try:
        # Fallback for replies written as Python literals (single quotes, None).
        # ast.literal_eval only accepts literals; the previous eval() would have executed
        # arbitrary expressions coming from the model output.
        return ast.literal_eval(cleaned.replace("null", "None"))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as exc:
        raise ValueError("reply is neither JSON nor a Python literal") from exc


def submit_and_retrival(client, count, batch_file, poll_interval=30):
    """
    Submit a batch api request, wait for it to finish and save its results.

    Args:
        client: OpenAI client
        count: i'th batch; used in the batch description and in the output file names
        batch_file: a jsonl file that contains all batch requests
        poll_interval: seconds to wait between two status checks
    Returns:
        The path of the jsonl file holding the parsed synthetic conversations. The raw
        batch output is saved next to it as ``batch_synthetic_output{count}.txt``.
    Raises:
        RuntimeError: if the batch ends as failed/expired/cancelled, or completes
            without an output file.
    """
    # Upload batch file; the context manager closes the handle once the upload is done.
    with open(batch_file, "rb") as request_file:
        batch_input_file = client.files.create(
            file=request_file,
            purpose="batch"
        )

    # Submit batch and start processing
    sent = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"create synthetic conversations {count}"
        }
    )

    batch_id = sent.to_dict()["id"]
    print(f"Batch: {batch_id} submited")

    # Monitor the batch until it reaches a terminal state.
    while True:
        res = client.batches.retrieve(batch_id).to_dict()
        status = res["status"]
        if status == "completed":
            break
        # Without this check a batch that can never complete would be polled forever.
        if status in ("failed", "expired", "cancelled"):
            raise RuntimeError(
                f"Batch {batch_id} ended with status {status!r}: {res.get('errors')}"
            )
        completed_counts = (res.get("request_counts") or {}).get("completed")
        time.sleep(poll_interval)
        print(f"Batch status: {status}, completed: {completed_counts}")

    print(f"Batch: {batch_id} completed")

    output_file_id = res.get("output_file_id")
    if not output_file_id:
        raise RuntimeError(f"Batch {batch_id} completed without an output file")

    os.makedirs("batch_request", exist_ok=True)
    raw_output_path = f"batch_request/batch_synthetic_output{count}.txt"
    conversation_path = f"batch_request/batch_synthetic_conversation{count}.jsonl"

    # Write raw results to file
    client.files.content(output_file_id).write_to_file(raw_output_path)

    # Write the actual synthetic conversations into jsonl file
    with open(raw_output_path, "r", encoding="utf-8") as raw_f, \
            open(conversation_path, "w", encoding="utf-8") as out_f:
        for line in raw_f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
                result = record["response"]["body"]["choices"][0]["message"]["content"]
            except (ValueError, KeyError, IndexError, TypeError):
                # A request that failed inside the batch has no choices to extract.
                print(f"Skipping batch output line without a completion: {line}")
                continue
            try:
                out_line = json.dumps(_parse_synthetic_reply(result))
            except (ValueError, TypeError):
                # Unparseable reply: show it for manual inspection and move on.
                print(result)
                continue
            out_f.write(out_line + '\n')

    return conversation_path

# Do it in batches: for each batch index, write the request file and then submit it.
# submit_and_retrival polls the batch status before returning, so the batches are
# processed one after another rather than in parallel.
for i in range(num_batches):
    batch_file = create_batch_file(i)
    submit_and_retrival(client, i, batch_file=batch_file)

In [None]:
# List all submitted batches and check their status
# NOTE(review): other cells convert API objects with `.to_dict()`; `.dict()` here looks like
# the older spelling of the same idea — confirm and consider aligning.
client.batches.list().dict()

### Combine all results into a dataset in ShareGPT format

In [8]:
import glob
import json

# Gather the parsed conversations of every batch into a single list.
# NOTE: this rebinds `conversations`, which earlier in the notebook held the example
# conversations used to build the requests.
conversations = []
# glob returns matches in arbitrary order; sorting keeps the combined dataset reproducible.
for file in sorted(glob.glob('batch_request/batch_synthetic_conversation*.jsonl')):
    with open(file, "r", encoding="utf-8") as f:
        # extend() appends in place instead of rebuilding the whole list for every file.
        conversations.extend(
            json.loads(line) for line in f.read().splitlines() if line.strip()
        )

# Write one JSON object per line.
with open("batch_request/all_synthetic_conversations.jsonl", "w", encoding="utf-8") as f:
    for conv in conversations:
        f.write(json.dumps(conv) + '\n')

In [6]:
# Number of synthetic conversations collected from the per-batch files
len(conversations)

6119