In [None]:
# Copyright (C) 2024  Edion Management Systems

### Prepare example file

In [None]:
import os

# Directory that holds the fine-tuning dataset files.
dataset_dir = "fine_tune_dataset"
# exist_ok=True keeps the cell idempotent (safe to re-run) and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dataset_dir, exist_ok=True)

In [2]:
import json

# Read the hand-written conversation examples that seed the synthetic generation.
with open("conversation_examples.json", 'r') as examples_file:
    conversations = json.loads(examples_file.read())

### Send Batch API requests (faster and 50% cheaper than single requests) to create synthetic conversations

Because of the maximum token limit (2,000,000), we divide the requests into batches; each batch contains 600 requests (3,000 max tokens × 600 requests = 1,800,000).

In [122]:
num_batches = 10 # We want a dataset of size 600 x 10 = 6,000

In [None]:
# Complete progress
from openai import OpenAI
from prompt import *
import time


# SECURITY: never hard-code the API key in the notebook. The key is read from
# the environment instead; any key that was previously committed in this cell
# must be treated as leaked and revoked.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
client = OpenAI(api_key=OPENAI_API_KEY)


def create_batch_file(index, repeats=100):
    """
    Create a batch file that contains json objects, each json object is one
    Chat Completions api request.

    Every example in the module-level ``conversations`` list is used
    ``repeats`` times, so the file holds ``repeats * len(conversations)``
    requests. The prompts come from the module-level ``system_prompt`` and
    ``conversation_sys_prompt``.

    Args:
        index: i'th batch; used as the suffix of the file name
        repeats: how many times the whole set of examples is repeated
            (default 100, the value that used to be hard-coded)
    Returns:
        The path of the saved jsonl file
    """
    import os  # local import keeps this cell self-contained

    batch_file = f'batch_request/batch_synthetic{index}.jsonl'
    # No visible cell creates this directory, so make sure it exists before writing.
    os.makedirs(os.path.dirname(batch_file), exist_ok=True)

    with open(batch_file, 'w') as f:
        count = 0  # running request number across all repeats, used in custom_id
        for _ in range(repeats):
            for i, c in enumerate(conversations):
                conversation_str = str(c)
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"{conversation_sys_prompt}\n{conversation_str}"}
                ]

                task = {
                    # custom_id records which example the request was derived from
                    "custom_id": f"synthetic{count}_from_example{i}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        # Chat Completions API call
                        "model": "gpt-4o-mini",
                        "temperature": 1,
                        "max_tokens": 3000,
                        "messages": messages
                    }
                }

                f.write(json.dumps(task) + '\n')
                count += 1
    return batch_file


def submit_and_retrival(client, count=None, batch_file=None):
    """
    Submit a batch api request, wait until it finishes and save the results.

    Args:
        client: OpenAI client
        count: i'th batch; used in the batch description and as the suffix of
            the output file names
        batch_file: a jsonl file contains all batch requests
    Returns:
        None. Two files are written into ``batch_request/``: the raw batch
        output (``batch_synthetic_output{count}.txt``) and the parsed synthetic
        conversations (``batch_synthetic_conversation{count}.jsonl``).
    Raises:
        TypeError: if no batch file is given.
        RuntimeError: if the batch ends as failed/expired/cancelled, or
            completes without an output file.

    The older two-argument form ``submit_and_retrival(client, batch_file)`` is
    still accepted; the batch index is then taken from the digits right before
    ``.jsonl`` in the file name.
    """
    import ast
    import re

    if batch_file is None:
        # Legacy call style: the second positional argument is the batch file.
        batch_file, count = count, None
    if batch_file is None:
        raise TypeError("submit_and_retrival() missing required argument: 'batch_file'")
    if count is None:
        index_match = re.search(r'(\d+)\.jsonl$', str(batch_file))
        count = index_match.group(1) if index_match else ""

    # Upload batch file; the context manager closes the handle after the upload.
    with open(batch_file, "rb") as request_fp:
        batch_input_file = client.files.create(
            file=request_fp,
            purpose="batch"
        )

    batch_input_file_id = batch_input_file.id

    # Submit batch and start processing
    sent = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"create synthetic conversations {count}"
        }
    )

    batch_id = sent.to_dict()["id"]
    print(f"Batch: {batch_id} submited")

    # Poll until the batch reaches a terminal state. Without the failure check
    # a failed/expired/cancelled batch would keep this loop running forever.
    terminal_failures = {"failed", "expired", "cancelled"}
    while True:
        res = client.batches.retrieve(batch_id).to_dict()
        status = res["status"]
        if status == "completed":
            output_file_id = res.get("output_file_id")
            break
        if status in terminal_failures:
            raise RuntimeError(f"Batch {batch_id} ended with status '{status}': {res.get('errors')}")
        completed_counts = (res.get("request_counts") or {}).get("completed")
        print(f"Batch status: {status}, completed: {completed_counts}")
        time.sleep(30)

    print(f"Batch: {batch_id} completed")
    if not output_file_id:
        raise RuntimeError(
            f"Batch {batch_id} completed without an output file "
            f"(error_file_id={res.get('error_file_id')})"
        )

    raw_output_path = f"batch_request/batch_synthetic_output{count}.txt"
    conversation_path = f"batch_request/batch_synthetic_conversation{count}.jsonl"

    # Write raw results to file
    file_response = client.files.content(output_file_id)
    file_response.write_to_file(raw_output_path)

    with open(raw_output_path, "r") as f:
        lines = f.read().splitlines()

    # Write the actual synthetic conversations into jsonl file
    with open(conversation_path, "w") as f:
        for line in lines:
            try:
                text = json.loads(line)
                result = text["response"]["body"]["choices"][0]["message"]["content"]
            except (ValueError, KeyError, IndexError, TypeError):
                # Malformed line or a request without a usable response: report and skip.
                print(line)
                continue
            if not isinstance(result, str):
                print(line)
                continue
            # Strip the markdown fence and turn JSON null into Python None. The
            # replacement is intentionally global: it also rewrites nulls inside
            # the embedded message strings.
            clean_result = result.replace("```json\n", "").replace("\n```", "").replace("null", "None")
            try:
                # SECURITY: the text is model output, so it is parsed with
                # ast.literal_eval (literals only) instead of eval().
                f.write(json.dumps(ast.literal_eval(clean_result)) + '\n')
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal / not JSON-serialisable: show it for manual inspection.
                print(clean_result)

# Run every batch in turn: build its request file, then submit it and wait for the results
for i in range(num_batches):
    submit_and_retrival(client, i, batch_file=create_batch_file(i))

In [None]:
# List all submitted batches and check their status.
# to_dict() is the SDK helper already used elsewhere in this notebook; .dict()
# is the deprecated pydantic v1 spelling.
client.batches.list().to_dict()

### Combine all results as dataset in ShareGPT format

In [8]:
import glob
import json

# NOTE: this rebinds `conversations`, the name create_batch_file() reads its
# seed examples from; reload the examples before generating new batches.
conversations = []
# sorted() makes the combined order deterministic; glob returns files in arbitrary order.
for file in sorted(glob.glob('batch_request/batch_synthetic_conversation*.jsonl')):
    with open(file, "r") as f:
        # extend() appends in place instead of rebuilding the list for every file;
        # blank lines are skipped so they cannot break json.loads.
        conversations.extend(json.loads(line) for line in f if line.strip())

with open("batch_request/all_synthetic_conversations.jsonl", "w") as f:
    for conv in conversations:
        f.write(json.dumps(conv) + '\n')

In [6]:
# Number of synthetic conversations collected from the batch result files
len(conversations)

6119

### Process and clean data (For tool calling)

In [None]:
import re


from typing import Union


def generate_exercise(subjects: tuple, exercise_types: tuple, grade_levels: tuple, modifications: tuple, ready: bool) -> Union[tuple, str]:
    """
    Generate exercises based on user preference.

    Args:
        subjects: a tuple of exercise subjects for each exercise in order
        exercise_types: a tuple of exercise type for each exercise in order
        grade_levels: a tuple of grade levels for each exercise in order
        modifications: a tuple of modified information for each exercise in order
        ready: if the user gave all the parameters they can and it is ready to generate the exercises
    Returns:
        A tuple of exercises when ``ready`` is true, otherwise a message (string)
        asking for more information.
    Raises:
        AssertionError: if the four parameter tuples do not have the same length.
    """
    # Every exercise needs exactly one entry in each of the four tuples.
    assert len(subjects) == len(exercise_types) == len(grade_levels) == len(modifications), \
        "subjects, exercise_types, grade_levels and modifications must have the same length"
    if ready:
        # Placeholder result: this stub always returns the same three exercises.
        return ("exercise 1", "exercise 2", "exercise 3")
    else:
        return "Not ready yet, ask the user for more information."


def get_params(ori_params_group: tuple):
    """
    Merge a group of short-key parameter dicts into one dict with long names.

    Each entry is read through its 'S', 'T', 'G', 'M' and 'R' keys. The first
    four are collected, in order, into the 'subjects', 'exercise_types',
    'grade_levels' and 'modifications' tuples; 'ready' is the truthiness of the
    last entry's 'R' (False for an empty group).
    """
    subjects, exercise_types, grade_levels, modifications = [], [], [], []
    is_ready = False
    for entry in ori_params_group:
        subjects.append(entry['S'])
        exercise_types.append(entry['T'])
        grade_levels.append(entry['G'])
        modifications.append(entry['M'])
        is_ready = bool(entry['R'])  # the last entry decides readiness
    return {
        'subjects': tuple(subjects),
        'exercise_types': tuple(exercise_types),
        'grade_levels': tuple(grade_levels),
        'modifications': tuple(modifications),
        'ready': is_ready,
    }


def find_params(msg):
    """
    Extract the text carried inside <E>...</E> tags of a message.

    When no closed <E>...</E> pair is found, everything after an unclosed <E>
    is taken instead. Only the first match is cleaned: 'false'/'true' become
    'False'/'True'. Returns the list of matches (empty if there is none).
    """
    matches = re.findall(r'<E>(.*?)</E>', msg, re.DOTALL) or re.findall(r'<E>(.*)', msg, re.DOTALL)
    if matches:  # Clean data
        first = matches[0]
        matches[0] = first.replace('false', 'False').replace('true', 'True')
    return matches


def process_one_sameple(sample, only_multi_params=False, only_single_params=False):
    """
    Process a conversation sample, turn it into tool calling openai format, and clean data at the same time.

    Args:
        sample: dict with a "conversations" list; every dialogue has a 'from'
            ("gpt" or "human") and a 'value' that may carry <U>..</U> (text for
            the user) and/or <E>..</E> (exercise parameters or exercise text)
        only_multi_params: keep the conversation only if some gpt turn carries a tuple of parameter sets
        only_single_params: keep the conversation only if no gpt turn carries a tuple of parameter sets
    Returns:
        The list of converted messages, or [] when the conversation is dirty
        or filtered out by the flags.
    Raises:
        AssertionError: if both ``only_multi_params`` and ``only_single_params`` are True.
    """
    import ast  # literal-only parser, used instead of eval()

    # Validate the flags up front so that misuse is reported for every sample,
    # not only for the ones that survive cleaning.
    assert not (only_multi_params and only_single_params), "Can't be True at the same time"

    multi_params_flag = False
    new_conversation = []
    for dialogue in sample["conversations"]:
        new_dict = {}
        value = dialogue['value']
        if not isinstance(value, str):
            print("ERROR -- Message value is not a string", dialogue)
            return []  # exclude dirty conversation

        params_dict = find_params(value)  # Check if carrying parameters
        if dialogue['from'] == "gpt":
            new_dict['role'] = 'assistant'
            # Check if carrying a response to the user; '<\/U>' is the JSON-escaped closing tag
            old_msg = re.findall(r'<U>(.*?)</U>', value.replace('<\\/U>', '</U>'), re.DOTALL)
            if old_msg:  # When gpt carries a response to the user normally
                old_msg = old_msg[0]
                if params_dict:  # If gpt responds with parameters
                    try:
                        # SECURITY: the text is model-generated, so it is parsed as a
                        # literal only instead of being executed with eval().
                        params = ast.literal_eval(params_dict[0])
                    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                        # Clean data where <E></E> holds exercise text instead of parameters
                        print("ERROR -- Response to user in <E></E>: ", dialogue)
                        return []  # exclude dirty conversation

                    if isinstance(params, dict):  # When it comes with only one params set
                        params_group = (params,)
                    elif isinstance(params, tuple):  # When it is a tuple of params
                        print("Multiple params *** ", dialogue)
                        multi_params_flag = True
                        params_group = params
                    else:
                        # Only dict and tuple are considered; anything else marks the whole conversation as dirty
                        print('ERROR -- Incorrect params 1: ', params)
                        return []

                    try:
                        new_params = get_params(params_group)
                    except (KeyError, TypeError):
                        # A params set with a missing key (or a non-dict entry) is dirty
                        # data too; drop the sample instead of aborting the whole run.
                        print('ERROR -- Incorrect params 1: ', params)
                        return []

                    if new_params['ready']:  # When ready
                        tool_call = {"name": "generate_exercise", "arguments": {**new_params}}
                        new_dict['tool_calls'] = [{"type": "function", "function": tool_call}]
                    else:  # Still need more parameters
                        new_dict['content'] = old_msg
                else:  # When gpt presents the exercise to the user
                    new_dict['content'] = old_msg

            else:  # When 'gpt' accidentally carries <E> exercise </E>, correct it to be 'tool' (data cleaning)
                if not params_dict:
                    print("ERROR -- Incorrect params 2", dialogue)
                    return []
                new_dict['role'] = 'tool'
                new_dict['name'] = 'generate_exercise'
                # Kept as a one-element tuple (serialised by json as a list), as before
                new_dict['content'] = (params_dict[0],)

        elif dialogue['from'] == "human":
            if params_dict:  # If human is the expert responding with the generated exercise
                new_dict['role'] = 'tool'
                new_dict['name'] = 'generate_exercise'
                new_dict['content'] = (params_dict[0],)
            else:  # If human responds to gpt's answer
                new_dict['role'] = 'user'
                user_msg = re.findall(r'<U>(.*?)</U>', value.replace('<\\/U>', '</U>'), re.DOTALL)
                if not user_msg:
                    print("ERROR -- Message without <U></U>", dialogue)
                    return []
                new_dict['content'] = user_msg[0]

        # NOTE: a dialogue whose 'from' is neither "gpt" nor "human" is appended as an empty dict
        new_conversation.append(new_dict)

    if only_multi_params and not multi_params_flag:
        return []
    if only_single_params and multi_params_flag:
        return []
    return new_conversation

In [None]:
"""Reformat and clean the data"""
import json

# Collect, from result files 1..13, only the conversations in which the
# assistant produced several parameter sets at once.
multi_params_conversations = []
for i in range(1, 14):
    with open(f"batch_request/batch_synthetic_conversation{i}.jsonl", "r") as f:
        samples = list(map(json.loads, f.read().splitlines()))

    for sample in samples:
        new_conversation = process_one_sameple(sample, only_multi_params=True, only_single_params=False)
        if not new_conversation:
            # dirty or single-params conversation: skip it
            continue
        multi_params_conversations.append({"messages": new_conversation})

In [None]:
"""Reformat and clean the data"""
import json
# Only result file 13 is used for the single-params conversations.
with open("batch_request/batch_synthetic_conversation13.jsonl", "r") as f:
    samples = list(map(json.loads, f.read().splitlines()))

single_params_conversations = []
for sample in samples:
    new_conversation = process_one_sameple(sample, only_multi_params=False, only_single_params=True)
    if not new_conversation:
        # dirty or multi-params conversation: skip it
        continue
    single_params_conversations.append({"messages": new_conversation})

In [None]:
import random

# Merge both subsets into the final fine-tuning dataset.
final_conversations = single_params_conversations + multi_params_conversations

# A dedicated, seeded generator makes the shuffled order reproducible across
# kernel restarts without touching the global random state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(final_conversations)

with open("batch_request/synthetic_conversations13+multi_params.json", "w") as f:
    json.dump(final_conversations, f, indent=2)

# Manually clean the dataset
1. Search for "two more", "three more", "some more" and "two additional", and check that the number of parameters is correct
2. Replace every quoted number such as "1" with the bare number 1; apply this rule to all numbers from 1 to 11
3. Replace all <U> with ""
4. Find more issues based on testing (see what kinds of mistakes the model makes)