In [3]:
import os
import re

from datasets import load_dataset
# Load the "train" split of the glaiveai/glaive-function-calling-v2 dataset.
dataset = load_dataset(
    "glaiveai/glaive-function-calling-v2",
    split="train",
)

# Speaker tags that open a turn inside the raw `chat` string.
GLAIVE_ROLES = ['USER', 'ASSISTANT', 'FUNCTION RESPONSE']

# Glaive speaker tag -> role name used in the converted conversation.
GLAIVE_TO_CONVERTED_ROLE = {
  'SYSTEM': 'system',
  'USER': 'user',
  'ASSISTANT': 'assistant',
  'FUNCTION RESPONSE': 'function',
}


# Turn delimiter: a speaker tag followed by a colon and a space, for example
# "USER: " or "FUNCTION RESPONSE: ". The tag sits in a capturing group, so
# splitting on this pattern keeps the matched tag in the result list.
split_re = re.compile(f"({'|'.join(GLAIVE_ROLES)}): ")


def _parse_chat(row: dict) -> dict:
  """Convert one Glaive row into a list of role-tagged conversation turns.

  Args:
    row: Mapping with a 'chat' string in which every turn starts with a
      speaker tag such as "USER: " or "FUNCTION RESPONSE: ", and an optional
      'system' string that may start with "SYSTEM: ".

  Returns:
    A dict ``{"conversation": turns}``. Each turn is
    ``{'from': <converted role>, 'value': <text>}`` with surrounding
    whitespace removed from the text. A non-empty system prompt is placed
    first with role 'system'.

  Raises:
    ValueError: If 'chat' has non-whitespace text before the first speaker tag.
  """
  system_prompt = row.get('system')
  # Remove "SYSTEM: " from the beginning of the prompt.
  if system_prompt:
    system_prompt = system_prompt.removeprefix('SYSTEM: ')

  # split_re has a capturing group, so the split result is
  #   [preamble, role_1, text_1, role_2, text_2, ...]
  # for example
  #   ['', 'USER', 'Can you book a flight for me ...?', 'ASSISTANT', '...']
  # Only the leading preamble is peeled off. Discarding every empty fragment
  # instead would shift a role tag into the value slot whenever a turn has no
  # text, corrupting every later pair.
  preamble, *fragments = split_re.split(row['chat'])
  if preamble.strip():
    raise ValueError(
      'chat has text before the first role tag: {!r}'.format(preamble[:80]))

  # Pair each role with the text that follows it, e.g.
  # {'from': 'user', 'value': 'Can you book a flight...'}
  chats = [
    {'from': GLAIVE_TO_CONVERTED_ROLE[role], 'value': value.strip()}
    for role, value in zip(fragments[::2], fragments[1::2])
  ]

  if system_prompt:
    chats = [{'from': GLAIVE_TO_CONVERTED_ROLE['SYSTEM'], 'value': system_prompt}] + chats

  return {
    "conversation": chats,
  }


# Run _parse_chat over every row; the returned 'conversation' key becomes a new column.
res = dataset.map(function=_parse_chat)

Map: 100%|██████████| 112960/112960 [00:03<00:00, 33537.95 examples/s]


In [5]:
# Display the mapped dataset's summary (feature names and row count).
res

Dataset({
    features: ['system', 'chat', 'conversation'],
    num_rows: 112960
})

In [10]:
# Publish the converted dataset to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is ever written into the notebook. If the variable is unset,
# None is passed and the library falls back to its own credential lookup
# (e.g. a token saved by a prior Hub login).
res.push_to_hub(
    'dinhdat1110/glaive-function-calling-v2-cleaned',
    token=os.environ.get('HF_TOKEN'),
)

Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 79.26ba/s]
Creating parquet from Arrow format: 100%|██████████| 57/57 [00:00<00:00, 85.26ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:26<00:00, 13.36s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/dinhdat1110/glaive-function-calling-v2-cleaned/commit/a5d1f29b536c1376480d6cb36ca69fbe8db90d8b', commit_message='Upload dataset', commit_description='', oid='a5d1f29b536c1376480d6cb36ca69fbe8db90d8b', pr_url=None, pr_revision=None, pr_num=None)