In [1]:
# Load the benchmark datasets. The files are read as JSON Lines: one JSON
# object per line, each line becoming one sample.
import json
dataset_paths = {
    "multi_turn_long_context": "data/BFCL_v3_multi_turn_long_context.json",
}
datasets = {}
for dataset_name, dataset_path in dataset_paths.items():
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(dataset_path, "r", encoding="utf-8") as f:
        # Blank lines carry no sample; skip them rather than let json.loads raise.
        dataset = [json.loads(line) for line in f if line.strip()]
    datasets[dataset_name] = dataset
    print(f"Loaded {dataset_name} dataset with {len(dataset)} samples")

Loaded multi_turn_long_context dataset with 200 samples


In [2]:
# Keep the first row of every loaded dataset at hand for quick inspection
onerows = {name: rows[0] for name, rows in datasets.items()}

onerow_multi_turn_long_context = onerows["multi_turn_long_context"]

In [3]:
# Load the function docs of every domain and count the tools each one exposes.
# Each doc file is read as JSON Lines: one tool/function description per line.
domains = {
    "GorillaFileSystem": "gorilla_file_system.json",
    "MathAPI": "math_api.json",
    "MessageAPI": "message_api.json",
    "TwitterAPI": "posting_api.json",
    "TicketAPI": "ticket_api.json",
    "TradingBot": "trading_bot.json",
    "TravelAPI": "travel_booking.json",
    "VehicleControlAPI": "vehicle_control.json",
}
tools_base_path = "data/multi_turn_func_doc/"
tools_per_domain = {}      # domain name -> list of parsed tool descriptions
num_tools_per_domain = {}  # domain name -> number of tools in that domain
for domain_name, domain_file in domains.items():
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(f"{tools_base_path}{domain_file}", "r", encoding="utf-8") as f:
        # Blank lines carry no tool; skip them rather than let json.loads raise.
        tools = [json.loads(line) for line in f if line.strip()]
    tools_per_domain[domain_name] = tools
    num_tools_per_domain[domain_name] = len(tools)
    print(f"Domain {domain_name} has {len(tools)} tools/functions")

Domain GorillaFileSystem has 18 tools/functions
Domain MathAPI has 17 tools/functions
Domain MessageAPI has 10 tools/functions
Domain TwitterAPI has 14 tools/functions
Domain TicketAPI has 9 tools/functions
Domain TradingBot has 22 tools/functions
Domain TravelAPI has 17 tools/functions
Domain VehicleControlAPI has 22 tools/functions


In [4]:
def get_dataset_stats(dataset):
    """Summarise a dataset: per-sample counts plus their average, maximum and minimum.

    Args:
        dataset: List of sample dicts. Every sample is read through its
            "question", "involved_classes" and "path" entries.

    Returns:
        dict containing
          * "keys": the keys of the first sample (empty for an empty dataset),
          * "num_samples": the number of samples,
          * "num_messages", "num_involved_domains", "num_tool_calls" and
            "num_tools_available": one value per sample,
          * "avg_<name>", "max_<name>" and "min_<name>" for each of those four
            lists (all 0 when the dataset is empty).

    Side effects:
        Stores "num_tools_available" on every sample of ``dataset`` (the sum of
        ``num_tools_per_domain`` over the sample's "involved_classes"; domains
        missing from that table count as 0). The input samples are therefore
        modified in place.
    """

    def _add_summary(stats, name, values):
        # avg/max/min of one per-sample list; an empty dataset yields 0 for all three.
        stats[f"avg_{name}"] = sum(values) / len(values) if values else 0
        stats[f"max_{name}"] = max(values) if values else 0
        stats[f"min_{name}"] = min(values) if values else 0

    # Compute each per-sample list once and reuse it for the aggregates.
    per_sample_counts = {
        "num_messages": [len(sample["question"]) for sample in dataset],
        "num_involved_domains": [len(sample["involved_classes"]) for sample in dataset],
        "num_tool_calls": [len(sample["path"]) for sample in dataset],
    }

    dataset_stats = {
        # Guarded like every other statistic so an empty dataset does not raise.
        "keys": dataset[0].keys() if dataset else {}.keys(),
        "num_samples": len(dataset),
    }
    dataset_stats.update(per_sample_counts)
    for name, values in per_sample_counts.items():
        _add_summary(dataset_stats, name, values)

    # Get number of tools available per sample (written back onto the sample)
    for sample in dataset:
        sample["num_tools_available"] = sum(
            num_tools_per_domain.get(domain_name, 0) for domain_name in sample["involved_classes"]
        )
    tools_available = [sample["num_tools_available"] for sample in dataset]
    dataset_stats["num_tools_available"] = tools_available
    _add_summary(dataset_stats, "num_tools_available", tools_available)

    return dataset_stats

In [5]:
import numpy as np
from copy import deepcopy

def extend_dataset_double_available_tools(dataset, new_tag="extended_double", tools_by_domain=None, rng=None):
    """Return a copy of ``dataset`` whose samples see roughly twice as many tools.

    For every sample the number of available tools is the total size of the
    domains listed in "involved_classes". Extra domains, picked at random among
    the domains not yet involved, are appended until that number reaches twice
    its original value. A picked domain is skipped when adding it would not
    bring the total closer to the target. If no unused domain is small enough
    to be a candidate, the smallest unused domain is added instead.

    Each returned sample also gets
      * "id" rewritten to "<base name>_<new_tag>_<last '_'-separated part of the old id>",
        where the base name is taken from the first sample's id,
      * "involved_classes_original": a copy of the domains before extension.

    Args:
        dataset: List of sample dicts with "id" and "involved_classes". Not modified.
        new_tag: Tag inserted into the new sample ids.
        tools_by_domain: Mapping domain name -> list of tools. Defaults to the
            notebook-level ``tools_per_domain`` table.
        rng: Optional object with a ``choice(sequence)`` method (for example a
            ``numpy.random.Generator``) used to pick domains. Defaults to the
            global ``np.random.choice``.

    Returns:
        The extended deep copy of ``dataset`` (an empty list for an empty dataset).

    Raises:
        AssertionError: if a sample ends up listing the same domain twice.
    """
    if tools_by_domain is None:
        tools_by_domain = tools_per_domain
    tool_counts = {name: len(tools) for name, tools in tools_by_domain.items()}
    choose = np.random.choice if rng is None else rng.choice

    extended_dataset = deepcopy(dataset)

    # An empty dataset has no first id to derive the base name from.
    prev_name = "_".join(str(dataset[0]["id"]).split("_")[:-1]) if dataset else ""
    print(f"Extending dataset with tag {new_tag} and previous name {prev_name}")
    new_tagged_name = f"{prev_name}_{new_tag}" if prev_name else new_tag

    for sample_index, sample in enumerate(extended_dataset):
        # Change sample id name
        id_suffix = str(sample["id"]).split("_")[-1]
        sample["id"] = f"{new_tagged_name}_{id_suffix}"
        sample["involved_classes_original"] = deepcopy(sample["involved_classes"])

        # Tools currently available to this sample, and the doubled target
        num_tools_in_sample = sum(tool_counts.get(domain_name, 0) for domain_name in sample["involved_classes"])
        target_num_tools = 2 * num_tools_in_sample

        print(f"Sample {sample_index} from {num_tools_in_sample} tools (with domains {sample['involved_classes']})", end=" ")

        # Candidates: unused domains that are not larger than the target itself
        unused_domains = [domain_name for domain_name in tool_counts if domain_name not in sample["involved_classes"]]
        available_domains = [domain_name for domain_name in unused_domains if tool_counts[domain_name] <= target_num_tools]
        if not available_domains:
            print("No more domains available")
            # Fall back to the smallest unused domain. When every domain is
            # already involved there is nothing left to add.
            if unused_domains:
                min_tools_domain = min(unused_domains, key=lambda domain_name: tool_counts[domain_name])
                sample["involved_classes"].append(min_tools_domain)
                num_tools_in_sample += tool_counts[min_tools_domain]
            # available_domains stays empty, so the loop below cannot append
            # the fallback domain a second time.

        # Add domains while the number of tools is below the target
        while num_tools_in_sample < target_num_tools and available_domains:
            domain_name = str(choose(available_domains))
            # Whether it is accepted or rejected, a domain is only considered once
            available_domains.remove(domain_name)
            # Skip the domain if adding it would not bring the total closer to the target
            curr_distance = abs(num_tools_in_sample - target_num_tools)
            future_distance = abs(num_tools_in_sample + tool_counts[domain_name] - target_num_tools)
            if future_distance >= curr_distance:
                continue
            sample["involved_classes"].append(domain_name)
            num_tools_in_sample += tool_counts[domain_name]

        print(f"to {num_tools_in_sample} ({num_tools_in_sample - target_num_tools} far from target)", end=" ")
        print(f"has now domains {sample['involved_classes']}")

    # Check that there are not repeated "involved_classes"
    for sample in extended_dataset:
        assert len(sample["involved_classes"]) == len(set(sample["involved_classes"])), (
            f"Sample {sample['id']} lists a domain more than once: {sample['involved_classes']}"
        )

    print(f"Extended dataset has {len(extended_dataset)} samples")

    return extended_dataset

In [6]:
def extend_dataset_full_available_tools(dataset, new_tag="extended_full", target_num_tools=128, tools_by_domain=None, rng=None):
    """Return a copy of ``dataset`` whose samples are padded with extra domains.

    For every sample, domains not yet listed in "involved_classes" are appended
    in random order until the total number of tools of the involved domains
    reaches ``target_num_tools`` or no unused domain is left.

    Each returned sample also gets
      * "id" rewritten to "<base name>_<new_tag>_<last '_'-separated part of the old id>",
        where the base name is taken from the first sample's id,
      * "involved_classes_original": a copy of the domains before extension.

    Args:
        dataset: List of sample dicts with "id" and "involved_classes". Not modified.
        new_tag: Tag inserted into the new sample ids.
        target_num_tools: Number of tools at which no further domain is added.
        tools_by_domain: Mapping domain name -> list of tools. Defaults to the
            notebook-level ``tools_per_domain`` table.
        rng: Optional object with a ``choice(sequence)`` method (for example a
            ``numpy.random.Generator``) used to pick domains. Defaults to the
            global ``np.random.choice``.

    Returns:
        The extended deep copy of ``dataset`` (an empty list for an empty dataset).

    Raises:
        AssertionError: if a sample ends up listing the same domain twice.
    """
    if tools_by_domain is None:
        tools_by_domain = tools_per_domain
    choose = np.random.choice if rng is None else rng.choice

    extended_dataset = deepcopy(dataset)

    # An empty dataset has no first id to derive the base name from.
    prev_name = "_".join(str(dataset[0]["id"]).split("_")[:-1]) if dataset else ""
    print(f"Extending dataset with tag {new_tag} and previous name {prev_name}")
    new_tagged_name = f"{prev_name}_{new_tag}" if prev_name else new_tag

    for sample_index, sample in enumerate(extended_dataset):
        # Change sample id name
        id_suffix = str(sample["id"]).split("_")[-1]
        sample["id"] = f"{new_tagged_name}_{id_suffix}"
        sample["involved_classes_original"] = deepcopy(sample["involved_classes"])

        # Count the TOOLS of the involved domains, not the number of domains;
        # the target is expressed in tools.
        num_tools_in_sample = sum(len(tools_by_domain.get(domain_name, [])) for domain_name in sample["involved_classes"])
        # Candidates in table order (not set order) so that a seeded rng gives
        # the same picks on every run.
        available_domains = [domain_name for domain_name in tools_by_domain if domain_name not in sample["involved_classes"]]

        print(f"Sample {sample_index} from {num_tools_in_sample} tools (with domains {sample['involved_classes']})", end=" ")

        # Add domains while the number of tools is below the target
        while num_tools_in_sample < target_num_tools and available_domains:
            domain_name = str(choose(available_domains))
            available_domains.remove(domain_name)
            sample["involved_classes"].append(domain_name)
            num_tools_in_sample += len(tools_by_domain[domain_name])

        print(f"to {num_tools_in_sample} ({num_tools_in_sample - target_num_tools} far from target)", end=" ")
        print(f"has now domains {sample['involved_classes']}")

    # Check that there are not repeated "involved_classes"
    for sample in extended_dataset:
        assert len(sample["involved_classes"]) == len(set(sample["involved_classes"])), (
            f"Sample {sample['id']} lists a domain more than once: {sample['involved_classes']}"
        )

    print(f"Extended dataset has {len(extended_dataset)} samples")

    return extended_dataset

In [7]:
# The extension picks extra domains at random through NumPy's global RNG.
# Seed it so that re-running this cell draws from the same random stream.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Extend the dataset
extended_double_available_tools_dataset = extend_dataset_double_available_tools(datasets["multi_turn_long_context"])
extended_full_available_tools_dataset = extend_dataset_full_available_tools(datasets["multi_turn_long_context"])

# Get stats of base and extended datasets.
# NOTE: get_dataset_stats stores "num_tools_available" on every sample, so the
# rows saved below carry that field as well.
base_dataset_stats = get_dataset_stats(datasets["multi_turn_long_context"])
extended_double_dataset_stats = get_dataset_stats(extended_double_available_tools_dataset)
extended_full_dataset_stats = get_dataset_stats(extended_full_available_tools_dataset)

print("Base dataset stats:")
print(base_dataset_stats)
print("Extended double dataset stats:")
print(extended_double_dataset_stats)
print("Extended full dataset stats:")
print(extended_full_dataset_stats)

# Save extended datasets as JSON Lines (one sample per line)
extended_double_dataset_path = "data/BFCL_v3_multi_turn_long_context_extended_double.json"
extended_full_dataset_path = "data/BFCL_v3_multi_turn_long_context_extended_full.json"
for output_path, output_dataset in (
    (extended_double_dataset_path, extended_double_available_tools_dataset),  # Double
    (extended_full_dataset_path, extended_full_available_tools_dataset),      # Full
):
    with open(output_path, "w", encoding="utf-8") as f:
        for sample in output_dataset:
            f.write(json.dumps(sample) + "\n")

Extending dataset with tag extended_double and previous name multi_turn_long_context
Sample 0 from 32 tools (with domains ['TwitterAPI', 'GorillaFileSystem']) to 68 (4 far from target) has now domains ['TwitterAPI', 'GorillaFileSystem', 'TravelAPI', 'MessageAPI', 'TicketAPI']
Sample 1 from 18 tools (with domains ['GorillaFileSystem']) to 40 (4 far from target) has now domains ['GorillaFileSystem', 'VehicleControlAPI']
Sample 2 from 27 tools (with domains ['TicketAPI', 'GorillaFileSystem']) to 61 (7 far from target) has now domains ['TicketAPI', 'GorillaFileSystem', 'TravelAPI', 'MathAPI']
Sample 3 from 18 tools (with domains ['GorillaFileSystem']) to 40 (4 far from target) has now domains ['GorillaFileSystem', 'VehicleControlAPI']
Sample 4 from 32 tools (with domains ['TwitterAPI', 'GorillaFileSystem']) to 68 (4 far from target) has now domains ['TwitterAPI', 'GorillaFileSystem', 'MathAPI', 'MessageAPI', 'TicketAPI']
Sample 5 from 32 tools (with domains ['TwitterAPI', 'GorillaFileSyste

In [8]:
import json
from copy import deepcopy

# Generate the possible answer file for each extended dataset.
# The answers themselves are unchanged; only the sample ids are rewritten so
# that they follow the id scheme used for the extended datasets.
possible_answers = {}
possible_answers["multi_turn_long_context"] = "data/possible_answer/BFCL_v3_multi_turn_long_context.json"
possible_answers["multi_turn_long_context_extended_double"] = "data/possible_answer/BFCL_v3_multi_turn_long_context_extended_double.json"
possible_answers["multi_turn_long_context_extended_full"] = "data/possible_answer/BFCL_v3_multi_turn_long_context_extended_full.json"

# Read as UTF-8 explicitly; blank lines carry no record and are skipped.
with open(possible_answers["multi_turn_long_context"], "r", encoding="utf-8") as f:
    possible_answers_base = [json.loads(line) for line in f if line.strip()]

# Base name shared by all ids: the first dataset id without its trailing index
prev_name = str(datasets["multi_turn_long_context"][0]["id"]).split("_")[:-1]
prev_name = "_".join(prev_name)

# Same procedure for the DOUBLE and the FULL variant
for variant_tag in ("extended_double", "extended_full"):
    # Built like the ids of the extended datasets: no leading "_" when the base name is empty
    tagged_name = f"{prev_name}_{variant_tag}" if prev_name else variant_tag

    extended_possible_answers = deepcopy(possible_answers_base)
    for sample in extended_possible_answers:
        sample["id"] = f"{tagged_name}_{sample['id'].split('_')[-1]}"

    with open(possible_answers[f"multi_turn_long_context_{variant_tag}"], "w", encoding="utf-8") as f:
        for sample in extended_possible_answers:
            f.write(json.dumps(sample) + "\n")