In [2]:
import os
import json

# Load all functions from the multi turn dataset
# each file is a jsonl file with a list of functions
original_functions_path = "data/multi_turn_func_doc"
functions_dict = {}
for file in os.listdir(original_functions_path):
    file_path = os.path.join(original_functions_path, file)
    if os.path.isfile(file_path) and file.endswith('.json'):
        with open(file_path, 'r') as f:
            content = f.readlines()
            content = [json.loads(line) for line in content]
        functions_dict[file] = content
sample_function = content[0]
print("Sample function:")
print(json.dumps(sample_function, indent=2))

# Get a list with all functions together
all_original_functions = []
for file, functions in functions_dict.items():
    all_original_functions.extend(functions)
print(f"Total number of functions: {len(all_original_functions)}")

Sample function:
{
  "name": "authenticate_twitter",
  "description": "This tool belongs to the TwitterAPI, which provides core functionality for posting tweets, retweeting, commenting, and following users on Twitter. Tool description: Authenticate a user with username and password.",
  "parameters": {
    "type": "dict",
    "properties": {
      "username": {
        "type": "string",
        "description": "Username of the user."
      },
      "password": {
        "type": "string",
        "description": "Password of the user."
      }
    },
    "required": [
      "username",
      "password"
    ]
  },
  "response": {
    "type": "dict",
    "properties": {
      "authentication_status": {
        "type": "boolean",
        "description": "True if authenticated, False otherwise."
      }
    }
  }
}
Total number of functions: 129


In [3]:
# Iterate every json file (a dataset) in the data folder
# and get all the functions in each dataset entry
# and add them to a new_functions list
new_functions_dict = {}
for file in os.listdir("data"):
    file_path = os.path.join("data", file)
    if os.path.isfile(file_path) and file.endswith('.json'):
        # Skip files that have "multi_turn" in the name
        # and also the "chatable" dataset cause it doesn't have functions
        if "multi_turn" in file or "chatable" in file:
            continue
        with open(file_path, 'r') as f:
            content = f.readlines()
            content = [json.loads(line) for line in content]
        # Add each function in the "function" entry field to the file dict key
        new_functions = []
        for entry in content:
            if "function" in entry:
                new_functions.extend(entry["function"])
        new_functions_dict[file] = new_functions
        print(f"File {file} has {len(new_functions)} functions.")

# For each function file, check duplicates by comparing the function name
for file, functions in new_functions_dict.items():
    unique_function_names = set()
    unique_functions = []
    duplicates = 0
    for function in functions:
        if function["name"] in unique_function_names:
            duplicates += 1
        else:
            unique_function_names.add(function["name"])
            unique_functions.append(function)
    # Set the new functions list to the unique functions list
    new_functions_dict[file] = unique_functions
    print(f"File {file} now has {len(new_functions_dict[file])} functions. Had {duplicates} duplicates.")

# Get a list with all functions together
all_new_functions = []
for file, functions in new_functions_dict.items():
    all_new_functions.extend(functions)
print(f"Total number of functions: {len(all_new_functions)}")

# Sample function from the new dataset
new_sample_function = all_new_functions[0]
print("Sample function:")
print(json.dumps(new_sample_function, indent=2))

File BFCL_v3_javascript.json has 50 functions.
File BFCL_v3_live_multiple.json has 4178 functions.
File BFCL_v3_live_parallel.json has 18 functions.
File BFCL_v3_live_relevance.json has 50 functions.
File BFCL_v3_parallel_multiple.json has 520 functions.
File BFCL_v3_exec_multiple.json has 150 functions.
File BFCL_v3_exec_parallel.json has 50 functions.
File BFCL_v3_rest.json has 70 functions.
File BFCL_v3_simple.json has 400 functions.
File BFCL_v3_parallel.json has 200 functions.
File BFCL_v3_multiple.json has 557 functions.
File BFCL_v3_exec_simple.json has 100 functions.
File BFCL_v3_java.json has 100 functions.
File BFCL_v3_live_parallel_multiple.json has 95 functions.
File BFCL_v3_exec_parallel_multiple.json has 118 functions.
File BFCL_v3_live_simple.json has 258 functions.
File BFCL_v3_irrelevance.json has 240 functions.
File BFCL_v3_sql.json has 100 functions.
File BFCL_v3_live_irrelevance.json has 2060 functions.
File BFCL_v3_javascript.json now has 50 functions. Had 0 duplic

In [None]:
# Check if the sample_function has the same keys/structure as the new_sample_function
if sample_function.keys() == new_sample_function.keys():
    print("The keys are the same.")
else:
    print("The keys are different.")
    print("They match in the following keys:", set(sample_function.keys()).intersection(set(new_sample_function.keys())))
    print("They differ in the following keys:", set(sample_function.keys()).difference(set(new_sample_function.keys())))

# Check if all functions are unique by name
unique_function_names = set()
unique_functions = []
repeated_function_names = set()
repeated_functions = []
for function in all_new_functions + all_original_functions:
    if function["name"] in unique_function_names:
        repeated_function_names.add(function["name"])
        repeated_functions.append(function)
    else:
        unique_function_names.add(function["name"])
        unique_functions.append(function)

print(f"Total number of uniques by name: {len(unique_functions)}")
print(f"Total number of duplicates by name: {len(repeated_functions)}")

# Check if the repeated functions are the same in structure and arguments
all_equal_functions_names = []
all_equal_functions = []
all_different_functions_names = []
all_different_functions = []
for repeated_function_name in repeated_function_names:
    repeated_functions_by_name = [f for f in repeated_functions if f["name"] == repeated_function_name]
    # Check if their str representations are equal
    str_representations = [str(f) for f in repeated_functions_by_name]
    if len(set(str_representations)) == 1:
        # print(f"Function {repeated_function_name} has the same structure.")
        all_equal_functions_names.append(repeated_function_name)
        all_equal_functions.extend(repeated_functions_by_name)
    else:
        # print(f"Function {repeated_function_name} has different structures.")
        # for i, str_rep in enumerate(str_representations):
        #     print(f"Function {i}: {str_rep}")
        all_different_functions_names.append(repeated_function_name)
        all_different_functions.extend(repeated_functions_by_name)

print(f"Total number of name duplicate functions with the same structure: {len(all_equal_functions_names)}")
print(f"Total number of name duplicate functions with different structures: {len(all_different_functions_names)}")

# Now filter all the equal functions such that we keep only one instance of the function in the list and remove the rest
all_equal_functions_filtered = []
for repeated_function_name in all_equal_functions_names:
    repeated_functions_by_name = [f for f in all_equal_functions if f["name"] == repeated_function_name]
    all_equal_functions_filtered.append(repeated_functions_by_name[0])
print(f"Total number of unique functions with the same structure (filtered): {len(all_equal_functions_filtered)}")

# TODO filter out the functions that share the same name but different structure to keep at least one of them
# TODO filter out the functions in which their names when replacing dots to underscores are the same

# Print all names of the filtered list
function_names_before = [f["name"] for f in all_equal_functions]
function_names_after = [f["name"] for f in all_equal_functions_filtered]
print(f"Len function names before filtering: {len(function_names_before)}")
print(f"Len function names after filtering: {len(function_names_after)}")

# Create a global list of functions that includes all unique functions by name and all functions that are repeated in name but have the same structure
global_functions = unique_functions
print(f"Total number of global functions: {len(global_functions)}")

# Shuffle them
import random
random.seed(42)
random.shuffle(global_functions)
print("Shuffled global functions.")

The keys are different.
They match in the following keys: {'parameters', 'name', 'description'}
They differ in the following keys: {'response'}
Total number of uniques by name: 2031
Total number of duplicates by name: 1286
Total number of name duplicate functions with the same structure: 596
Total number of name duplicate functions with different structures: 186
Total number of unique functions with the same structure (filtered): 596
Len function names before filtering: 786
Len function names after filtering: 596
Total number of global functions: 2031
Shuffled global functions.


In [9]:
# Save all functions to a jsonl file
with open("data/global_functions.jsonl", 'w') as f:
    for function in global_functions:
        f.write(json.dumps(function) + "\n")
print("Saved global functions to data/global_functions.jsonl")

Saved global functions to data/global_functions.jsonl
