In [109]:
import os
import re
import json
from collections import Counter
from collections import defaultdict

In [None]:
# Build a function-domain mapping dictionary
def build_function_domain_map(root_folder):
    """Map each function defined in an ``api.py`` file to its domain folder.

    Every immediate sub-directory of ``root_folder`` is treated as a domain.
    Each domain is walked recursively and every file named ``api.py`` is
    scanned with a regular expression for ``def <name>(`` occurrences.

    Args:
        root_folder: Path of the directory whose sub-directories are domains.

    Returns:
        dict mapping function name -> domain folder name. When the same
        function name is found under several domains, the domain visited
        last wins; traversal is alphabetical, so the result is reproducible
        across machines and file systems.

    Raises:
        OSError: if ``root_folder`` cannot be listed or a file cannot be read.
    """
    function_domain_map = {}

    # os.listdir gives entries in arbitrary order; sorting makes the
    # "last definition wins" rule deterministic.
    for domain_folder in sorted(os.listdir(root_folder)):
        domain_path = os.path.join(root_folder, domain_folder)
        if not os.path.isdir(domain_path):
            continue
        for subdir, dirnames, files in os.walk(domain_path):
            # Sorting in place makes os.walk descend in a stable order.
            dirnames.sort()
            if "api.py" not in files:
                continue
            api_file_path = os.path.join(subdir, "api.py")
            # Read as UTF-8 regardless of the platform locale; undecodable
            # bytes are replaced so one odd file does not abort the scan.
            with open(api_file_path, "r", encoding="utf-8", errors="replace") as f:
                content = f.read()
            # Find all functions defined in api.py and map them to the domain folder
            for function_name in re.findall(r"def\s+(\w+)\s*\(", content):
                function_domain_map[function_name] = domain_folder
    return function_domain_map

# Add domain field to tools using the function-domain map
def add_domain_to_tools(data, function_domain_map):
    """Annotate every tool of every entry with a ``"domain"`` key.

    Each entry's ``"tools"`` value is a JSON-encoded list of tool dicts
    (a missing key is treated as an empty list). The list is decoded, each
    tool gets ``tool["domain"]`` looked up by ``tool["name"]`` in
    ``function_domain_map`` (``"Other"`` when the name is unknown), and the
    list is re-encoded back into ``entry["tools"]``.

    Args:
        data: Iterable of dict entries; modified in place.
        function_domain_map: dict mapping function name -> domain name.

    Returns:
        The same ``data`` object that was passed in.
    """
    for record in data:
        parsed_tools = json.loads(record.get("tools", "[]"))
        for spec in parsed_tools:
            spec["domain"] = function_domain_map.get(spec["name"], "Other")
        # Store the tools back in their serialized (string) form.
        record["tools"] = json.dumps(parsed_tools)
    return data

# Load dataset and update with domains
def process_dataset(file_path, root_folder, output_path):
    """Read a JSON dataset, tag every tool with its domain, and write it out.

    Args:
        file_path: Path of the input JSON file; its content is passed to
            ``add_domain_to_tools`` as ``data``.
        root_folder: Directory handed to ``build_function_domain_map`` to
            derive the function -> domain mapping.
        output_path: Path of the JSON file to write (overwritten if present).

    Returns:
        None. The updated dataset is written to ``output_path`` and a
        confirmation line is printed.
    """
    # Build the function-domain map once
    function_domain_map = build_function_domain_map(root_folder)

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Add domain information to each tool in each entry
    updated_data = add_domain_to_tools(data, function_domain_map)

    # Write updated dataset to a new JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=4)

    print(f"Updated dataset with domains saved to {output_path}")

# Directory whose sub-folders are the domain folders scanned for api.py files.
# The machine-specific default can be overridden without editing the notebook
# by setting the TOOLENV_TOOLS_ROOT environment variable.
root_folder = os.environ.get(
    "TOOLENV_TOOLS_ROOT",
    "/Users/cho-seonggeun/Downloads/data/toolenv/tools",
)
input_file_path = "data/xlam_fc.json"
# Also read by later cells, so this name must stay defined at notebook level.
output_file_path = "data/xlam_fc_w_domain.json"

process_dataset(input_file_path, root_folder, output_file_path)

Updated dataset with domains saved to data/xlam_fc_w_domain.json


In [107]:
# Reload the domain-annotated dataset and count distinct tool names per domain.
with open(output_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Dicts keep insertion order, so they serve as ordered sets here: membership
# checks are O(1) instead of a linear scan of a growing list, while the
# first-seen order of names is preserved.
_names_by_domain = defaultdict(dict)
_all_names = {}
for entry in data:
    for tool in json.loads(entry["tools"]):
        tool_name, domain = tool["name"], tool["domain"]
        _names_by_domain[domain].setdefault(tool_name, None)
        _all_names.setdefault(tool_name, None)

# domain -> list of unique tool names (first-seen order)
domain_counter = {name: list(seen) for name, seen in _names_by_domain.items()}
# every unique tool name across all domains (first-seen order)
tool_list = list(_all_names)

for domain in domain_counter:
    print(domain, ":", len(domain_counter[domain]))

Gaming : 99
Financial : 53
Finance : 398
Entertainment : 146
Sports : 358
Media : 97
Commerce : 17
Movies : 33
Weather : 78
Other : 197
Data : 637
eCommerce : 63
Travel : 108
Location : 106
Social : 212
Science : 32
Advertising : 6
Business_Software : 25
News_Media : 67
Events : 22
Business : 67
Video_Images : 42
Food : 61
Search : 36
Medical : 11
Music : 79
Education : 54
Text_Analysis : 28
Database : 76
Tools : 101
Devices : 14
Logistics : 14
Health_and_Fitness : 36
Email : 36
Communication : 29
Monitoring : 15
Artificial_Intelligence_Machine_Learning : 22
Mapping : 43
SMS : 12
Transportation : 44
Visual_Recognition : 7
Translation : 7
Energy : 5
Jobs : 5
Payments : 2
Cybersecurity : 3
Storage : 2


In [114]:
import os
import json
from collections import defaultdict

def save_tools_by_domain(updated_data, output_folder):
    """Write one JSON file per domain plus a combined ``tools_general.json``.

    Tools are de-duplicated by ``tool["name"]``: within a domain only the
    first tool seen with a given name is kept, and the combined file keeps
    the first tool seen with each name across all domains. A tool without a
    ``"domain"`` key is filed under ``"Not Found"``.

    Args:
        updated_data: Iterable of entries whose ``"tools"`` value is already
            a list of tool dicts (not a JSON string).
        output_folder: Directory to write into; created if it is missing.

    Returns:
        None. Files are written as ``<domain>.json`` and
        ``tools_general.json`` and one summary line is printed per file.
    """
    # Make sure the destination directory exists.
    os.makedirs(output_folder, exist_ok=True)

    # name -> tool dicts act as insertion-ordered "first one wins" sets.
    per_domain = defaultdict(dict)
    overall = {}

    for record in updated_data:
        for spec in record["tools"]:
            domain_key = spec.get("domain", "Not Found")
            name = spec["name"]
            per_domain[domain_key].setdefault(name, spec)
            overall.setdefault(name, spec)

    # One file per domain, in the order the domains were first encountered.
    for domain_key, bucket in per_domain.items():
        target = os.path.join(output_folder, f"{domain_key}.json")
        unique_tools = list(bucket.values())
        with open(target, "w") as out:
            json.dump(unique_tools, out, indent=4)
        print(f"Saved {len(unique_tools)} unique tools to {target}")

    # Combined file with every distinct tool name.
    general_file_path = os.path.join(output_folder, "tools_general.json")
    with open(general_file_path, "w") as out:
        json.dump(list(overall.values()), out, indent=4)
    print(f"Saved {len(overall)} unique tools to {general_file_path}")

# Load updated dataset
def load_and_save_by_domain(updated_file_path, output_folder):
    """Load the domain-annotated dataset and export its tools grouped by domain.

    Args:
        updated_file_path: Path of a JSON file containing a list of entries
            whose ``"tools"`` value is a JSON-encoded string.
        output_folder: Directory passed on to ``save_tools_by_domain``.

    Returns:
        None.
    """
    with open(updated_file_path, "r") as source:
        records = json.load(source)

    # Decode each entry's serialized tools so the exporter receives real lists.
    for record in records:
        record.update(tools=json.loads(record['tools']))

    # Write the per-domain files and tools_general.json.
    save_tools_by_domain(records, output_folder)

# Export the tools of the domain-annotated dataset into per-domain JSON files.
# NOTE: `output_file_path` is not defined in this cell; it comes from the
# earlier cell that runs process_dataset, so that cell must be executed first.
output_folder = "data/tools"  # Folder to store domain JSON files

load_and_save_by_domain(output_file_path, output_folder)


Saved 99 unique tools to data/tools/Gaming.json
Saved 53 unique tools to data/tools/Financial.json
Saved 398 unique tools to data/tools/Finance.json
Saved 146 unique tools to data/tools/Entertainment.json
Saved 358 unique tools to data/tools/Sports.json
Saved 97 unique tools to data/tools/Media.json
Saved 17 unique tools to data/tools/Commerce.json
Saved 33 unique tools to data/tools/Movies.json
Saved 78 unique tools to data/tools/Weather.json
Saved 197 unique tools to data/tools/Other.json
Saved 637 unique tools to data/tools/Data.json
Saved 63 unique tools to data/tools/eCommerce.json
Saved 108 unique tools to data/tools/Travel.json
Saved 106 unique tools to data/tools/Location.json
Saved 212 unique tools to data/tools/Social.json
Saved 32 unique tools to data/tools/Science.json
Saved 6 unique tools to data/tools/Advertising.json
Saved 25 unique tools to data/tools/Business_Software.json
Saved 67 unique tools to data/tools/News_Media.json
Saved 22 unique tools to data/tools/Events.js